Page Menu
Home
WickedGov Phorge
Search
Configure Global Search
Log In
Files
F1432765
PegTokenizer.php
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Flag For Later
Award Token
Size
6 KB
Referenced Files
None
Subscribers
None
PegTokenizer.php
View Options
<?php
declare
(
strict_types
=
1
);
/**
* Tokenizer for wikitext, using WikiPEG and a
* separate PEG grammar file
* (Grammar.pegphp)
*
* Use along with a {@link Wt2Html/TreeBuilder/TreeBuilderStage} and the
* {@link DOMPostProcessor}(s) for HTML output.
*/
namespace
Wikimedia\Parsoid\Wt2Html
;
use
Generator
;
use
Wikimedia\Assert\Assert
;
use
Wikimedia\Parsoid\Config\Env
;
use
Wikimedia\Parsoid\Tokens\EOFTk
;
use
Wikimedia\Parsoid\Tokens\SourceRange
;
use
Wikimedia\WikiPEG\SyntaxError
;
/**
 * Pipeline stage that converts wikitext into tokens by running the
 * PEG grammar (the Grammar class) over the input.
 *
 * Tokenization can be done in one shot (process / tokenizeSync and the
 * tokenizeAs helpers) or as a generator (processChunkily).
 */
class PegTokenizer extends PipelineStage {
	/** @var array Options handed to the constructor; exposed via getOptions() */
	private array $options = [];

	/** @var array Source offsets; holds 'startOffset' / 'endOffset' once setSourceOffsets() ran */
	private array $offsets = [];

	/** @var SyntaxError|null Syntax error raised by the most recent tokenizer run, if any */
	private ?SyntaxError $lastError = null;

	/** @var Grammar|null Grammar instance, created lazily on first use */
	private ?Grammar $grammar = null;

	/**
	 * @param Env $env
	 * @param array $options Stored as-is and returned by getOptions()
	 * @param string $stageId Not used by this class; kept so the constructor
	 *   signature stays compatible with existing callers
	 * @param ?PipelineStage $prevStage
	 */
	public function __construct(
		Env $env,
		array $options = [],
		string $stageId = "",
		?PipelineStage $prevStage = null
	) {
		parent::__construct( $env, $prevStage );
		$this->env = $env;
		$this->options = $options;
		$this->offsets = [];
	}

	/**
	 * Create the grammar object the first time it is needed.
	 * Calling this repeatedly is harmless: an existing grammar is kept.
	 */
	private function initGrammar(): void {
		$this->grammar ??= new Grammar();
	}

	/**
	 * Get the constructor options.
	 *
	 * @internal
	 * @return array
	 */
	public function getOptions(): array {
		return $this->options;
	}

	/**
	 * Set start and end offsets of the source that generated this DOM.
	 *
	 * Only the start offset is read back by this class: it is passed to the
	 * grammar as 'pipelineOffset'.
	 *
	 * @param SourceRange $so
	 */
	public function setSourceOffsets( SourceRange $so ): void {
		$this->offsets['startOffset'] = $so->start;
		$this->offsets['endOffset'] = $so->end;
	}

	/**
	 * See PipelineStage::process docs as well. This doc block refines
	 * the generic arg types to be specific to this pipeline stage.
	 *
	 * $opts is forwarded unchanged to tokenizeSync(), which requires
	 * 'sol' to be set.
	 *
	 * @param string $input wikitext to tokenize
	 * @param array{sol:bool} $opts
	 * - atTopLevel: (bool) Whether we are processing the top-level document
	 * - sol: (bool) Whether input should be processed in start-of-line context
	 * @return array|false The token array, or false for a syntax error
	 */
	public function process( $input, array $opts ) {
		Assert::invariant( is_string( $input ), "Input should be a string" );
		return $this->tokenizeSync( $input, $opts );
	}

	/**
	 * Tokenize the text as a stream, using the grammar's 'start_async' rule.
	 *
	 * Everything the grammar's parse() yields is passed through to the
	 * consumer, followed by one final chunk containing a single EOFTk.
	 * A SyntaxError is remembered (see getLastErrorLogMessage()) and then
	 * rethrown to the consumer.
	 *
	 * @param string $text
	 * @param array{sol:bool} $opts
	 * - sol (bool) Whether text should be processed in start-of-line context.
	 * @return Generator
	 * @throws SyntaxError
	 */
	public function processChunkily( $text, array $opts ): Generator {
		// Forget any error left over from an earlier run so that
		// getLastErrorLogMessage() only ever describes this run.
		$this->lastError = null;
		$this->initGrammar();

		Assert::invariant( is_string( $text ), "Input should be a string" );
		Assert::invariant( isset( $opts['sol'] ), "Sol should be set" );

		// Kick it off!
		$pipelineOffset = $this->offsets['startOffset'] ?? 0;
		$args = [
			'env' => $this->env,
			'pipelineId' => $this->getPipelineId(),
			'pegTokenizer' => $this,
			'pipelineOffset' => $pipelineOffset,
			'sol' => $opts['sol'],
			'stream' => true,
			'startRule' => 'start_async',
		];

		try {
			// Wrap wikipeg's generator with our own generator
			// so that syntax errors can be recorded before they propagate.
			// @phan-suppress-next-line PhanTypeInvalidYieldFrom
			yield from $this->grammar->parse( $text, $args );
			yield [ new EOFTk() ];
		} catch ( SyntaxError $e ) {
			$this->lastError = $e;
			throw $e;
		}
	}

	/**
	 * Tokenize via a rule passed in as an arg.
	 * The text is tokenized synchronously in one shot.
	 *
	 * Keys already present in $args win over the defaults filled in here
	 * (pegTokenizer, pipelineId, pipelineOffset, startRule => 'start', env).
	 *
	 * @param string $text
	 * @param array{sol:bool} $args
	 * - sol: (bool) Whether input should be processed in start-of-line context.
	 * - startRule: (string) which tokenizer rule to tokenize with
	 * @return array|false The token array, or false for a syntax error
	 */
	public function tokenizeSync( string $text, array $args ) {
		// Forget any error left over from an earlier run so that
		// getLastErrorLogMessage() only ever describes this run.
		$this->lastError = null;
		$this->initGrammar();

		Assert::invariant( isset( $args['sol'] ), "Sol should be set" );

		// Array union: only fills in keys the caller did not supply.
		$args += [
			'pegTokenizer' => $this,
			'pipelineId' => $this->getPipelineId(),
			'pipelineOffset' => $this->offsets['startOffset'] ?? 0,
			'startRule' => 'start',
			'env' => $this->env
		];

		$start = null;
		$profile = null;
		if ( $this->env->profiling() ) {
			$profile = $this->env->getCurrentProfile();
			$start = microtime( true );
		}

		try {
			$toks = $this->grammar->parse( $text, $args );
		} catch ( SyntaxError $e ) {
			$this->lastError = $e;
			return false;
		}

		if ( $profile ) {
			// microtime() is in seconds; the profile is bumped with milliseconds.
			$profile->bumpTimeUse(
				'PEG', 1000 * ( microtime( true ) - $start ), 'PEG'
			);
		}
		return $toks;
	}

	/**
	 * Tokenizes a string as a rule
	 *
	 * The pipeline offset is forced to 0 for this call, regardless of any
	 * offsets set through setSourceOffsets().
	 *
	 * @param string $text The input text
	 * @param string $rule The rule name
	 * @param bool $sol Start of line flag
	 * @return array|false Array of tokens/strings or false on error
	 */
	public function tokenizeAs( string $text, string $rule, bool $sol ) {
		$args = [
			'startRule' => $rule,
			'sol' => $sol,
			'pipelineOffset' => 0
		];
		return $this->tokenizeSync( $text, $args );
	}

	/**
	 * Tokenize a URL.
	 * @param string $text
	 * @return array|false Array of tokens/strings or false on error
	 */
	public function tokenizeURL( string $text ) {
		return $this->tokenizeAs( $text, 'url', /* sol */true );
	}

	/**
	 * Tokenize table cell attributes.
	 * @param string $text
	 * @param bool $sol
	 * @return array|false Array of tokens/strings or false on error
	 */
	public function tokenizeTableCellAttributes( string $text, bool $sol ) {
		return $this->tokenizeAs( $text, 'row_syntax_table_args', $sol );
	}

	/**
	 * If the most recent tokenize call failed with a syntax error, this
	 * returns a string describing the error, suitable for use in a log
	 * entry. Otherwise returns false.
	 *
	 * @return string|false
	 */
	public function getLastErrorLogMessage() {
		if ( !$this->lastError ) {
			return false;
		}
		return "Tokenizer parse error at input location {$this->lastError->location}: " .
			$this->lastError->getMessage();
	}

	/**
	 * @inheritDoc
	 */
	public function resetState( array $opts ): void {
		TokenizerUtils::resetAnnotationIncludeRegex();
		// The grammar is created lazily, so there may be nothing to reset yet.
		if ( $this->grammar ) {
			$this->grammar->resetState();
		}
		parent::resetState( $opts );
	}
}
File Metadata
Details
Attached
Mime Type
text/x-php
Expires
Sat, May 16, 22:17 (1 d, 3 h)
Storage Engine
local-disk
Storage Format
Raw Data
Storage Handle
d0/c9/e502a9310fe5bd59e3c6f55baf26
Default Alt Text
PegTokenizer.php (6 KB)
Attached To
Mode
rMWPROD MediaWiki Production
Attached
Detach File
Event Timeline
Log In to Comment