Page Menu
Home
WickedGov Phorge
Search
Configure Global Search
Log In
Files
F1431779
TokenUtils.php
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Flag For Later
Award Token
Size
22 KB
Referenced Files
None
Subscribers
None
TokenUtils.php
View Options
<?php
declare
(
strict_types
=
1
);
/**
* This file contains general utilities for:
* (a) querying token properties and token types
* (b) manipulating tokens, individually and as collections.
*/
namespace
Wikimedia\Parsoid\Utils
;
use
Wikimedia\Assert\Assert
;
use
Wikimedia\Assert\UnreachableException
;
use
Wikimedia\Parsoid\Config\Env
;
use
Wikimedia\Parsoid\Core\DomSourceRange
;
use
Wikimedia\Parsoid\Tokens\CommentTk
;
use
Wikimedia\Parsoid\Tokens\EndTagTk
;
use
Wikimedia\Parsoid\Tokens\EOFTk
;
use
Wikimedia\Parsoid\Tokens\KV
;
use
Wikimedia\Parsoid\Tokens\KVSourceRange
;
use
Wikimedia\Parsoid\Tokens\NlTk
;
use
Wikimedia\Parsoid\Tokens\SelfclosingTagTk
;
use
Wikimedia\Parsoid\Tokens\SourceRange
;
use
Wikimedia\Parsoid\Tokens\TagTk
;
use
Wikimedia\Parsoid\Tokens\Token
;
use
Wikimedia\Parsoid\Wikitext\Consts
;
class
TokenUtils
{
/**
 * Pattern for a whitespace-separated attribute value in which one word is
 * exactly `mw:PageProp/Category`, `mw:PageProp/redirect` or
 * `mw:PageProp/Language`.
 *
 * The word must be delimited by the start/end of the string or by
 * whitespace; the `D` modifier keeps `$` from matching before a trailing
 * newline.
 */
public const SOL_TRANSPARENT_LINK_REGEX = '/(?:^|\s)mw:PageProp\/(?:Category|redirect|Language)(?=$|\s)/D';
/**
 * Report the type name of a token.
 *
 * Plain strings are reported as 'string'; for anything else the answer is
 * whatever the token's own getType() method returns.
 *
 * @param Token|string $token
 * @return string
 */
public static function getTokenType( $token ): string {
	if ( is_string( $token ) ) {
		return 'string';
	}
	return $token->getType();
}
/**
 * Is $name registered as a key of Consts::$wikitextBlockElems?
 *
 * @param string $name Tag name to look up
 * @return bool
 */
public static function isWikitextBlockTag( string $name ): bool {
	$registered = isset( Consts::$wikitextBlockElems[$name] );
	return $registered;
}
/**
 * In the legacy parser, these block tags open block-tag scope.
 * See doBlockLevels in the PHP parser (includes/parser/Parser.php).
 *
 * True when $name is a key of either Consts::$blockElems or
 * Consts::$alwaysBlockElems.
 *
 * @param string $name Tag name to look up
 * @return bool
 */
public static function tagOpensBlockScope( string $name ): bool {
	if ( isset( Consts::$blockElems[$name] ) ) {
		return true;
	}
	return isset( Consts::$alwaysBlockElems[$name] );
}
/**
 * In the legacy parser, these block tags close block-tag scope.
 * See doBlockLevels in the PHP parser (includes/parser/Parser.php).
 *
 * True when $name is a key of either Consts::$antiBlockElems or
 * Consts::$neverBlockElems.
 *
 * @param string $name Tag name to look up
 * @return bool
 */
public static function tagClosesBlockScope( string $name ): bool {
	if ( isset( Consts::$antiBlockElems[$name] ) ) {
		return true;
	}
	return isset( Consts::$neverBlockElems[$name] );
}
/**
 * Is this a template token, i.e. a self-closing tag token named 'template'?
 *
 * @param Token|string|null $token
 * @return bool
 */
public static function isTemplateToken( $token ): bool {
	if ( !( $token instanceof SelfclosingTagTk ) ) {
		return false;
	}
	return $token->getName() === 'template';
}
/**
 * Determine whether the current token was an HTML tag in wikitext.
 *
 * That is: the token is a start, end or self-closing tag token whose
 * dataParsoid `stx` property is set to 'html'.
 *
 * @param Token|string|null $token
 * @return bool
 */
public static function isHTMLTag( $token ): bool {
	// instanceof is already false for null and for strings, so no separate
	// null/string guards are needed in front of it.
	$isTagToken = $token instanceof TagTk
		|| $token instanceof EndTagTk
		|| $token instanceof SelfclosingTagTk;

	// `??` covers a missing dataParsoid or a missing stx in one step.
	return $isTagToken && ( $token->dataParsoid->stx ?? null ) === 'html';
}
/**
 * Is the token a DOMFragment type value?
 *
 * True when one of the token's `typeof` values is `mw:DOMFragment`,
 * optionally followed by `/sealed/<word characters>`.
 *
 * @param Token $token
 * @return bool
 */
public static function hasDOMFragmentType( Token $token ): bool {
	$matched = self::matchTypeOf( $token, '#^mw:DOMFragment(/sealed/\w+)?$#D' );
	return $matched !== null;
}
/**
 * Is the token a table tag?
 *
 * Only start and end tag tokens qualify, and only when their name is a key
 * of Consts::$HTML['TableTags'].
 *
 * @param Token|string $token
 * @return bool
 */
public static function isTableTag( $token ): bool {
	if ( !( $token instanceof TagTk ) && !( $token instanceof EndTagTk ) ) {
		return false;
	}
	return isset( Consts::$HTML['TableTags'][$token->getName()] );
}
/**
 * Determine if token is a transparent link tag.
 *
 * The token must be a tag token (self-closing, start or end) named 'link'
 * whose `rel` attribute matches self::SOL_TRANSPARENT_LINK_REGEX. A missing
 * `rel` attribute is treated as the empty string.
 *
 * @param Token|string $token
 * @return bool
 */
public static function isSolTransparentLinkTag( $token ): bool {
	$isTagToken = $token instanceof SelfclosingTagTk
		|| $token instanceof TagTk
		|| $token instanceof EndTagTk;
	if ( !$isTagToken || $token->getName() !== 'link' ) {
		return false;
	}

	$rel = $token->getAttributeV( 'rel' ) ?? '';
	return (bool)preg_match( self::SOL_TRANSPARENT_LINK_REGEX, $rel );
}
/**
 * Does this token represent a behavior switch?
 *
 * Two token shapes are recognised, both self-closing tags:
 *  - one named 'behavior-switch', and
 *  - one named 'meta' that has a `property` attribute matching the site
 *    config's bswPagePropRegexp().
 *
 * @param Env $env
 * @param Token|string $token
 * @return bool
 */
public static function isBehaviorSwitch( Env $env, $token ): bool {
	if ( !( $token instanceof SelfclosingTagTk ) ) {
		return false;
	}

	$tagName = $token->getName();

	// Before BehaviorSwitchHandler (ie. PreHandler, etc.)
	if ( $tagName === 'behavior-switch' ) {
		return true;
	}

	// After BehaviorSwitchHandler
	// (ie. ListHandler, ParagraphWrapper, etc.)
	if ( $tagName !== 'meta' || !$token->hasAttribute( 'property' ) ) {
		return false;
	}
	return (bool)preg_match(
		$env->getSiteConfig()->bswPagePropRegexp(),
		$token->getAttributeV( 'property' ) ?? ''
	);
}
/**
 * This should come close to matching
 * {@link WTUtils::emitsSolTransparentSingleLineWT},
 * without the single line caveat.
 *
 * A token counts as start-of-line transparent when it is:
 *  - a string made up only of spaces and tabs (including the empty string),
 *  - a SOL-transparent link tag,
 *  - a comment that is not a translation-unit marker,
 *  - a behavior switch, or
 *  - a self-closing 'meta' tag whose dataParsoid `stx` is not 'html'.
 *
 * @param Env $env
 * @param Token|string $token
 * @return bool
 */
public static function isSolTransparent( Env $env, $token ): bool {
	if ( is_string( $token ) ) {
		// Horizontal whitespace only; `D` keeps `$` from matching before a
		// trailing newline.
		return (bool)preg_match( '/^[ \t]*$/D', $token );
	}
	if ( self::isSolTransparentLinkTag( $token ) ) {
		return true;
	}
	if ( $token instanceof CommentTk && !self::isTranslationUnitMarker( $env, $token ) ) {
		return true;
	}
	if ( self::isBehaviorSwitch( $env, $token ) ) {
		return true;
	}
	if ( !( $token instanceof SelfclosingTagTk ) || $token->getName() !== 'meta' ) {
		return false;
	}

	// only metas left
	return ( $token->dataParsoid->stx ?? null ) !== 'html';
}
/**
 * HACK: Returns true if $token looks like a TU marker (<!--T:XXX-->) and if
 * we could be in a translate-annotated page.
 *
 * "Could be" means: $env->hasAnnotations is truthy and the site config
 * reports 'translate' as an annotation tag. "Looks like" means the comment
 * value starts with `T:`.
 *
 * @param Env $env
 * @param CommentTk $token
 * @return bool
 */
public static function isTranslationUnitMarker( Env $env, CommentTk $token ): bool {
	if ( !$env->hasAnnotations ) {
		return false;
	}
	if ( !$env->getSiteConfig()->isAnnotationTag( 'translate' ) ) {
		return false;
	}
	return preg_match( '/^T:/', $token->value ) === 1;
}
/**
 * Is the token an empty-line meta token?
 *
 * That is: a self-closing tag token named 'meta' whose `typeof` attribute
 * value is exactly 'mw:EmptyLine'.
 *
 * @param Token|string $token
 * @return bool
 */
public static function isEmptyLineMetaToken( $token ): bool {
	return $token instanceof SelfclosingTagTk
		&& $token->getName() === 'meta'
		&& $token->getAttributeV( 'typeof' ) === 'mw:EmptyLine';
}
/**
 * Determine whether the token matches the given `typeof` attribute value.
 *
 * The `typeof` attribute is split on runs of whitespace and each piece is
 * tested against $typeRe in order; the first piece that matches is returned.
 *
 * @param Token $t The token to test
 * @param string $typeRe Regular expression matching the expected value of
 *   the `typeof` attribute.
 * @return ?string The matching `typeof` value, or `null` if there is
 *   no match (including when the token has no `typeof` attribute).
 */
public static function matchTypeOf( Token $t, string $typeRe ): ?string {
	$v = $t->getAttributeV( 'typeof' );
	if ( $v === null ) {
		return null;
	}
	Assert::invariant( is_string( $v ), "Typeof is not simple" );

	$types = preg_split( '/\s+/', $v, -1, PREG_SPLIT_NO_EMPTY );
	foreach ( $types as $ty ) {
		$count = preg_match( $typeRe, $ty );
		// preg_match() yields false only when the pattern itself is broken.
		Assert::invariant( $count !== false, "Bad regexp" );
		if ( $count ) {
			return $ty;
		}
	}
	return null;
}
/**
 * Determine whether the token matches the given typeof attribute value.
 *
 * The literal $type is quoted and anchored at both ends, so it has to equal
 * one whole whitespace-separated `typeof` value.
 *
 * @param Token $t
 * @param string $type Expected value of "typeof" attribute, as a literal
 *   string.
 * @return bool True if the token matches.
 */
public static function hasTypeOf( Token $t, string $type ): bool {
	$exactTypeRe = '/^' . preg_quote( $type, '/' ) . '$/D';
	return self::matchTypeOf( $t, $exactTypeRe ) !== null;
}
/**
 * Shift TSR of a token
 *
 * Only objects whose class is exactly TagTk, SelfclosingTagTk, NlTk,
 * CommentTk or EndTagTk are touched; strings and every other value are
 * skipped.
 *
 * With a non-zero integer $offset, the token's `tsr`, `extTagOffsets` and
 * `tmp->extLinkContentOffsets` (where present) and each attribute's
 * `srcOffsets` are replaced by their `offset( $offset )` result, and array
 * attribute keys/values are processed recursively.
 *
 * With a falsy $offset other than 0 (false, or null for JS compatibility),
 * `tsr` and attribute `srcOffsets` are cleared to null instead; the other
 * offsets are left alone.
 *
 * PORT-FIXME: In JS this was sometimes called with $offset=undefined, which
 * meant do nothing by default, except if there was a third parameter set to
 * true, in which case it meant the same thing as $offset = null. PHP has no
 * undefined, so false is used here for whatever the previous code meant by
 * a null offset.
 *
 * @param array<Token|string> $tokens
 * @param int|false $offset
 */
public static function shiftTokenTSR( array $tokens, $offset ): void {
	// Bail early if we can
	if ( $offset === 0 ) {
		return;
	}

	// JS b/c
	if ( $offset === null ) {
		$offset = false;
	}

	// Exact classes whose offsets get updated/cleared.
	$shiftable = [
		TagTk::class,
		SelfclosingTagTk::class,
		NlTk::class,
		CommentTk::class,
		EndTagTk::class,
	];

	foreach ( $tokens as $t ) {
		if ( !is_object( $t ) || !in_array( get_class( $t ), $shiftable, true ) ) {
			continue;
		}

		$da = $t->dataParsoid;
		$tsr = $da->tsr;
		if ( $tsr ) {
			$da->tsr = $offset ? $tsr->offset( $offset ) : null;
		}

		if ( $offset && isset( $da->extTagOffsets ) ) {
			$da->extTagOffsets = $da->extTagOffsets->offset( $offset );
		}

		// SSS FIXME: offset will always be available in
		// chunky-tokenizer mode in which case we wont have
		// buggy offsets below. The null scenario is only
		// for when the token-stream-patcher attempts to
		// reparse a string -- it is likely to only patch up
		// small string fragments and the complicated use cases
		// below should not materialize.
		// CSA: token-stream-patcher shouldn't have problems
		// now that $frame->srcText is always accurate?

		// content offsets for ext-links
		if ( $offset && isset( $da->tmp->extLinkContentOffsets ) ) {
			$da->tmp->extLinkContentOffsets =
				$da->tmp->extLinkContentOffsets->offset( $offset );
		}

		// Process attributes
		if ( !isset( $t->attribs ) ) {
			continue;
		}
		foreach ( $t->attribs as $attr ) {
			if ( is_array( $attr->k ) ) {
				self::shiftTokenTSR( $attr->k, $offset );
			}
			if ( is_array( $attr->v ) ) {
				self::shiftTokenTSR( $attr->v, $offset );
			}

			// src offsets used to set mw:TemplateParams
			if ( !$offset ) {
				$attr->srcOffsets = null;
			} elseif ( $attr->srcOffsets !== null ) {
				$attr->srcOffsets = $attr->srcOffsets->offset( $offset );
			}
		}
	}
}
/**
 * Strip EOFTk token from token chunk.
 * The EOFTk is expected to be the last token of the chunk; nothing is
 * removed if the last element is anything else or the array is empty.
 *
 * @param array &$tokens Modified in place
 * @return array return the modified token array so that this call can be chained
 */
public static function stripEOFTkFromTokens( array &$tokens ): array {
	$lastIdx = count( $tokens ) - 1;
	if ( $lastIdx >= 0 && $tokens[$lastIdx] instanceof EOFTk ) {
		array_pop( $tokens );
	}
	return $tokens;
}
/**
 * Convert string offsets
 *
 * Offset types are:
 *  - 'byte': Bytes (UTF-8 encoding), e.g. PHP `substr()` or `strlen()`.
 *  - 'char': Unicode code points (encoding irrelevant), e.g. PHP `mb_substr()` or `mb_strlen()`.
 *  - 'ucs2': 16-bit code units (UTF-16 encoding), e.g. JavaScript `.substring()` or `.length`.
 *
 * Offsets that are mid-Unicode character are "rounded" up to the next full
 * character, i.e. the output offset will always point to the start of a
 * Unicode code point (or just past the end of the string). Offsets outside
 * the string are "rounded" to 0 or just-past-the-end.
 *
 * The string is walked once, one code point at a time, while the offsets
 * (sorted ascending) are rewritten as the walk reaches them.
 *
 * @note When constructing the array of offsets to pass to this method,
 *  populate it with references as `$offsets[] = &$var;`. The results are
 *  written through those references; nothing is returned.
 *
 * @param string $s Unicode string the offsets are offsets into, UTF-8 encoded.
 * @param string $from Offset type to convert from.
 * @param string $to Offset type to convert to.
 * @param int[] $offsets References to the offsets to convert.
 * @throws \InvalidArgumentException If $from or $to is not a known offset
 *  type, or if an invalid UTF-8 lead byte is encountered in $s.
 */
public static function convertOffsets(
	string $s, string $from, string $to, array $offsets
): void {
	// Current position of the walk, expressed in each of the three units.
	$pos = [ 'byte' => 0, 'char' => 0, 'ucs2' => 0 ];

	if ( !array_key_exists( $from, $pos ) ) {
		throw new \InvalidArgumentException( 'Invalid $from' );
	}
	if ( !array_key_exists( $to, $pos ) ) {
		throw new \InvalidArgumentException( 'Invalid $to' );
	}

	$offsetCt = count( $offsets );
	if ( $offsetCt === 0 ) {
		// Nothing to do
		return;
	}
	sort( $offsets, SORT_NUMERIC );

	$next = 0;
	$byteLen = strlen( $s );
	while ( $pos['byte'] < $byteLen ) {
		// Update offsets that we've reached
		while ( $offsets[$next] <= $pos[$from] ) {
			$offsets[$next] = $pos[$to];
			if ( ++$next >= $offsetCt ) {
				return;
			}
		}

		// Advance by one code point; the lead byte gives its encoded size.
		++$pos['char'];
		$lead = ord( $s[$pos['byte']] );
		if ( $lead < 0x80 ) {
			// 0xxxxxxx: one byte, one UTF-16 unit
			++$pos['byte'];
			++$pos['ucs2'];
		} elseif ( $lead >= 0xc0 && $lead < 0xe0 ) {
			// 110xxxxx: two bytes, one UTF-16 unit
			$pos['byte'] += 2;
			++$pos['ucs2'];
		} elseif ( $lead >= 0xe0 && $lead < 0xf0 ) {
			// 1110xxxx: three bytes, one UTF-16 unit
			$pos['byte'] += 3;
			++$pos['ucs2'];
		} elseif ( $lead >= 0xf0 && $lead < 0xf8 ) {
			// 11110xxx: four bytes, a surrogate pair in UTF-16
			$pos['byte'] += 4;
			$pos['ucs2'] += 2;
		} else {
			// Continuation byte in lead position, or 11111xxx
			throw new \InvalidArgumentException( '$s is not UTF-8' );
		}
	}

	// Convert any offsets past the end of the string to the length of the
	// string.
	for ( ; $next < $offsetCt; ++$next ) {
		$offsets[$next] = $pos[$to];
	}
}
/**
 * Convert offsets in a token array
 *
 * Works in three steps:
 *  1. Walk $tokens and gather a reference to every non-null `start`/`end`
 *     of each source range found. For DomSourceRange objects the open and
 *     close widths are first turned into absolute positions
 *     (`start + openWidth`, `end - closeWidth`) and gathered as well.
 *  2. Convert all gathered offsets in place with convertOffsets().
 *  3. Walk $tokens again and turn the DomSourceRange widths back into
 *     widths relative to the converted `start`/`end`.
 *
 * @see TokenUtils::convertOffsets()
 *
 * @param string $s The offset reference string
 * @param string $from Offset type to convert from
 * @param string $to Offset type to convert to
 * @param array<Token|string|array> $tokens
 */
public static function convertTokenOffsets(
	string $s, string $from, string $to, array $tokens
): void {
	/** @var int[] $offsets References to every offset found in $tokens */
	$offsets = [];

	self::collectOffsets( $tokens, static function ( $sr ) use ( &$offsets ) {
		if ( $sr instanceof DomSourceRange ) {
			// Adjust the widths to be actual character offsets
			if ( $sr->openWidth !== null ) {
				Assert::invariant( $sr->start !== null, "width w/o start" );
				$sr->openWidth = $sr->start + $sr->openWidth;
				$offsets[] =& $sr->openWidth;
			}
			if ( $sr->closeWidth !== null ) {
				Assert::invariant( $sr->end !== null, "width w/o end" );
				$sr->closeWidth = $sr->end - $sr->closeWidth;
				$offsets[] =& $sr->closeWidth;
			}
		}
		if ( $sr->start !== null ) {
			$offsets[] =& $sr->start;
		}
		if ( $sr->end !== null ) {
			$offsets[] =& $sr->end;
		}
	} );

	self::convertOffsets( $s, $from, $to, $offsets );

	// This pass only reads the ranges themselves, so nothing is captured.
	self::collectOffsets( $tokens, static function ( $sr ) {
		if ( $sr instanceof DomSourceRange ) {
			// Adjust widths back from being character offsets
			if ( $sr->openWidth !== null ) {
				$sr->openWidth -= $sr->start;
			}
			if ( $sr->closeWidth !== null ) {
				$sr->closeWidth = $sr->end - $sr->closeWidth;
			}
		}
	} );
}
/**
 * Recursively visit every source range reachable from $input and hand each
 * one to $offsetFunc.
 *
 *  - arrays: every element is visited;
 *  - KV: its key, its value and (when truthy) its srcOffsets;
 *  - Token: dataParsoid's tsr, tmp->extLinkContentOffsets, tokens and
 *    extTagOffsets where set, followed by the token's attribs;
 *  - KVSourceRange: its key and value ranges;
 *  - SourceRange (which includes DomSourceRange): passed to $offsetFunc.
 *
 * Anything else (strings, null, ...) is ignored.
 *
 * @param array<Token|string>|array<KV>|KV|Token|DomSourceRange|KVSourceRange|SourceRange|string $input
 * @param callable $offsetFunc Called with one SourceRange argument
 */
private static function collectOffsets( $input, callable $offsetFunc ): void {
	if ( is_array( $input ) ) {
		foreach ( $input as $item ) {
			self::collectOffsets( $item, $offsetFunc );
		}
		return;
	}

	if ( $input instanceof KV ) {
		self::collectOffsets( $input->k, $offsetFunc );
		self::collectOffsets( $input->v, $offsetFunc );
		if ( $input->srcOffsets ) {
			self::collectOffsets( $input->srcOffsets, $offsetFunc );
		}
		return;
	}

	if ( $input instanceof Token ) {
		if ( isset( $input->dataParsoid->tsr ) ) {
			self::collectOffsets( $input->dataParsoid->tsr, $offsetFunc );
		}
		if ( isset( $input->dataParsoid->tmp->extLinkContentOffsets ) ) {
			self::collectOffsets(
				$input->dataParsoid->tmp->extLinkContentOffsets, $offsetFunc
			);
		}
		if ( isset( $input->dataParsoid->tokens ) ) {
			self::collectOffsets( $input->dataParsoid->tokens, $offsetFunc );
		}
		if ( isset( $input->dataParsoid->extTagOffsets ) ) {
			self::collectOffsets( $input->dataParsoid->extTagOffsets, $offsetFunc );
		}
		self::collectOffsets( $input->attribs, $offsetFunc );
		return;
	}

	if ( $input instanceof KVSourceRange ) {
		self::collectOffsets( $input->key, $offsetFunc );
		self::collectOffsets( $input->value, $offsetFunc );
		return;
	}

	if ( $input instanceof SourceRange ) {
		// This includes DomSourceRange
		$offsetFunc( $input );
	}
}
/**
 * Tests whether token represents an HTML entity.
 * Think `<span typeof="mw:Entity">`: a start tag token named 'span' that
 * has 'mw:Entity' among its `typeof` values.
 *
 * @param Token|string|null $token
 * @return bool
 */
public static function isEntitySpanToken( $token ): bool {
	if ( !( $token instanceof TagTk ) ) {
		return false;
	}
	if ( $token->getName() !== 'span' ) {
		return false;
	}
	return self::hasTypeOf( $token, 'mw:Entity' );
}
/**
 * Transform `"\n"` and `"\r\n"` in the input string to {@link NlTk} tokens.
 *
 * The string is split at every newline and one NlTk is placed between each
 * pair of neighbouring pieces, so the result always starts and ends with a
 * (possibly empty) string.
 *
 * @param string $str
 * @return array (interspersed string and NlTk tokens)
 */
public static function newlinesToNlTks( string $str ): array {
	$pieces = preg_split( '/\n|\r\n/', $str );

	// The final piece is not followed by a newline token.
	$tail = array_pop( $pieces );

	$ret = [];
	foreach ( $pieces as $piece ) {
		$ret[] = $piece;
		$ret[] = new NlTk( null );
	}
	$ret[] = $tail;
	return $ret;
}
/**
 * Flatten/convert a token array into a string.
 *
 * Strings are concatenated as-is and nested arrays are flattened
 * recursively (not permitted in strict mode). Comments are always dropped;
 * what happens to other tokens depends on $opts and $strict, tested in this
 * order:
 *  - NlTk is dropped unless `retainNLs` is set;
 *  - with `stripEmptyLineMeta`, empty-line meta tokens are dropped;
 *  - with `includeEntities`, an entity span contributes its dataParsoid
 *    `src` and the two tokens after it are skipped;
 *  - in strict mode, any other token ends the conversion (see @return);
 *  - with `unpackDOMFragments` (which needs `env`), a DOM fragment wrapper
 *    contributes the fragment's text content;
 *  - every remaining token is silently ignored.
 *
 * @param string|Token|array<Token|string> $tokens
 * @param bool $strict Whether to abort as soon as we find a token we
 *   can't stringify.
 * @param array<string,bool|Env> $opts
 * @return string|array{0:string,1:Array<Token|string>}
 *   The stringified tokens. If $strict is true, returns a two-element
 *   array containing string prefix and the remainder of the tokens as
 *   soon as we encounter something we can't stringify.
 */
public static function tokensToString( $tokens, bool $strict = false, array $opts = [] ) {
	if ( is_string( $tokens ) ) {
		return $tokens;
	}
	if ( !is_array( $tokens ) ) {
		$tokens = [ $tokens ];
	}

	$retainNLs = !empty( $opts['retainNLs'] );
	$stripEmptyLineMeta = !empty( $opts['stripEmptyLineMeta'] );
	$includeEntities = !empty( $opts['includeEntities'] );
	$unpackDOMFragments = !empty( $opts['unpackDOMFragments'] );

	$out = '';
	$total = count( $tokens );
	for ( $i = 0; $i < $total; $i++ ) {
		$token = $tokens[$i];

		if ( $token === null ) {
			throw new UnreachableException( "No nulls expected." );
		}
		if ( $token instanceof KV ) {
			// Since this function is occasionally called on KV->v,
			// whose signature recursively includes KV[], a mismatch with
			// this function, we assert that those values are only
			// included in safe places that don't intend to stringify
			// their tokens.
			throw new UnreachableException( "No KVs expected." );
		}

		if ( is_string( $token ) ) {
			$out .= $token;
			continue;
		}

		if ( is_array( $token ) ) {
			Assert::invariant( !$strict, "strict case handled above" );
			$out .= self::tokensToString( $token, $strict, $opts );
			continue;
		}

		// strip comments and newlines
		if ( $token instanceof CommentTk || ( !$retainNLs && $token instanceof NlTk ) ) {
			continue;
		}

		// If requested, strip empty line meta tokens too.
		if ( $stripEmptyLineMeta && self::isEmptyLineMetaToken( $token ) ) {
			continue;
		}

		if ( $includeEntities && self::isEntitySpanToken( $token ) ) {
			$out .= $token->dataParsoid->src;
			// Skip child and end tag.
			$i += 2;
			continue;
		}

		if ( $strict ) {
			// If strict, return accumulated string on encountering first non-text token
			return [ $out, array_slice( $tokens, $i ) ];
		}

		// This option shouldn't be used if the tokens have been
		// expanded to DOM
		$isFragmentWrapper = $unpackDOMFragments
			&& ( $token instanceof TagTk || $token instanceof SelfclosingTagTk )
			&& self::hasDOMFragmentType( $token );
		if ( !$isFragmentWrapper ) {
			continue;
		}

		// Handle dom fragments
		$domFragment = $opts['env']->getDOMFragment( $token->dataParsoid->html );
		// Calling `env->removeDOMFragment()` here is case dependent
		// but should be rare enough when permissible that it can be
		// ignored.
		// FIXME: The correct thing to do would be to return
		// `$domFragment.innerHTML` for the current scenarios where
		// `unpackDOMFragments` is used (expanded attribute
		// values and reparses thereof) but we'd need to remove
		// the span wrapping and typeof annotation of extension
		// content and nowikis. Since we're primarily expecting
		// to find <translate> and <nowiki> here, this will do.
		$out .= $domFragment->textContent;
		if ( $token instanceof TagTk ) {
			// Skip the EndTagTK
			$i += 1;
			Assert::invariant(
				$i >= $total || $tokens[$i] instanceof EndTagTk,
				"tag should be followed by endtag"
			);
		}
	}
	return $out;
}
/**
 * Convert an array of key-value pairs into a hash of keys to values.
 * For duplicate keys, the last entry wins.
 *
 * Keys are stringified, trimmed and lower-cased; values are stringified and
 * then passed through tokenTrim().
 *
 * @param array<KV> $kvs
 * @return array<string,array<Token|string>>|array<string,string>
 */
public static function kvToHash( array $kvs ): array {
	$hash = [];
	foreach ( $kvs as $kv ) {
		$name = mb_strtolower( trim( self::tokensToString( $kv->k ) ) );

		// SSS FIXME: Temporary fix to handle extensions which use
		// entities in attribute values. We need more robust handling
		// of non-string template attribute values in general.
		$value = self::tokensToString( $kv->v );

		$hash[$name] = self::tokenTrim( $value );
	}
	return $hash;
}
/**
 * Trim space and newlines from leading and trailing text tokens.
 *
 * A plain string is simply trim()med and any other non-array value is
 * returned untouched. For an array, the walk from each end replaces NlTk
 * tokens with '' and strips whitespace from strings, stopping at the first
 * string that is still non-empty after stripping or at the first token that
 * is neither a string nor an NlTk.
 *
 * @param string|Token|(Token|string)[] $tokens
 * @return string|Token|(Token|string)[]
 */
public static function tokenTrim( $tokens ) {
	if ( is_string( $tokens ) ) {
		return trim( $tokens );
	}
	if ( !is_array( $tokens ) ) {
		return $tokens;
	}

	// strip leading space
	// (iterating by value works on a snapshot, so writing back by key is
	// safe and leaves no reference behind in the returned array)
	foreach ( $tokens as $key => $token ) {
		if ( $token instanceof NlTk ) {
			$tokens[$key] = '';
			continue;
		}
		if ( !is_string( $token ) ) {
			break;
		}
		$token = preg_replace( '/^\s+/', '', $token, 1 );
		$tokens[$key] = $token;
		if ( $token !== '' ) {
			break;
		}
	}

	// strip trailing space
	for ( $i = count( $tokens ) - 1; $i >= 0; $i-- ) {
		$token = $tokens[$i] ?? null;
		if ( $token instanceof NlTk ) {
			// replace newline with empty
			$tokens[$i] = '';
			continue;
		}
		if ( !is_string( $token ) ) {
			break;
		}
		$token = preg_replace( '/\s+$/D', '', $token, 1 );
		$tokens[$i] = $token;
		if ( $token !== '' ) {
			break;
		}
	}

	return $tokens;
}
/**
 * Checks whether the provided meta tag token is an annotation start token:
 * one of its `typeof` values matches WTUtils::ANNOTATION_META_TYPE_REGEXP
 * and that value does not end in '/End'.
 *
 * @param Token $t
 * @return bool
 */
public static function isAnnotationStartToken( Token $t ): bool {
	$type = self::matchTypeOf( $t, WTUtils::ANNOTATION_META_TYPE_REGEXP );
	if ( $type === null ) {
		return false;
	}
	return !str_ends_with( $type, '/End' );
}
/**
 * Checks whether the provided meta tag token is an annotation end token:
 * one of its `typeof` values matches WTUtils::ANNOTATION_META_TYPE_REGEXP
 * and that value ends in '/End'.
 *
 * @param Token $t
 * @return bool
 */
public static function isAnnotationEndToken( Token $t ): bool {
	$type = self::matchTypeOf( $t, WTUtils::ANNOTATION_META_TYPE_REGEXP );
	if ( $type === null ) {
		return false;
	}
	return str_ends_with( $type, '/End' );
}
}
File Metadata
Details
Attached
Mime Type
text/x-php
Expires
Sat, May 16, 21:03 (1 d, 16 h)
Storage Engine
local-disk
Storage Format
Raw Data
Storage Handle
8e/bf/f694bbdbedf2954067bdc7cf2a63
Default Alt Text
TokenUtils.php (22 KB)
Attached To
Mode
rMWPROD MediaWiki Production
Attached
Detach File
Event Timeline
Log In to Comment