Page Menu
Home
WickedGov Phorge
Search
Configure Global Search
Log In
Files
F1432349
WTUtils.php
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Flag For Later
Award Token
Size
36 KB
Referenced Files
None
Subscribers
None
WTUtils.php
View Options
<?php
declare
(
strict_types
=
1
);
// Suppress UnusedPluginSuppression because
// Phan on PHP 7.4 and PHP 8.1 need different suppressions
// @phan-file-suppress UnusedPluginSuppression,UnusedPluginFileSuppression
namespace
Wikimedia\Parsoid\Utils
;
use
DOMException
;
use
Wikimedia\Assert\UnreachableException
;
use
Wikimedia\Bcp47Code\Bcp47Code
;
use
Wikimedia\Parsoid\Config\Env
;
use
Wikimedia\Parsoid\DOM\Comment
;
use
Wikimedia\Parsoid\DOM\Document
;
use
Wikimedia\Parsoid\DOM\DocumentFragment
;
use
Wikimedia\Parsoid\DOM\Element
;
use
Wikimedia\Parsoid\DOM\Node
;
use
Wikimedia\Parsoid\DOM\Text
;
use
Wikimedia\Parsoid\Ext\ExtensionTagHandler
;
use
Wikimedia\Parsoid\NodeData\DataParsoid
;
use
Wikimedia\Parsoid\NodeData\I18nInfo
;
use
Wikimedia\Parsoid\NodeData\TempData
;
use
Wikimedia\Parsoid\Tokens\CommentTk
;
use
Wikimedia\Parsoid\Wikitext\Consts
;
use
Wikimedia\Parsoid\Wt2Html\Frame
;
/**
* These utilities pertain to querying / extracting / modifying wikitext information from the DOM.
*/
class
WTUtils
{
/**
 * Regexp matching the typeof value that identifies the first wrapper
 * element of encapsulated content (transclusion, template parameter,
 * language variant or extension).
 *
 * The pattern is kept on one line: it is compiled without the `x`
 * modifier, so a line break inside the literal would be matched literally.
 */
private const FIRST_ENCAP_REGEXP =
	'#(?:^|\s)(mw:(?:Transclusion|Param|LanguageVariant|Extension(/\S+)))(?=$|\s)#D';

/**
 * Regex corresponding to FIRST_ENCAP_REGEXP, but excluding extensions. If FIRST_ENCAP_REGEXP is
 * updated, this one should be as well.
 */
private const NON_EXTENSION_ENCAP_REGEXP =
	'#(?:^|\s)(mw:(?:Transclusion|Param|LanguageVariant))(?=$|\s)#D';

/**
 * Regexp for checking marker metas typeofs representing
 * transclusion markup or template param markup.
 * An optional "/End" suffix is accepted.
 */
private const TPL_META_TYPE_REGEXP = '#^mw:(?:Transclusion|Param)(?:/End)?$#D';

/**
 * Regexp for checking marker metas typeofs representing
 * annotation markup. Capture group 1 is the annotation type; an
 * optional "/End" suffix is accepted.
 */
public const ANNOTATION_META_TYPE_REGEXP = '#^mw:(?:Annotation/([\w\d]+))(?:/End)?$#uD';
/**
 * Tell whether a data-parsoid object records that the original wikitext
 * used a literal HTML element (like table or p).
 *
 * @param DataParsoid $dp
 * @return bool True only when the `stx` property is set and equals 'html'.
 */
public static function hasLiteralHTMLMarker( DataParsoid $dp ): bool {
	$syntax = $dp->stx ?? null;
	return $syntax === 'html';
}
/**
 * Apply {@link #hasLiteralHTMLMarker} to the data-parsoid of a node.
 *
 * @param ?Node $node
 * @return bool False for null and for anything that is not an Element.
 */
public static function isLiteralHTMLNode( ?Node $node ): bool {
	if ( !( $node instanceof Element ) ) {
		return false;
	}
	$dp = DOMDataUtils::getDataParsoid( $node );
	return self::hasLiteralHTMLMarker( $dp );
}
/**
 * Is the node's name listed in Consts::$ZeroWidthWikitextTags while the
 * node itself is not a literal HTML node?
 *
 * @param Node $node
 * @return bool
 */
public static function isZeroWidthWikitextElt( Node $node ): bool {
	$name = DOMCompat::nodeName( $node );
	if ( !isset( Consts::$ZeroWidthWikitextTags[$name] ) ) {
		return false;
	}
	return !self::isLiteralHTMLNode( $node );
}
/**
 * Is `$node` a block node that is also visible in wikitext?
 * An example of an invisible block node is a `<p>`-tag that
 * Parsoid generated, or a `<ul>`, `<ol>` tag.
 *
 * @param Node $node
 * @return bool True for wikitext block nodes that are not zero-width
 *   wikitext elements.
 */
public static function isBlockNodeWithVisibleWT( Node $node ): bool {
	if ( !DOMUtils::isWikitextBlockNode( $node ) ) {
		return false;
	}
	return !self::isZeroWidthWikitextElt( $node );
}
/**
 * Detect an A-node that uses [[..]] style syntax. The rel-type alone is
 * not sufficient anymore since mw:ExtLink is used for all three link
 * syntaxes (wikilinks, ext links, url links).
 *
 * @param Element $node
 * @return bool True for an `a` element that either has rel mw:WikiLink,
 *   or has a `stx` value other than "url" and "magiclink".
 */
public static function isATagFromWikiLinkSyntax( Element $node ): bool {
	if ( DOMCompat::nodeName( $node ) !== 'a' ) {
		return false;
	}

	$dp = DOMDataUtils::getDataParsoid( $node );
	if ( DOMUtils::hasRel( $node, 'mw:WikiLink' ) ) {
		return true;
	}

	$stx = $dp->stx ?? null;
	return $stx !== null && $stx !== "url" && $stx !== "magiclink";
}
/**
 * Detect an A-node that uses ext-link syntax.
 * The rel attribute is not sufficient anymore since mw:ExtLink is used
 * for multiple link types.
 *
 * @param Element $node
 * @return bool True for an `a` element with rel mw:ExtLink whose `stx`
 *   is either unset or neither "url" nor "magiclink".
 */
public static function isATagFromExtLinkSyntax( Element $node ): bool {
	if ( DOMCompat::nodeName( $node ) !== 'a' ) {
		return false;
	}

	$dp = DOMDataUtils::getDataParsoid( $node );
	if ( !DOMUtils::hasRel( $node, 'mw:ExtLink' ) ) {
		return false;
	}

	// An unset stx (null) passes both comparisons, like the explicit
	// "not set" case.
	$stx = $dp->stx ?? null;
	return $stx !== "url" && $stx !== "magiclink";
}
/**
 * Detect an A-node that uses url-link syntax.
 * The rel attribute is not sufficient anymore since mw:ExtLink is used
 * for multiple link types.
 *
 * @param Element $node
 * @return bool True for an `a` element with rel mw:ExtLink and stx "url".
 */
public static function isATagFromURLLinkSyntax( Element $node ): bool {
	if ( DOMCompat::nodeName( $node ) !== 'a' ) {
		return false;
	}

	$dp = DOMDataUtils::getDataParsoid( $node );
	if ( !DOMUtils::hasRel( $node, 'mw:ExtLink' ) ) {
		return false;
	}
	return ( $dp->stx ?? null ) === "url";
}
/**
 * Detect an A-node that uses magic-link syntax.
 * The rel attribute is not sufficient anymore since mw:ExtLink is used
 * for multiple link types.
 *
 * @param Element $node
 * @return bool True for an `a` element with rel mw:ExtLink and stx
 *   'magiclink'.
 */
public static function isATagFromMagicLinkSyntax( Element $node ): bool {
	if ( DOMCompat::nodeName( $node ) !== 'a' ) {
		return false;
	}

	$dp = DOMDataUtils::getDataParsoid( $node );
	if ( !DOMUtils::hasRel( $node, 'mw:ExtLink' ) ) {
		return false;
	}
	return ( $dp->stx ?? null ) === 'magiclink';
}
/**
 * Match a node's typeof against the template/param marker pattern
 * (TPL_META_TYPE_REGEXP) to see whether it is a template expansion.
 *
 * @param Element $node
 * @return ?string The matched type, or null if no match.
 */
public static function matchTplType( Element $node ): ?string {
	$matched = DOMUtils::matchTypeOf( $node, self::TPL_META_TYPE_REGEXP );
	return $matched;
}
/**
 * Check whether a typeof indicates that it signifies an
 * expanded attribute.
 *
 * @param Element $node
 * @return bool True when a typeof of the node is mw:ExpandedAttrs,
 *   optionally followed by "/"-separated suffixes.
 */
public static function hasExpandedAttrsType( Element $node ): bool {
	// The pattern must stay on one line: without the `x` modifier a line
	// break inside the literal is matched literally.
	$matched = DOMUtils::matchTypeOf( $node, '/^mw:ExpandedAttrs(\/\S+)*$/' );
	return $matched !== null;
}
/**
 * Is the node a meta tag whose typeof marks a template expansion
 * (start or end marker)?
 *
 * @param Node $node
 * @return bool
 */
public static function isTplMarkerMeta( Node $node ): bool {
	$type = DOMUtils::matchNameAndTypeOf(
		$node, 'meta', self::TPL_META_TYPE_REGEXP
	);
	return $type !== null;
}
/**
 * Is the node a meta marking the start of a template expansion, i.e. a
 * template marker meta whose type does not end in "/End"?
 *
 * @param Node $node
 * @return bool
 */
public static function isTplStartMarkerMeta( Node $node ): bool {
	$type = DOMUtils::matchNameAndTypeOf(
		$node, 'meta', self::TPL_META_TYPE_REGEXP
	);
	if ( $type === null ) {
		return false;
	}
	return !str_ends_with( $type, '/End' );
}
/**
 * Is the node a meta marking the end of a template expansion, i.e. a
 * template marker meta whose type ends in "/End"?
 *
 * @param Node $node
 * @return bool
 */
public static function isTplEndMarkerMeta( Node $node ): bool {
	$type = DOMUtils::matchNameAndTypeOf(
		$node, 'meta', self::TPL_META_TYPE_REGEXP
	);
	if ( $type === null ) {
		return false;
	}
	return str_ends_with( $type, '/End' );
}
/**
 * Find the first wrapper element of encapsulated content.
 *
 * Starting at $node, walks backwards over non-deleted previous siblings
 * for as long as they are elements carrying the same about id, then
 * checks that the node reached has an encapsulation typeof.
 *
 * @param Node $node
 * @return Element|null Null if $node is not an encapsulated DOM forest
 *   root, or if the earliest same-about sibling is not a first
 *   encapsulation wrapper.
 */
public static function findFirstEncapsulationWrapperNode( Node $node ): ?Element {
	if ( !self::isEncapsulatedDOMForestRoot( $node ) ) {
		return null;
	}
	/** @var Element $node */
	DOMUtils::assertElt( $node );

	$about = DOMCompat::getAttribute( $node, 'about' );
	$first = $node;
	while ( true ) {
		$prev = DiffDOMUtils::previousNonDeletedSibling( $first );
		if (
			!( $prev instanceof Element ) ||
			DOMCompat::getAttribute( $prev, 'about' ) !== $about
		) {
			break;
		}
		$first = $prev;
	}

	// NOTE: findFirstEncapsulationWrapperNode can be called by code
	// even before templates have been fully encapsulated everywhere.
	// ProcessTreeBuilderFixups::removeAutoInsertedEmptyTags is the main
	// culprit here and it makes the contract for this helper murky
	// by hiding potential brokenness since this should never return null
	// once all templates have been encapsulated!
	if ( !self::isFirstEncapsulationWrapperNode( $first ) ) {
		return null;
	}
	'@phan-var Element $first'; // @var Element $first
	return $first;
}
/**
 * Distinguish a DOM node newly added during an edit session from an
 * existing node that came from parsed wikitext.
 *
 * As written, this function can only be used on non-template/extension content
 * or on the top-level nodes of template/extension content. This test will
 * return the wrong results on non-top-level $nodes of template/extension content.
 *
 * @param Node $node
 * @return bool Value of the IS_NEW temp flag on the relevant element;
 *   always false for non-element nodes.
 */
public static function isNewElt( Node $node ): bool {
	// Newness cannot be determined on text/comment nodes.
	if ( !( $node instanceof Element ) ) {
		return false;
	}

	// For template/extension content, newness is recorded on the
	// encapsulation wrapper, so prefer that node when there is one.
	$wrapper = self::findFirstEncapsulationWrapperNode( $node );
	$target = $wrapper ?? $node;

	$dp = DOMDataUtils::getDataParsoid( $target );
	return $dp->getTempFlag( TempData::IS_NEW );
}
/**
 * Is the node a pre caused by indentation in the original wikitext,
 * i.e. a `pre` that is not a literal HTML node?
 *
 * @param Node $node
 * @return bool
 */
public static function isIndentPre( Node $node ): bool {
	if ( DOMCompat::nodeName( $node ) !== "pre" ) {
		return false;
	}
	return !self::isLiteralHTMLNode( $node );
}
/**
 * Is the node generated media that is not wrapped in a `figure`
 * element (i.e. span or figure-inline)?
 *
 * @param Node $node
 * @return bool
 */
public static function isInlineMedia( Node $node ): bool {
	if ( !self::isGeneratedFigure( $node ) ) {
		return false;
	}
	// span, figure-inline
	return DOMCompat::nodeName( $node ) !== 'figure';
}
/**
 * Does the node carry a media typeof (mw:File, or one of the legacy
 * mw:Image / mw:Video / mw:Audio types, optionally with a "/" suffix)?
 *
 * @param Node $node
 * @return bool
 */
public static function isGeneratedFigure( Node $node ): bool {
	// TODO: Remove "Image|Video|Audio" when version 2.4.0 of the content
	// is no longer supported
	$mediaType = DOMUtils::matchTypeOf(
		$node, '#^mw:(File|Image|Video|Audio)($|/)#D'
	);
	return $mediaType !== null;
}
/**
 * Find how much offset is necessary for the DSR of an
 * indent-originated pre tag.
 *
 * The correction is the number of newlines in the text, except that a
 * trailing newline on the last child of the pre is not counted.
 *
 * @param Node $textNode Expected to be a text node; this is not checked.
 * @return int 0 when the node has no parent or its parent is not an
 *   indent-pre.
 */
public static function indentPreDSRCorrection( Node $textNode ): int {
	// NOTE: This assumes a text-node and doesn't check that it is one.
	//
	// FIXME: Doesn't handle text nodes that are not direct children of the pre
	$parent = $textNode->parentNode;
	// A detached node has no parent; isIndentPre() does not accept null.
	if ( $parent === null || !self::isIndentPre( $parent ) ) {
		return 0;
	}

	// nodeValue is null for node types without a value; treat as empty.
	$text = $textNode->nodeValue ?? '';
	$numNLs = substr_count( $text, "\n" );

	// We don't want the trailing newline of the last child of the pre
	// to contribute a pre-correction since it doesn't add new content
	// in the pre-node after the text
	if ( $parent->lastChild === $textNode && str_ends_with( $text, "\n" ) ) {
		$numNLs--;
	}
	return $numNLs;
}
/**
 * Check if $node is a root in an encapsulated DOM forest: an element
 * whose `about` attribute is a Parsoid object id.
 *
 * @param Node $node
 * @return bool
 */
public static function isEncapsulatedDOMForestRoot( Node $node ): bool {
	if ( !( $node instanceof Element ) ) {
		return false;
	}

	$about = DOMCompat::getAttribute( $node, 'about' );
	if ( $about === null ) {
		return false;
	}

	// FIXME: Ensure that our DOM spec clarifies this expectation
	return Utils::isParsoidObjectId( $about );
}
/**
 * Does $node represent a redirect link?
 *
 * @param ?Node $node
 * @return bool True for a `link` element whose rel matches
 *   mw:PageProp/redirect.
 */
public static function isRedirectLink( ?Node $node ): bool {
	if ( !( $node instanceof Element ) || DOMCompat::nodeName( $node ) !== 'link' ) {
		return false;
	}
	// Single-line pattern: a line break inside the literal would be
	// matched literally and the rel would never match.
	return DOMUtils::matchRel( $node, '#\bmw:PageProp/redirect\b#' ) !== null;
}
/**
 * Does $node represent a category link?
 *
 * @param ?Node $node
 * @return bool True for a `link` element whose rel matches
 *   mw:PageProp/Category.
 */
public static function isCategoryLink( ?Node $node ): bool {
	if ( !( $node instanceof Element ) || DOMCompat::nodeName( $node ) !== 'link' ) {
		return false;
	}
	// Single-line pattern: a line break inside the literal would be
	// matched literally and the rel would never match.
	return DOMUtils::matchRel( $node, '#\bmw:PageProp/Category\b#' ) !== null;
}
/**
 * Does $node represent a link that is sol-transparent?
 *
 * @param ?Node $node
 * @return bool True for a `link` element whose rel matches
 *   TokenUtils::SOL_TRANSPARENT_LINK_REGEX.
 */
public static function isSolTransparentLink( ?Node $node ): bool {
	if ( !( $node instanceof Element ) ) {
		return false;
	}
	if ( DOMCompat::nodeName( $node ) !== 'link' ) {
		return false;
	}
	$rel = DOMUtils::matchRel( $node, TokenUtils::SOL_TRANSPARENT_LINK_REGEX );
	return $rel !== null;
}
/**
 * Check if '$node' emits wikitext that is sol-transparent in wikitext form.
 * This is a test for wikitext that doesn't introduce line breaks.
 *
 * Comment, whitespace text $nodes, category links, redirect links, behavior
 * switches, and include directives currently satisfy this definition.
 *
 * This should come close to matching TokenUtils.isSolTransparent()
 *
 * @param Node $node
 * @return bool
 */
public static function emitsSolTransparentSingleLineWT( Node $node ): bool {
	if ( $node instanceof Text ) {
		// NB: We differ here to meet the nl condition: only spaces and
		// tabs count as whitespace, newlines do not.
		return (bool)preg_match( '/^[ \t]*$/D', $node->nodeValue );
	}

	// NB: The only metas in a DOM should be for behavior switches and
	// include directives, other than explicit HTML meta tags. This
	// differs from our counterpart in Util where ref meta tokens
	// haven't been expanded to spans yet.
	return self::isRenderingTransparentNode( $node );
}
/**
 * Is the node the span added to headings to carry fallback ids for when
 * legacy and HTML5 ids don't match up? This prevents broken links to
 * legacy ids.
 *
 * @param Node $node
 * @return bool True for a `span` with typeof mw:FallbackId.
 */
public static function isFallbackIdSpan( Node $node ): bool {
	$isFallback = DOMUtils::hasNameAndTypeOf( $node, 'span', 'mw:FallbackId' );
	return $isFallback;
}
/**
 * These are primarily 'metadata'-like $nodes that don't show up in output rendering.
 * - In Parsoid output, they are represented by link/meta tags.
 * - In the PHP parser, they are completely stripped from the input early on.
 * Because of this property, these rendering-transparent $nodes are also
 * SOL-transparent for the purposes of parsing behavior.
 *
 * @param Node $node
 * @return bool True for comments, sol-transparent links, non-annotation
 *   meta elements whose stx is not 'html', and fallback-id spans.
 */
public static function isRenderingTransparentNode( Node $node ): bool {
	// FIXME: Can we change this entire thing to
	// $node instanceof Comment ||
	// DOMUtils::getDataParsoid($node).stx !== 'html' &&
	// (DOMCompat::nodeName($node) === 'meta' || DOMCompat::nodeName($node) === 'link')
	//
	if ( $node instanceof Comment || self::isSolTransparentLink( $node ) ) {
		return true;
	}

	// Catch-all for everything else.
	$isWikitextMeta = $node instanceof Element
		&& DOMCompat::nodeName( $node ) === 'meta'
		&& !self::isMarkerAnnotation( $node )
		&& ( DOMDataUtils::getDataParsoid( $node )->stx ?? '' ) !== 'html';

	return $isWikitextMeta || self::isFallbackIdSpan( $node );
}
/**
 * Is $node nested inside a table tag that uses HTML instead of native
 * wikitext?
 *
 * Walks up through consecutive table-tag ancestors and stops at the
 * first `<table>`.
 *
 * @param Node $node
 * @return bool
 */
public static function inHTMLTableTag( Node $node ): bool {
	for ( $p = $node->parentNode; DOMUtils::isTableTag( $p ); $p = $p->parentNode ) {
		if ( self::isLiteralHTMLNode( $p ) ) {
			return true;
		}
		if ( DOMCompat::nodeName( $p ) === 'table' ) {
			// Don't cross <table> boundaries
			return false;
		}
	}
	return false;
}
/**
 * Is $node the first wrapper element of encapsulated content, i.e. does
 * its typeof match FIRST_ENCAP_REGEXP?
 *
 * @param Node $node
 * @return bool
 */
public static function isFirstEncapsulationWrapperNode( Node $node ): bool {
	$encapType = DOMUtils::matchTypeOf( $node, self::FIRST_ENCAP_REGEXP );
	return $encapType !== null;
}
/**
 * Is $node the first wrapper element of extension content, i.e. does
 * one of its typeofs contain "mw:Extension/"?
 *
 * @param Node $node
 * @return bool
 */
public static function isFirstExtensionWrapperNode( Node $node ): bool {
	$extType = DOMUtils::matchTypeOf( $node, "#mw:Extension/#" );
	return $extType !== null;
}
/**
 * Checks whether a first encapsulation wrapper node is encapsulating an extension
 * that outputs MediaWiki Core DOM Spec HTML (https://www.mediawiki.org/wiki/Specs/HTML)
 *
 * @param Node $node
 * @param Env $env
 * @return bool False for non-extension encapsulations; otherwise true
 *   only when the extension tag config sets the option
 *   'outputHasCoreMwDomSpecMarkup' to exactly true.
 */
public static function isExtensionOutputtingCoreMwDomSpec( Node $node, Env $env ): bool {
	$nonExtType = DOMUtils::matchTypeOf( $node, self::NON_EXTENSION_ENCAP_REGEXP );
	if ( $nonExtType !== null ) {
		return false;
	}

	$extTagName = self::getExtTagName( $node );
	$extConfig = $env->getSiteConfig()->getExtTagConfig( $extTagName );
	return ( $extConfig['options']['outputHasCoreMwDomSpecMarkup'] ?? null ) === true;
}
/**
 * Is $node an encapsulation wrapper elt?
 *
 * All root-level $nodes of generated content are considered
 * encapsulation wrappers and share an about-id.
 *
 * @param Node $node
 * @return bool
 */
public static function isEncapsulationWrapper( Node $node ): bool {
	if ( !( $node instanceof Element ) ) {
		return false;
	}
	// True if it has an encapsulation type or while walking backwards
	// over elts with identical about ids, we run into a $node with an
	// encapsulation type.
	$firstWrapper = self::findFirstEncapsulationWrapperNode( $node );
	return $firstWrapper !== null;
}
/**
 * Is $node a DOMFragment wrapper?
 *
 * @param Node $node
 * @return bool True when a typeof is mw:DOMFragment, optionally with a
 *   "/sealed/<name>" suffix.
 */
public static function isDOMFragmentWrapper( Node $node ): bool {
	// See TokenUtils::hasDOMFragmentType
	// Single-line pattern: a line break inside the literal would be
	// matched literally.
	$fragType = DOMUtils::matchTypeOf( $node, '#^mw:DOMFragment(/sealed/\w+)?$#D' );
	return $fragType !== null;
}
/**
 * Is $node a sealed DOMFragment of a specific type?
 *
 * @param Node $node
 * @param string $type Suffix looked up as typeof "mw:DOMFragment/sealed/$type".
 * @return bool
 */
public static function isSealedFragmentOfType( Node $node, string $type ): bool {
	$sealedType = 'mw:DOMFragment/sealed/' . $type;
	return DOMUtils::hasTypeOf( $node, $sealedType );
}
/**
 * Is $node a Parsoid-generated <section> tag, i.e. a `section` element
 * carrying a data-mw-section-id attribute?
 *
 * @param Node $node
 * @return bool
 */
public static function isParsoidSectionTag( Node $node ): bool {
	if ( !( $node instanceof Element ) ) {
		return false;
	}
	if ( DOMCompat::nodeName( $node ) !== 'section' ) {
		return false;
	}
	return $node->hasAttribute( 'data-mw-section-id' );
}
/**
 * Is the $node from extension content?
 *
 * Checks $node and then each ancestor, stopping once DOMUtils::atTheTop()
 * holds, for an extension typeof.
 *
 * @param Node $node
 * @param ?string $extType If non-null (and non-empty), checks for that
 *   specific extension: the typeof must be exactly "mw:Extension/$extType".
 *   The name is matched literally, so asking for "ref" does not match
 *   "references".
 * @return bool
 */
public static function fromExtensionContent( Node $node, ?string $extType = null ): bool {
	if ( $extType === null || $extType === '' ) {
		$re = '#mw:Extension/\w+#';
	} else {
		// Quote the name so regex metacharacters in it are literal, and
		// anchor so that a name which is a prefix of another extension's
		// name cannot match that other extension.
		$re = '#^mw:Extension/' . preg_quote( $extType, '#' ) . '$#D';
	}

	while ( $node && !DOMUtils::atTheTop( $node ) ) {
		if ( DOMUtils::matchTypeOf( $node, $re ) !== null ) {
			return true;
		}
		$node = $node->parentNode;
	}
	return false;
}
/**
 * Is $node from encapsulated (template, extension, etc.) content?
 *
 * Checks $node and then each ancestor, stopping once DOMUtils::atTheTop()
 * holds, for a first encapsulation wrapper.
 *
 * @param Node $node
 * @return bool
 */
public static function fromEncapsulatedContent( Node $node ): bool {
	for ( $n = $node; $n && !DOMUtils::atTheTop( $n ); $n = $n->parentNode ) {
		if ( self::findFirstEncapsulationWrapperNode( $n ) !== null ) {
			return true;
		}
	}
	return false;
}
/**
 * Compute, when possible, the wikitext source for a $node, using the
 * node's DSR to slice the frame's source text.
 *
 * @param Frame $frame
 * @param Element $node
 * @return string|null Null if the node has no valid DSR.
 */
public static function getWTSource( Frame $frame, Element $node ): ?string {
	$dsr = DOMDataUtils::getDataParsoid( $node )->dsr ?? null;

	// FIXME: We could probably change the null return to ''
	// Just need to verify that code that uses this won't break
	if ( !Utils::isValidDSR( $dsr ) ) {
		return null;
	}
	return $dsr->substr( $frame->getSrcText() );
}
/**
 * Gets all siblings that follow '$node' that have an 'about' as
 * their about id.
 *
 * This is used to fetch transclusion/extension content by using
 * the about-id as the key. This works because
 * transclusion/extension content is a forest of dom-trees formed
 * by adjacent dom-nodes. This is the contract that template
 * encapsulation, dom-reuse, and VE code all have to abide by.
 *
 * The only exception to this adjacency rule is IEW nodes in
 * fosterable positions (in tables) which are not span-wrapped to
 * prevent them from getting fostered out.
 *
 * @param Node $node
 * @param ?string $about
 * @return Node[] $node followed by the matching siblings, with any
 *   trailing IEW nodes removed; just [ $node ] when $about is null.
 */
public static function getAboutSiblings( Node $node, ?string $about ): array {
	$nodes = [ $node ];
	if ( $about === null ) {
		return $nodes;
	}

	for ( $sib = $node->nextSibling; $sib; $sib = $sib->nextSibling ) {
		$sharesAbout = $sib instanceof Element &&
			DOMCompat::getAttribute( $sib, 'about' ) === $about;
		if (
			!$sharesAbout &&
			!( DOMUtils::isFosterablePosition( $sib ) && DOMUtils::isIEW( $sib ) )
		) {
			break;
		}
		$nodes[] = $sib;
	}

	// Remove already consumed trailing IEW, if any
	while ( $nodes !== [] && DOMUtils::isIEW( end( $nodes ) ) ) {
		array_pop( $nodes );
	}

	return $nodes;
}
/**
 * This function is only intended to be used on encapsulated $nodes
 * (Template/Extension/Param content).
 *
 * Given a '$node' that has an about-id, it is assumed that it is generated
 * by templates or extensions. This function skips over all
 * following content nodes and returns the first non-template node
 * that follows it.
 *
 * @param Node $node
 * @return Node|null The sibling after the last about-sibling, or simply
 *   the next sibling when $node has no about attribute.
 */
public static function skipOverEncapsulatedContent( Node $node ): ?Node {
	$about = null;
	if ( $node instanceof Element ) {
		$about = DOMCompat::getAttribute( $node, 'about' );
	}
	if ( $about === null ) {
		return $node->nextSibling;
	}

	// Guaranteed not to be empty. It will at least include $node.
	$aboutSiblings = self::getAboutSiblings( $node, $about );
	$last = end( $aboutSiblings );
	return $last->nextSibling;
}
/**
 * Comment encoding/decoding.
 *
 *  * Some relevant phab tickets: T94055, T70146, T60184, T95039
 *
 * The wikitext comment rule is very simple: <!-- starts a comment,
 * and --> ends a comment.  This means we can have almost anything as the
 * contents of a comment (except the string "-->", but see below), including
 * several things that are not valid in HTML5 comments:
 *
 *  * For one, the html5 comment parsing algorithm [0] leniently accepts
 *    --!> as a closing comment tag, which differs from the php+tidy combo.
 *
 *  * If the comment's data matches /^-?>/, html5 will end the comment.
 *    For example, <!-->stuff<--> breaks up as
 *    <!--> (the comment) followed by, stuff<--> (as text).
 *
 *  * Finally, comment data shouldn't contain two consecutive hyphen-minus
 *    characters (--), nor end in a hyphen-minus character (/-$/) as defined
 *    in the spec [1].
 *
 * We work around all these problems by using HTML entity encoding inside
 * the comment body.  The characters -, >, and & must be encoded in order
 * to prevent premature termination of the comment by one of the cases
 * above.  Encoding other characters is optional; all entities will be
 * decoded during wikitext serialization.
 *
 * In order to allow *arbitrary* content inside a wikitext comment,
 * including the forbidden string "-->" we also do some minimal entity
 * decoding on the wikitext.  We are also limited by our inability
 * to encode DSR attributes on the comment $node, so our wikitext entity
 * decoding must be 1-to-1: that is, there must be a unique "decoded"
 * string for every wikitext sequence, and for every decoded string there
 * must be a unique wikitext which creates it.
 *
 * The basic idea here is to replace every string ab*c with the string with
 * one more b in it.  This creates a string with no instance of "ac",
 * so you can use 'ac' to encode one more code point.  In this case
 * a is "--&", "b" is "amp;", and "c" is "gt;" and we use ac to
 * encode "-->" (which is otherwise unspeakable in wikitext).
 *
 * Note that any user content which does not match the regular
 * expression /--(>|&(amp;)*gt;)/ is unchanged in its wikitext
 * representation, as shown in the first two examples below.
 *
 *    User-authored comment text    Wikitext
 *    --------------------------    --------------
 *    & - >                         & - >
 *    Use &gt; here                 Use &gt; here
 *    -->                           --&gt;
 *    --&gt;                        --&amp;gt;
 *    --&amp;gt;                    --&amp;amp;gt;
 *
 * In the HTML5 DOM form, every '-', '>' and '&' of the user-authored
 * text is additionally replaced by an HTML entity.
 *
 * [0] http://www.w3.org/TR/html5/syntax.html#comment-start-state
 * [1] http://www.w3.org/TR/html5/syntax.html#comments
 *
 * Map a wikitext-escaped comment to an HTML DOM-escaped comment.
 *
 * @param string $comment Wikitext-escaped comment.
 * @return string DOM-escaped comment.
 */
public static function encodeComment( string $comment ): string {
	// Step 1: undo the wikitext escaping to recover the "true value" of
	// the comment. Only '--&(amp;)*gt;' sequences are touched.
	$trueValue = preg_replace_callback(
		'/--&(amp;)*gt;/',
		static fn ( $m ) => Utils::decodeWtEntities( $m[0] ),
		$comment
	);

	// Step 2: entity-encode '-', '>' and '&' in the "true value" so that
	// it can be safely embedded in an HTML comment.
	// This part doesn't have to map strings 1-to-1.
	$domValue = preg_replace_callback(
		'/[->&]/',
		static fn ( $m ) => Utils::entityEncodeAll( $m[0] ),
		$trueValue
	);
	return $domValue;
}
/**
 * Map an HTML DOM-escaped comment to a wikitext-escaped comment.
 *
 * This is the inverse of encodeComment(): after the HTML entities are
 * decoded, "-->" is written as "--&gt;" and every "--&(amp;)*gt;"
 * sequence gains one extra "amp;", so that the string "-->" never
 * appears in the result.
 *
 * @param string $comment DOM-escaped comment.
 * @return string Wikitext-escaped comment.
 */
public static function decodeComment( string $comment ): string {
	// Undo HTML entity escaping to obtain "true value" of comment.
	$trueValue = Utils::decodeWtEntities( $comment );

	// ok, now encode this "true value" of the comment in such a way
	// that the string "-->" never shows up.
	return preg_replace_callback(
		'/--(&(amp;)*gt;|>)/',
		static function ( $m ) {
			$s = $m[0];
			// The replacements are spelled with entities on purpose:
			// emitting a raw '-->' here would terminate the wikitext
			// comment early. substr( $s, 3 ) drops the leading '--&'.
			return $s === '-->' ? '--&gt;' : '--&amp;' . substr( $s, 3 );
		},
		$trueValue
	);
}
/**
 * Utility function: we often need to know the wikitext DSR length for
 * an HTML DOM comment value.
 *
 * @param Comment|CommentTk $node A comment node containing a DOM-escaped comment.
 * @return int The wikitext length in UTF-8 bytes necessary to encode this
 *  comment, including 7 characters for the `<!--` and `-->` delimiters
 *  (only 4 when the comment is flagged as unclosed).
 */
public static function decodedCommentLength( $node ): int {
	if ( $node instanceof Comment ) {
		$value = $node->nodeValue;
		$prev = $node->previousSibling;
		$unclosed = $prev &&
			DOMUtils::hasTypeOf( $prev, "mw:Placeholder/UnclosedComment" );
	} elseif ( $node instanceof CommentTk ) {
		$value = $node->value;
		// @phan-suppress-next-line PhanUndeclaredProperty dynamic property
		$unclosed = isset( $node->dataParsoid->unclosedComment );
	} else {
		throw new UnreachableException( 'Should not be here!' );
	}

	// 7 for the "<!--" and "-->" delimiters in wikitext; 4 when unclosed.
	$syntaxLen = $unclosed ? 4 : 7;
	return strlen( self::decodeComment( $value ) ) + $syntaxLen;
}
/**
 * Extract the lowercased extension tag name from a node's
 * "mw:Extension/<name>" typeof.
 *
 * @param Node $node
 * @return ?string Null when the node has no such typeof.
 */
public static function getExtTagName( Node $node ): ?string {
	$typeOf = DOMUtils::matchTypeOf( $node, '#^mw:Extension/(.+?)$#D' );
	if ( !$typeOf ) {
		return null;
	}
	$prefixLen = strlen( 'mw:Extension/' );
	return mb_strtolower( substr( $typeOf, $prefixLen ) );
}
/**
 * Look up the extension tag implementation registered in the site
 * config for the extension named by the node's typeof.
 *
 * @param Env $env
 * @param Node $node
 * @return ?ExtensionTagHandler Null when the node yields no extension
 *   tag name; otherwise whatever getExtTagImpl() returns.
 */
public static function getNativeExt( Env $env, Node $node ): ?ExtensionTagHandler {
	$extTagName = self::getExtTagName( $node );
	if ( !$extTagName ) {
		return null;
	}
	return $env->getSiteConfig()->getExtTagImpl( $extTagName );
}
/**
 * Is this an include directive?
 *
 * @param string $name Tag name; compared case-sensitively.
 * @return bool True for 'includeonly', 'noinclude' and 'onlyinclude'.
 */
public static function isIncludeTag( string $name ): bool {
	return in_array( $name, [ 'includeonly', 'noinclude', 'onlyinclude' ], true );
}
/**
 * Check if tag is annotation or extension directive
 * Adapted from similar grammar function
 *
 * The name is lowercased before any lookup.
 *
 * @param Env $env
 * @param string $name
 * @return bool True if the tag is an installed extension tag, an include
 *   directive, or an annotation tag.
 */
public static function isAnnOrExtTag( Env $env, string $name ): bool {
	$tagName = mb_strtolower( $name );
	$siteConfig = $env->getSiteConfig();

	$isInstalledExt = isset( $siteConfig->getExtensionTagNameMap()[$tagName] );
	$isIncludeTag = self::isIncludeTag( $tagName );
	$isAnnotationTag = $siteConfig->isAnnotationTag( $tagName );

	if ( !$isAnnotationTag ) {
		// avoid crashing on <tvar|name> even if we don't support that syntax explicitly
		$pipepos = strpos( $tagName, '|' );
		// Retry with the part before the pipe, unless there is no pipe
		// or the pipe is the very first character.
		if ( $pipepos !== false && $pipepos > 0 ) {
			$isAnnotationTag = $siteConfig->isAnnotationTag(
				substr( $tagName, 0, $pipepos )
			);
		}
	}

	return $isInstalledExt || $isIncludeTag || $isAnnotationTag;
}
/**
 * Creates a DocumentFragment containing a single span with type "mw:I18n". The created span
 * should be filled in with setDataNodeI18n to be valid.
 *
 * @param Document $doc
 * @return DocumentFragment
 * @throws DOMException
 */
public static function createEmptyLocalizationFragment( Document $doc ): DocumentFragment {
	$span = $doc->createElement( 'span' );
	DOMUtils::addTypeOf( $span, 'mw:I18n' );

	$frag = $doc->createDocumentFragment();
	$frag->appendChild( $span );
	return $frag;
}
/**
 * Creates an internationalization (i18n) message that will be localized into the page content
 * language. The returned DocumentFragment contains, as a single child, a span
 * element with the appropriate information for later localization.
 *
 * @param Document $doc
 * @param string $key message key for the message to be localized
 * @param ?array $params parameters for localization
 * @return DocumentFragment
 * @throws DOMException
 */
public static function createPageContentI18nFragment(
	Document $doc, string $key, ?array $params = null
): DocumentFragment {
	$frag = self::createEmptyLocalizationFragment( $doc );
	DOMDataUtils::setDataNodeI18n(
		$frag->firstChild,
		I18nInfo::createPageContentI18n( $key, $params )
	);
	return $frag;
}
/**
 * Creates an internationalization (i18n) message that will be localized into the user
 * interface language. The returned DocumentFragment contains, as a single child, a span
 * element with the appropriate information for later localization.
 *
 * @param Document $doc
 * @param string $key message key for the message to be localized
 * @param ?array $params parameters for localization
 * @return DocumentFragment
 * @throws DOMException
 */
public static function createInterfaceI18nFragment(
	Document $doc, string $key, ?array $params = null
): DocumentFragment {
	$frag = self::createEmptyLocalizationFragment( $doc );
	DOMDataUtils::setDataNodeI18n(
		$frag->firstChild,
		I18nInfo::createInterfaceI18n( $key, $params )
	);
	return $frag;
}
/**
 * Creates an internationalization (i18n) message that will be localized into an arbitrary
 * language. The returned DocumentFragment contains, as a single child, a span
 * element with the appropriate information for later localization.
 * The use of this method is discouraged; use ::createPageContentI18nFragment(...) and
 * ::createInterfaceI18nFragment(...) where possible rather than, respectively,
 * ::createLangI18nFragment(..., $wgContLang, ...) and
 * ::createLangI18nFragment(..., $wgLang,...).
 *
 * @param Document $doc
 * @param Bcp47Code $lang language for the localization
 * @param string $key message key for the message to be localized
 * @param ?array $params parameters for localization
 * @return DocumentFragment
 * @throws DOMException
 */
public static function createLangI18nFragment(
	Document $doc, Bcp47Code $lang, string $key, ?array $params = null
): DocumentFragment {
	$frag = self::createEmptyLocalizationFragment( $doc );
	DOMDataUtils::setDataNodeI18n(
		$frag->firstChild,
		I18nInfo::createLangI18n( $lang, $key, $params )
	);
	return $frag;
}
/**
 * Adds to $element the internationalization information needed for the attribute $name to be
 * localized in a later pass into the page content language. The element
 * is also tagged with typeof mw:LocalizedAttrs.
 *
 * @param Element $element element on which to add internationalization information
 * @param string $name name of the attribute whose value will be localized
 * @param string $key message key used for the attribute value localization
 * @param ?array $params parameters for localization
 */
public static function addPageContentI18nAttribute(
	Element $element, string $name, string $key, ?array $params = null
): void {
	$info = I18nInfo::createPageContentI18n( $key, $params );
	DOMUtils::addTypeOf( $element, 'mw:LocalizedAttrs' );
	DOMDataUtils::setDataAttrI18n( $element, $name, $info );
}
/**
 * Adds to $element the internationalization information needed for the attribute $name to be
 * localized in a later pass into the user interface language. The element
 * is also tagged with typeof mw:LocalizedAttrs.
 *
 * @param Element $element element on which to add internationalization information
 * @param string $name name of the attribute whose value will be localized
 * @param string $key message key used for the attribute value localization
 * @param ?array $params parameters for localization
 */
public static function addInterfaceI18nAttribute(
	Element $element, string $name, string $key, ?array $params = null
): void {
	$info = I18nInfo::createInterfaceI18n( $key, $params );
	DOMUtils::addTypeOf( $element, 'mw:LocalizedAttrs' );
	DOMDataUtils::setDataAttrI18n( $element, $name, $info );
}
/**
 * Adds to $element the internationalization information needed for the attribute $name to be
 * localized in a later pass into the provided language. The element is
 * also tagged with typeof mw:LocalizedAttrs.
 * The use of this method is discouraged; use ::addPageContentI18nAttribute(...) and
 * ::addInterfaceI18nAttribute(...) where possible rather than, respectively,
 * ::addLangI18nAttribute(..., $wgContLang, ...) and ::addLangI18nAttribute(..., $wgLang, ...).
 *
 * @param Element $element element on which to add internationalization information
 * @param Bcp47Code $lang language in which the message will be localized
 * @param string $name name of the attribute whose value will be localized
 * @param string $key message key used for the attribute value localization
 * @param ?array $params parameters for localization
 */
public static function addLangI18nAttribute(
	Element $element, Bcp47Code $lang, string $name, string $key, ?array $params = null
): void {
	$info = I18nInfo::createLangI18n( $lang, $key, $params );
	DOMUtils::addTypeOf( $element, 'mw:LocalizedAttrs' );
	DOMDataUtils::setDataAttrI18n( $element, $name, $info );
}
/**
 * Check whether a node is an annotation meta; if yes, returns its type.
 *
 * @param Node $node
 * @return ?string The matching typeof of a `meta` node, or null.
 */
public static function matchAnnotationMeta( Node $node ): ?string {
	$type = DOMUtils::matchNameAndTypeOf(
		$node, 'meta', self::ANNOTATION_META_TYPE_REGEXP
	);
	return $type;
}
/**
 * Extract the annotation type, excluding potential "/End" suffix; returns null if not a valid
 * annotation meta. &$isStart is set to true if the annotation is a start tag, false otherwise.
 * $isStart is left untouched when there is no match.
 *
 * @param Node $node
 * @param bool &$isStart
 * @return ?string The matched type, or null if no match.
 */
public static function extractAnnotationType( Node $node, bool &$isStart = false ): ?string {
	$typeOf = DOMUtils::matchTypeOf( $node, self::ANNOTATION_META_TYPE_REGEXP );
	if (
		$typeOf === null ||
		!preg_match( self::ANNOTATION_META_TYPE_REGEXP, $typeOf, $matches )
	) {
		return null;
	}

	$isStart = !str_ends_with( $typeOf, '/End' );
	// Group 1 of the pattern is the bare annotation type.
	return $matches[1];
}
/**
 * Check whether a node is a meta signifying the start of an annotated part of the DOM
 *
 * @param Node $node
 * @return bool
 */
public static function isAnnotationStartMarkerMeta( Node $node ): bool {
	$isMeta = $node instanceof Element && DOMCompat::nodeName( $node ) === 'meta';
	if ( !$isMeta ) {
		return false;
	}

	$isStart = false;
	$type = self::extractAnnotationType( $node, $isStart );
	if ( $type === null ) {
		return false;
	}
	return $isStart;
}
/**
 * Check whether a node is a meta signifying the end of an annotated part of the DOM
 *
 * @param Node $node
 * @return bool
 */
public static function isAnnotationEndMarkerMeta( Node $node ): bool {
	$isMeta = $node instanceof Element && DOMCompat::nodeName( $node ) === 'meta';
	if ( !$isMeta ) {
		return false;
	}

	$isStart = false;
	$type = self::extractAnnotationType( $node, $isStart );
	if ( $type === null ) {
		return false;
	}
	return !$isStart;
}
/**
 * Check whether the meta tag was moved from its initial position
 *
 * @param Node $node
 * @return bool The `wasMoved` data-parsoid value of an annotation meta
 *   element; false when the node is not one or the value is unset.
 */
public static function isMovedMetaTag( Node $node ): bool {
	if ( !( $node instanceof Element ) ) {
		return false;
	}
	if ( self::matchAnnotationMeta( $node ) === null ) {
		return false;
	}
	$parsoidData = DOMDataUtils::getDataParsoid( $node );
	return $parsoidData->wasMoved ?? false;
}
/**
 * Returns true if a node is a (start or end) annotation meta tag
 *
 * @param ?Node $n
 * @return bool Always false for null.
 */
public static function isMarkerAnnotation( ?Node $n ): bool {
	if ( $n === null ) {
		return false;
	}
	return self::matchAnnotationMeta( $n ) !== null;
}
/**
 * Extracts the media format from the typeof attribute string: the
 * segment after the first "/" of the matching media typeof.
 *
 * @param Element $node
 * @return string Empty string when there is no media typeof or it has
 *   no "/" suffix.
 */
public static function getMediaFormat( Element $node ): string {
	// TODO: Remove "Image|Video|Audio" when version 2.4.0 of the content
	// is no longer supported
	$mediaType = DOMUtils::matchTypeOf(
		$node, '#^mw:(File|Image|Video|Audio)(/|$)#'
	);
	$segments = explode( '/', $mediaType ?? '' );
	return $segments[1] ?? '';
}
/**
 * Does the node's media format ('Thumb' or 'Frame') call for a visible
 * caption?
 *
 * @param Element $node
 * @return bool
 */
public static function hasVisibleCaption( Element $node ): bool {
	$captionedFormats = [ 'Thumb', /* 'Manualthumb', FIXME(T305759) */ 'Frame' ];
	return in_array( self::getMediaFormat( $node ), $captionedFormats, true );
}
/**
 * Ref dom post-processing happens after adding media info, so the
 * linkbacks aren't available in the textContent added to the alt.
 * However, when serializing, they are in the caption elements.  So, this
 * special handler drops the linkbacks for the purpose of comparison.
 *
 * Concatenates the text of all descendant text nodes, skipping the
 * subtrees of metadata tags and of elements with typeof
 * mw:Extension/ref.
 *
 * @param Node $node
 * @return string
 */
public static function textContentFromCaption( Node $node ): string {
	$text = '';
	for ( $child = $node->firstChild; $child; $child = $child->nextSibling ) {
		if ( $child instanceof Text ) {
			$text .= $child->nodeValue;
			continue;
		}
		$descend = $child instanceof Element
			&& !DOMUtils::isMetaDataTag( $child )
			&& !DOMUtils::hasTypeOf( $child, "mw:Extension/ref" );
		if ( $descend ) {
			$text .= self::textContentFromCaption( $child );
		}
	}
	return $text;
}
}
File Metadata
Details
Attached
Mime Type
text/x-php
Expires
Sat, May 16, 21:39 (1 d, 8 h)
Storage Engine
local-disk
Storage Format
Raw Data
Storage Handle
7b/2b/e51fc3c4d10f875f961d7709bd47
Default Alt Text
WTUtils.php (36 KB)
Attached To
Mode
rMWPROD MediaWiki Production
Attached
Detach File
Event Timeline
Log In to Comment