Page MenuHomeWickedGov Phorge

WTSUtils.php
No OneTemporary

Size
12 KB
Referenced Files
None
Subscribers
None

WTSUtils.php

<?php
declare( strict_types = 1 );
namespace Wikimedia\Parsoid\Html2Wt;
use Wikimedia\Assert\UnreachableException;
use Wikimedia\Parsoid\Core\DomSourceRange;
use Wikimedia\Parsoid\DOM\Element;
use Wikimedia\Parsoid\DOM\Node;
use Wikimedia\Parsoid\DOM\Text;
use Wikimedia\Parsoid\NodeData\DataMw;
use Wikimedia\Parsoid\NodeData\DataMwAttrib;
use Wikimedia\Parsoid\Tokens\EndTagTk;
use Wikimedia\Parsoid\Tokens\KV;
use Wikimedia\Parsoid\Tokens\TagTk;
use Wikimedia\Parsoid\Utils\DiffDOMUtils;
use Wikimedia\Parsoid\Utils\DOMCompat;
use Wikimedia\Parsoid\Utils\DOMDataUtils;
use Wikimedia\Parsoid\Utils\DOMUtils;
use Wikimedia\Parsoid\Utils\PHPUtils;
use Wikimedia\Parsoid\Utils\WTUtils;
class WTSUtils {
/**
 * Check whether a string consists solely of whitespace and complete
 * HTML comments (`<!-- ... -->`).
 *
 * @param string $sep Text to test
 * @return bool True only when the whole string matches; false otherwise,
 *   including when preg_match() reports a failure.
 */
public static function isValidSep( string $sep ): bool {
	// TODO (Anomie): a no-backtracking group might simplify this regex:
	//   '/^(?>\s*<!--.*?-->)*\s*$/s'
	// although it is not clear that it would actually run faster.
	$matchCount = preg_match( '/^(\s|<!--([^\-]|-(?!->))*-->)*$/uD', $sep );
	return $matchCount === 1;
}
/**
 * Null-tolerant wrapper around DomSourceRange::hasValidTagWidths().
 *
 * @param ?DomSourceRange $dsr
 * @return bool False when $dsr is null; otherwise whatever
 *   $dsr->hasValidTagWidths() reports.
 */
public static function hasValidTagWidths( ?DomSourceRange $dsr ): bool {
	if ( $dsr === null ) {
		return false;
	}
	return $dsr->hasValidTagWidths();
}
/**
 * Convert the attributes of an element into a list of KV objects,
 * one KV per attribute, in the order DOMUtils::attributes() yields them.
 *
 * @param Element $node
 * @return KV[]
 */
public static function getAttributeKVArray( Element $node ): array {
	$result = [];
	foreach ( DOMUtils::attributes( $node ) as $attrName => $attrValue ) {
		$result[] = new KV( $attrName, $attrValue );
	}
	return $result;
}
/**
 * Build a `TagTk` (start tag token) from a DOM element.
 *
 * The token is given the element's node name, its attributes as KV
 * objects, and the element's data-parsoid object.
 *
 * @param Element $node
 * @return TagTk
 */
public static function mkTagTk( Element $node ): TagTk {
	$attrs = self::getAttributeKVArray( $node );
	$tagName = DOMCompat::nodeName( $node );
	$dataParsoid = DOMDataUtils::getDataParsoid( $node );

	return new TagTk( $tagName, $attrs, $dataParsoid );
}
/**
 * Build an `EndTagTk` (end tag token) from a DOM element.
 *
 * The token is given the element's node name, its attributes as KV
 * objects, and the element's data-parsoid object.
 *
 * @param Element $node
 * @return EndTagTk
 */
public static function mkEndTagTk( Element $node ): EndTagTk {
	$attrs = self::getAttributeKVArray( $node );
	$tagName = DOMCompat::nodeName( $node );
	$dataParsoid = DOMDataUtils::getDataParsoid( $node );

	return new EndTagTk( $tagName, $attrs, $dataParsoid );
}
/**
 * Work out the value to serialize for an attribute and whether that
 * attribute was modified, using the shadow data recorded in data-parsoid
 * (`a` is compared against the current value, `sa` supplies the value
 * reported as coming from source).
 *
 * For new elements, attributes are always considered modified. For old
 * elements, an attribute is only considered modified if shadow info
 * exists for it and does not match the current value.
 *
 * Returns array with data:
 * [
 *   value => mixed,
 *   modified => bool (If the value of the attribute changed since we parsed the wikitext),
 *   fromsrc => bool (Whether we got the value from source-based roundtripping)
 * ]
 *
 * @param Element $node
 * @param string $name Attribute name
 * @param ?string $curVal Current value of the attribute
 * @return array
 */
public static function getShadowInfo( Element $node, string $name, ?string $curVal ): array {
	$dp = DOMDataUtils::getDataParsoid( $node );

	// No `a` entry for this attribute: fall back to the current value,
	// flagged as modified only when the element itself is new.
	$hasRecordedValue = isset( $dp->a ) && array_key_exists( $name, $dp->a );
	if ( !$hasRecordedValue ) {
		return [
			'value' => $curVal,
			'modified' => WTUtils::isNewElt( $node ),
			'fromsrc' => false
		];
	}

	// The recorded value differs from the current one: the attribute was edited.
	if ( $dp->a[$name] !== $curVal ) {
		return [
			'value' => $curVal,
			'modified' => true,
			'fromsrc' => false
		];
	}

	// Unmodified, but there is no `sa` entry to restore from.
	$hasSourceValue = isset( $dp->sa ) && array_key_exists( $name, $dp->sa );
	if ( !$hasSourceValue ) {
		return [
			'value' => $curVal,
			'modified' => false,
			'fromsrc' => false
		];
	}

	// Unmodified and an `sa` entry exists: hand back that value.
	return [
		'value' => $dp->sa[$name],
		'modified' => false,
		'fromsrc' => true
	];
}
/**
 * Convenience wrapper around getShadowInfo() that reads the attribute's
 * current value from the node itself.
 *
 * Returns array with data:
 * [
 *   value => mixed,
 *   modified => bool (If the value of the attribute changed since we parsed the wikitext),
 *   fromsrc => bool (Whether we got the value from source-based roundtripping)
 * ]
 *
 * @param Element $node
 * @param string $name Attribute name
 * @return array
 */
public static function getAttributeShadowInfo( Element $node, string $name ): array {
	$currentValue = DOMCompat::getAttribute( $node, $name );
	return self::getShadowInfo( $node, $name, $currentValue );
}
/**
 * Produce the comment markup for a DOM comment's content: the content is
 * passed through WTUtils::decodeComment() and wrapped in comment
 * delimiters.
 *
 * @param string $comment Comment content as stored on the DOM node
 * @return string
 */
public static function commentWT( string $comment ): string {
	$decoded = WTUtils::decodeComment( $comment );
	return '<!--' . $decoded . '-->';
}
/**
 * In wikitext, did origNode occur next to a block node which has been
 * deleted? While scanning for the neighbour, DOM nodes that are
 * transparent in rendering are skipped. (See
 * emitsSolTransparentSingleLineWT for which nodes.)
 *
 * @param ?Node $origNode
 * @param bool $before True to scan previous siblings, false to scan next siblings
 * @return bool
 */
public static function nextToDeletedBlockNodeInWT(
	?Node $origNode, bool $before
): bool {
	if ( $origNode === null || DOMUtils::atTheTop( $origNode ) ) {
		return false;
	}

	$anchor = $origNode;
	for ( ; ; ) {
		// Find the nearest node that shows up in HTML (ignore nodes that show up
		// in wikitext but don't affect sol-state or HTML rendering -- note that
		// whitespace is being ignored, but that whitespace occurs between block nodes).
		$sibling = $anchor;
		do {
			$sibling = $before ? $sibling->previousSibling : $sibling->nextSibling;
			if ( DiffUtils::maybeDeletedNode( $sibling ) ) {
				return DiffUtils::isDeletedBlockNode( $sibling );
			}
		} while ( $sibling && WTUtils::emitsSolTransparentSingleLineWT( $sibling ) );

		// A non-transparent, non-deleted neighbour ends the search.
		if ( $sibling ) {
			return false;
		}

		// Ran out of siblings: walk up past zero-width wikitext parents.
		$parent = $anchor->parentNode;
		if ( !WTUtils::isZeroWidthWikitextElt( $parent ) ) {
			// If the parent occupies space in wikitext,
			// clearly, we are not next to a deleted block node!
			// We'll eventually hit BODY here and return.
			return false;
		}
		$anchor = $parent;
	}
}
/**
 * Check if whitespace preceding this node would NOT trigger an indent-pre.
 *
 * @param Node $node Node being examined
 * @param Node $sepNode Node whose leading separator text has already been
 *   stripped; when $node is this same node its text cannot be inspected
 * @return bool
 */
public static function precedingSpaceSuppressesIndentPre( Node $node, Node $sepNode ): bool {
	if ( $node !== $sepNode && $node instanceof Text ) {
		// if node is the same as sepNode, then the separator text
		// at the beginning of it has been stripped out already, and
		// we cannot use it to test it for indent-pre safety
		return (bool)preg_match( '/^[ \t]*\n/', $node->nodeValue );
	}

	if ( DOMCompat::nodeName( $node ) === 'br' ) {
		return true;
	}

	if ( WTUtils::isFirstEncapsulationWrapperNode( $node ) ) {
		DOMUtils::assertElt( $node );
		// Don't try any harder than this
		if ( !$node->hasChildNodes() ) {
			return true;
		}
		// The serialized content can be the empty string even though child
		// nodes exist; guard the offset read so that case simply yields
		// false instead of raising an "Uninitialized string offset" diagnostic.
		$innerHtml = DOMCompat::getInnerHTML( $node );
		return ( $innerHtml[0] ?? '' ) === "\n";
	}

	return WTUtils::isBlockNodeWithVisibleWT( $node );
}
/**
 * Build a short label for a node for use in trace output.
 *
 * Elements are labelled 'DIFF_MARK' when DiffUtils::isDiffMarker() says so,
 * otherwise 'NODE: <name>'. Text and comment nodes are labelled with their
 * JSON-encoded content; any other node type yields its bare node name.
 *
 * @param Node $node
 * @return string
 */
public static function traceNodeName( Node $node ): string {
	$type = $node->nodeType;

	if ( $type === XML_ELEMENT_NODE ) {
		if ( DiffUtils::isDiffMarker( $node ) ) {
			return 'DIFF_MARK';
		}
		return 'NODE: ' . DOMCompat::nodeName( $node );
	}

	if ( $type === XML_TEXT_NODE ) {
		return 'TEXT: ' . PHPUtils::jsonEncode( $node->nodeValue );
	}

	if ( $type === XML_COMMENT_NODE ) {
		$commentSrc = self::commentWT( $node->nodeValue );
		return 'CMT : ' . PHPUtils::jsonEncode( $commentSrc );
	}

	return DOMCompat::nodeName( $node );
}
/**
 * In selser mode, check if an unedited node's wikitext from source wikitext
 * is reusable as is.
 *
 * The answer depends on the kind of node: redirect links, nodes whose
 * source contains an open extended-range annotation tag, table cells,
 * first table rows, nested lists / list items and moved meta tags each
 * have their own rule; every other node is reusable.
 *
 * @param SerializerState $state
 * @param Node $node
 * @return bool
 */
public static function origSrcValidInEditedContext( SerializerState $state, Node $node ): bool {
	if ( WTUtils::isRedirectLink( $node ) ) {
		// Only reusable when it is the first child of a top-level parent.
		return DOMUtils::atTheTop( $node->parentNode ) && !$node->previousSibling;
	}

	if ( self::dsrContainsOpenExtendedRangeAnnotationTag( $node, $state ) ) {
		return false;
	}

	$nodeName = DOMCompat::nodeName( $node );

	if ( $nodeName === 'th' || $nodeName === 'td' ) {
		return self::isOrigCellSrcReusable( $node );
	}

	if ( $node instanceof Element && $nodeName === 'tr' &&
		empty( DOMDataUtils::getDataParsoid( $node )->startTagSrc )
	) {
		// If this <tr> didn't have a startTagSrc, it would have been
		// the first row of a table in original wikitext. So, it is safe
		// to reuse the original source for the row (without a "|-") as long as
		// it continues to be the first row of the table. If not, since we need to
		// insert a "|-" to separate it from the newly added row (in an edit),
		// we cannot simply reuse orig. wikitext for this <tr>.
		return !DiffDOMUtils::previousNonSepSibling( $node );
	}

	if ( DOMUtils::isNestedListOrListItem( $node ) ) {
		return self::isOrigNestedListSrcReusable( $node );
	}

	if ( WTUtils::isMovedMetaTag( $node ) ) {
		return false;
	}

	return true;
}

/**
 * Table-cell (<th>/<td>) rule for origSrcValidInEditedContext().
 *
 * The wikitext representation of a cell is dependent on cell position
 * (first cell is always single char), so reuse hinges on the state of
 * the previous sibling.
 *
 * @param Node $node A <th> or <td> node
 * @return bool
 */
private static function isOrigCellSrcReusable( Node $node ): bool {
	DOMUtils::assertElt( $node );

	// If there is no previous sibling, nothing to worry about.
	$prev = $node->previousSibling;
	if ( !$prev ) {
		return true;
	}

	if (
		DiffUtils::hasInsertedDiffMark( $prev ) ||
		DiffUtils::hasInsertedDiffMark( $node )
	) {
		return false;
	}

	// If previous sibling is unmodified, nothing to worry about.
	if (
		!DiffUtils::isDiffMarker( $prev ) &&
		!DiffUtils::directChildrenChanged( $prev )
	) {
		return true;
	}

	// If it didn't have a stx marker that indicated that the cell
	// showed up on the same line via the "||" or "!!" syntax, nothing
	// to worry about.
	return ( DOMDataUtils::getDataParsoid( $node )->stx ?? '' ) !== 'row';
}

/**
 * Nested list / list item rule for origSrcValidInEditedContext().
 *
 * @param Node $node A node for which DOMUtils::isNestedListOrListItem() holds
 * @return bool
 */
private static function isOrigNestedListSrcReusable( Node $node ): bool {
	if ( DOMUtils::isList( $node ) ) {
		// Lists never get bullets assigned to them, so a nested list
		// with no previous sibling cannot reuse its source.
		if ( !$node->previousSibling ) {
			return false;
		}
	} else {
		// Consider this wikitext snippet and its output below:
		//
		// ** a
		// *** b
		//
		// <ul><li-*>
		// <ul><li-*> a <-- cannot reuse source of this <li>
		// <ul><li-***> b</li></ul> <-- can reuse source of this <li>
		// </li></ul>
		// </li></ul>
		//
		// If we reuse the src for the inner li with the a, we'd be missing
		// one bullet because the tag handler for lists in the serializer only
		// emits start tag src when it hits a first child that isn't a list
		// element. We need to walk up and get the other bullet(s).
		//
		// The above logic can be condensed into this observation:
		// reusable nested <li> nodes will always have multiple bullets,
		// i.e. an opening width of at least 2.
		$dp = DOMDataUtils::getDataParsoid( $node );
		if ( !isset( $dp->dsr ) || $dp->dsr->openWidth < 2 ) {
			return false;
		}
	}

	// If a previous sibling was modified, we can't reuse the start dsr.
	for ( $prev = $node->previousSibling; $prev; $prev = $prev->previousSibling ) {
		if ( DiffUtils::isDiffMarker( $prev ) || DiffUtils::hasInsertedDiffMark( $prev ) ) {
			return false;
		}
	}

	return true;
}
/**
 * We keep track in $state of all extended ranges that are currently open by a <meta> tag.
 * This method checks whether the wikitext source pointed by the dsr of the node contains either
 * an opening or closing tag matching that annotation (<translate> or </translate> for example.)
 *
 * Only entries of $state->openAnnotations with a truthy value are checked.
 * Non-element nodes and nodes without dsr never match.
 *
 * @param Node $node
 * @param SerializerState $state
 * @return bool
 */
private static function dsrContainsOpenExtendedRangeAnnotationTag( Node $node,
	SerializerState $state
): bool {
	if ( empty( $state->openAnnotations ) || !$node instanceof Element ) {
		return false;
	}

	$dsr = DOMDataUtils::getDataParsoid( $node )->dsr ?? null;
	if ( !$dsr ) {
		return false;
	}

	$src = $state->getOrigSrc( $dsr->innerRange() );
	foreach ( $state->openAnnotations as $ann => $extended ) {
		if ( !$extended ) {
			continue;
		}
		// The pattern needs explicit delimiters: without them PCRE takes the
		// leading '<' and trailing '>' as bracket-style delimiters, which
		// reduces the effective regex to "/?NAME.*" and makes any mention of
		// the annotation name in the source count as a tag.
		// The (string) cast keeps preg_quote() happy under strict_types
		// should an array key ever be an integer.
		$tagPattern = '#</?' . preg_quote( (string)$ann, '#' ) . '.*>#';
		if ( preg_match( $tagPattern, $src ) ) {
			return true;
		}
	}

	return false;
}
/**
 * Look up an attribute by key text in $dataMw->attribs and return the
 * first match.
 *
 * Unless $keep is set, the matched entry is also removed from
 * $dataMw->attribs, except when its value has an 'html' entry, in which
 * case it is left in place.
 *
 * FIXME: This method should probably be moved to DOMDataUtils class since
 * it is used by both html2wt and wt2html code
 *
 * @param DataMw $dataMw
 * @param string $key Key text to look for (compared against a string key
 *   directly, or against the 'txt' entry of an array key)
 * @param bool $keep Whether to leave the matched entry in $dataMw->attribs
 * @return ?DataMwAttrib The matched attribute, or null when none matches
 */
public static function getAttrFromDataMw(
	DataMw $dataMw, string $key, bool $keep
): ?DataMwAttrib {
	$attribs = $dataMw->attribs ?? [];

	$foundAt = null;
	foreach ( $attribs as $idx => $attrib ) {
		$attribKey = $attrib->key;
		if ( is_string( $attribKey ) ) {
			$keyText = $attribKey;
		} elseif ( is_array( $attribKey ) ) {
			$keyText = $attribKey['txt'] ?? null;
		} else {
			throw new UnreachableException( 'Control should never get here!' );
		}

		if ( $keyText === $key ) {
			$foundAt = $idx;
			break;
		}
	}

	if ( $foundAt === null ) {
		return null;
	}

	$match = $attribs[$foundAt];
	if ( $keep || isset( $match->value['html'] ) ) {
		return $match;
	}

	// Drop the matched entry and write the shortened list back.
	array_splice( $attribs, $foundAt, 1 );
	$dataMw->attribs = $attribs;
	return $match;
}
/**
 * Escape `<nowiki>` tags.
 *
 * Opening, closing and self-closing nowiki tags (matched
 * case-insensitively) have their angle brackets replaced by `&lt;` and
 * `&gt;`; all other text is left as is.
 *
 * @param string $text
 * @return string
 */
public static function escapeNowikiTags( string $text ): string {
	$nowikiTagRe = '#<(/?nowiki\s*/?\s*)>#i';
	$escapedTag = '&lt;$1&gt;';
	return preg_replace( $nowikiTagRe, $escapedTag, $text );
}
}

File Metadata

Mime Type
text/x-php
Expires
Sat, May 16, 21:37 (1 d, 10 h)
Storage Engine
local-disk
Storage Format
Raw Data
Storage Handle
82/b3/b060c2362baea338533463cb7555
Default Alt Text
WTSUtils.php (12 KB)

Event Timeline