Page MenuHomeWickedGov Phorge

ConstrainedText.php
No OneTemporary

Size
12 KB
Referenced Files
None
Subscribers
None

ConstrainedText.php

<?php
declare( strict_types = 1 );
namespace Wikimedia\Parsoid\Html2Wt\ConstrainedText;
use Wikimedia\Parsoid\Config\Env;
use Wikimedia\Parsoid\DOM\Element;
use Wikimedia\Parsoid\DOM\Node;
use Wikimedia\Parsoid\NodeData\DataParsoid;
use Wikimedia\Parsoid\Utils\DiffDOMUtils;
use Wikimedia\Parsoid\Utils\DOMCompat;
use Wikimedia\Parsoid\Utils\DOMDataUtils;
use Wikimedia\Parsoid\Utils\DOMUtils;
use Wikimedia\Parsoid\Utils\PHPUtils;
use Wikimedia\Parsoid\Utils\Utils;
/**
 * A chunk of wikitext output. This base class contains the
 * wikitext and a pointer to the DOM node which is responsible for
 * generating it. Subclasses can add additional properties to record
 * context or wikitext boundary restrictions for proper escaping.
 * The chunk is serialized with the `escape` method, which might
 * alter the wikitext in order to ensure it doesn't run together
 * with its context (usually by adding `<nowiki>` tags).
 *
 * The main entry point is the static function `ConstrainedText::escapeLine()`.
 */
class ConstrainedText {
	/**
	 * This adds necessary escapes to a line of chunks. We provide
	 * the `ConstrainedText#escape` function with its left and right
	 * context, and it can determine what escapes are needed.
	 *
	 * The `line` parameter is an array of `ConstrainedText` *chunks*
	 * which make up a line (or part of a line, in some cases of nested
	 * processing).
	 *
	 * @param ConstrainedText[] $line Chunks, indexed 0..n-1 in output order.
	 * @return string The concatenated, escaped wikitext for the whole line.
	 */
	public static function escapeLine( array $line ): string {
		// The left context will be precise (that is, it is the result
		// of `ConstrainedText#escape` and will include any escapes
		// triggered by chunks on the left), but the right context
		// is just the (unescaped) text property from the chunk.
		// As we work left to right we will piece together a fully-escaped
		// string. Be careful not to shoot yourself in the foot -- if the
		// escaped text is significantly different from the chunk's `text`
		// property, the preceding chunk may not have made the correct
		// decisions about emitting an escape suffix. We could solve
		// this by looping until the state converges (or until we detect
		// a loop) but for now let's hope that's not necessary.
		$state = new State( $line );
		$safeLeft = '';
		// The list of chunks is not modified below, so its size is loop-invariant.
		$numChunks = count( $line );
		// `$state->pos` is deliberately used as the loop counter so that
		// `escape()` implementations can see which chunk is being processed.
		for ( $state->pos = 0; $state->pos < $numChunks; $state->pos++ ) {
			$chunk = $line[$state->pos];
			// Process the escapes for this chunk, given escaped previous chunk.
			// Drop this chunk's own (unescaped) text from the front of the
			// right context before asking the chunk to escape itself.
			$state->rightContext = substr( $state->rightContext, strlen( $chunk->text ) );
			$thisEscape = $chunk->escape( $state );
			$state->leftContext .=
				( $thisEscape->prefix ?? '' ) .
				$thisEscape->text .
				( $thisEscape->suffix ?? '' );
			if ( $thisEscape->greedy ) {
				// protect the left context: this will be matched greedily
				// by this chunk, so there's no chance that a subsequent
				// token will include this in its prefix.
				$safeLeft .= $state->leftContext;
				$state->leftContext = '';
			}
		}
		// right context should be empty here.
		return $safeLeft . $state->leftContext;
	}

	/**
	 * The wikitext string associated with this chunk.
	 * @var string
	 */
	public $text;

	/**
	 * The DOM Node associated with this chunk.
	 * @var Node
	 */
	public $node;

	/**
	 * The prefix string to add if the start of the chunk doesn't match its
	 * constraints.
	 * @var ?string
	 */
	public $prefix;

	/**
	 * The suffix string to add if the end of the chunk doesn't match its
	 * constraints.
	 * @var ?string
	 */
	public $suffix;

	/**
	 * Does this chunk come from selser?
	 * @var bool
	 */
	public $selser;

	/**
	 * Suppress separators?
	 * @var bool
	 */
	public $noSep;

	/**
	 * Build a chunk from its wikitext and owning node. The optional
	 * `prefix`/`suffix` entries default to null; the `selser` and `noSep`
	 * flags always start out false.
	 *
	 * @param array{text:string,node:Node,prefix?:string,suffix?:string} $args Options.
	 */
	public function __construct( array $args ) {
		$this->text = $args['text'];
		$this->node = $args['node'];
		$this->prefix = $args['prefix'] ?? null;
		$this->suffix = $args['suffix'] ?? null;
		$this->selser = false;
		$this->noSep = false;
	}

	/**
	 * Ensure that the argument `o`, which is perhaps a string, is an instance of
	 * `ConstrainedText`.
	 *
	 * An existing `ConstrainedText` is returned as-is (`$node` is ignored);
	 * anything else is wrapped in a new plain chunk, with null mapped to ''.
	 *
	 * @param string|ConstrainedText $o
	 * @param Node $node
	 *  The {@link Node} corresponding to `o`.
	 * @return ConstrainedText
	 */
	public static function cast( $o, Node $node ): ConstrainedText {
		if ( $o instanceof ConstrainedText ) {
			return $o;
		}
		return new ConstrainedText( [ 'text' => $o ?? '', 'node' => $node ] );
	}

	/**
	 * Use the provided `state`, which gives context and access to the entire
	 * list of chunks, to determine the proper escape prefix/suffix.
	 * Returns an object with a `text` property as well as optional
	 * `prefix` and `suffix` properties giving desired escape strings.
	 *
	 * @param State $state Context state
	 * @return Result
	 */
	public function escape( State $state ): Result {
		// default implementation: no escaping; the chunk's own prefix and
		// suffix are passed through untouched.
		return new Result( $this->text, $this->prefix, $this->suffix );
	}

	/**
	 * Simple equality. This enforces type equality
	 * (ie subclasses are not equal).
	 *
	 * Two distinct objects are equal only when both are exactly
	 * `ConstrainedText` (not a subclass) and their `text` is identical;
	 * `node`, `prefix` and `suffix` are not compared.
	 *
	 * @param ConstrainedText $ct
	 * @return bool
	 */
	public function equals( ConstrainedText $ct ): bool {
		return $this === $ct || (
			get_class( $this ) === self::class &&
			get_class( $ct ) === self::class &&
			$this->text === $ct->text
		);
	}

	/**
	 * Useful shortcut: execute a regular expression on the raw wikitext.
	 *
	 * @param string $re
	 * @param Env $env Used to log the PCRE failure before throwing.
	 * @return array|null
	 *  An array containing the matched results or null if there were no matches.
	 * @throws \Error If preg_match() itself fails (returns false).
	 */
	public function matches( string $re, Env $env ): ?array {
		$r = preg_match( $re, $this->text, $m );
		if ( $r === false ) {
			// preg_last_error_msg() is available from PHP 8.0.0 onwards.
			// Probe for the function rather than comparing version strings,
			// so that 8.0.0 itself also gets the readable message.
			if ( function_exists( 'preg_last_error_msg' ) ) {
				$error_msg = preg_last_error_msg();
			} else {
				$error_msg = "preg_last_error: " . preg_last_error();
			}
			$env->log( 'error', $error_msg, $re, $this->text );
			throw new \Error( 'Bad regular expression' );
		}
		return $r === 0 ? null : $m;
	}

	/**
	 * SelSer support: when we come across an unmodified node during
	 * selective serialization, we know we can use the original wikitext
	 * for that node unmodified. *But* there may be boundary conditions
	 * on the left and right sides of the selser'ed text which are going
	 * to require escaping.
	 *
	 * So rather than turning the node into a plain old `ConstrainedText`
	 * chunk, allow subclasses of `ConstrainedText` to register as potential
	 * handlers of selser nodes. A selser'ed magic link, for example,
	 * will then turn into a `MagicLinkText` and thus be able to enforce
	 * the proper boundary constraints.
	 *
	 * @param string $text
	 * @param Element $node
	 * @param DataParsoid $dataParsoid
	 * @param Env $env
	 * @param array $opts Recognised keys (read by the base implementation):
	 *  `ignorePrefix` and `ignoreSuffix`, both defaulting to false.
	 * @return ConstrainedText[] Chunks, each flagged with `selser = true`.
	 */
	public static function fromSelSer(
		string $text, Element $node, DataParsoid $dataParsoid,
		Env $env, array $opts = []
	): array {
		// Main dispatch point: iterate through registered subclasses, asking
		// each if they can handle this node (by invoking `fromSelSerImpl`).
		// We define parent types before subtypes, so search the list backwards
		// to be sure we check subtypes before parent types.
		$types = self::$types;
		for ( $i = count( $types ) - 1; $i >= 0; $i-- ) {
			$ct = call_user_func(
				[ $types[$i], 'fromSelSerImpl' ],
				$text, $node, $dataParsoid, $env, $opts
			);
			if ( !$ct ) {
				// This type declined the node; try the next candidate.
				continue;
			}
			if ( !is_array( $ct ) ) {
				$ct = [ $ct ];
			}
			// tag these chunks as coming from selser
			foreach ( $ct as $t ) {
				$t->selser = true;
			}
			return $ct;
		}
		// ConstrainedText::fromSelSerImpl should handle everything which reaches it
		// so nothing should make it here.
		throw new \Error( 'Should never happen.' );
	}

	/**
	 * Base case: the given node type does not correspond to a special
	 * `ConstrainedText` subclass. We still have to be careful: the leftmost
	 * (rightmost) children of `node` may still be exposed to our left (right)
	 * context. If so (ie, their DSR bounds coincide) split the selser text
	 * and emit multiple `ConstrainedText` chunks to preserve the proper
	 * boundary conditions.
	 *
	 * @param string $text
	 * @param Element $node
	 * @param DataParsoid $dataParsoid
	 * @param Env $env
	 * @param array $opts
	 * @return ConstrainedText|ConstrainedText[]
	 */
	protected static function fromSelSerImpl(
		string $text, Element $node, DataParsoid $dataParsoid,
		Env $env, array $opts
	) {
		// look at leftmost and rightmost children, it may be that we need
		// to turn these into ConstrainedText chunks in order to preserve
		// the proper escape conditions on the prefix/suffix text.
		$firstChild = DiffDOMUtils::firstNonDeletedChild( $node );
		$lastChild = DiffDOMUtils::lastNonDeletedChild( $node );
		$firstChildDp = $firstChild instanceof Element ?
			DOMDataUtils::getDataParsoid( $firstChild ) : null;
		$lastChildDp = $lastChild instanceof Element ?
			DOMDataUtils::getDataParsoid( $lastChild ) : null;
		$prefixChunks = [];
		$suffixChunks = [];
		$ignorePrefix = $opts['ignorePrefix'] ?? false;
		$ignoreSuffix = $opts['ignoreSuffix'] ?? false;

		// check to see if first child's DSR start is the same as this node's
		// DSR start. If so, the first child is exposed to the (modified)
		// left-hand context, and so recursively convert it to the proper
		// list of specialized chunks.
		if (
			!$ignorePrefix &&
			$firstChildDp && Utils::isValidDSR( $firstChildDp->dsr ?? null ) &&
			$dataParsoid->dsr->start === $firstChildDp->dsr->start
		) {
			DOMUtils::assertElt( $firstChild ); // implied by $firstChildDp
			$len = self::usableChildLength( $firstChild, $firstChildDp, $text, $env );
			if ( $len !== null ) {
				$prefixChunks = self::fromSelSer(
					substr( $text, 0, $len ), $firstChild, $firstChildDp, $env,
					// this child node's right context will be protected:
					[ 'ignoreSuffix' => true ]
				);
				$text = substr( $text, $len );
			}
		}

		// check to see if last child's DSR end is the same as this node's
		// DSR end. If so, the last child is exposed to the (modified)
		// right-hand context, and so recursively convert it to the proper
		// list of specialized chunks.
		if (
			!$ignoreSuffix && $lastChild !== $firstChild &&
			$lastChildDp && Utils::isValidDSR( $lastChildDp->dsr ?? null ) &&
			$dataParsoid->dsr->end === $lastChildDp->dsr->end
		) {
			DOMUtils::assertElt( $lastChild ); // implied by $lastChildDp
			// Note: validated against what is left of `$text` after the
			// prefix (if any) has been removed above.
			$len = self::usableChildLength( $lastChild, $lastChildDp, $text, $env );
			if ( $len !== null ) {
				[ $head, $tail ] = self::splitTail( $text, $len );
				$suffixChunks = self::fromSelSer(
					$tail, $lastChild, $lastChildDp, $env,
					// this child node's left context will be protected:
					[ 'ignorePrefix' => true ]
				);
				$text = $head;
			}
		}

		// glue together prefixChunks, whatever's left of `text`, and suffixChunks
		$chunks = [ self::cast( $text, $node ) ];
		$chunks = array_merge( $prefixChunks, $chunks, $suffixChunks );

		// top-level chunks only:
		if ( !( $ignorePrefix || $ignoreSuffix ) ) {
			// ensure that the first chunk belongs to `node` in order to
			// emit separators correctly before `node`
			if ( $chunks[0]->node !== $node ) {
				array_unshift( $chunks, self::cast( '', $node ) );
			}
			// set 'noSep' flag on all but the first chunk, so we don't get
			// extra separators from `SSP.emitChunk`
			foreach ( $chunks as $i => $t ) {
				if ( $i > 0 ) {
					$t->noSep = true;
				}
			}
		}
		return $chunks;
	}

	/**
	 * Work out how many bytes of `$text` a child node accounts for, based on
	 * the child's DSR, guarding against inconsistent DSR values (T254412).
	 *
	 * A "Bad DSR" error is logged when the DSR length is negative or longer
	 * than the available text.
	 *
	 * @param Element $child The child node (used for the log message only).
	 * @param DataParsoid $childDp The child's data-parsoid; its `dsr` must
	 *  already have been checked by the caller.
	 * @param string $text The selser text still available for splitting.
	 * @param Env $env Used for logging.
	 * @return ?int Null if the length is negative (the child must then be
	 *  left alone); otherwise the length, clamped to `strlen( $text )`.
	 */
	private static function usableChildLength(
		Element $child, DataParsoid $childDp, string $text, Env $env
	): ?int {
		$len = $childDp->dsr->length();
		$available = strlen( $text );
		if ( $len >= 0 && $len <= $available ) {
			return $len;
		}
		// T254412: Bad DSR
		$env->log( "error/html2wt/dsr",
			"Bad DSR: " . PHPUtils::jsonEncode( $childDp->dsr ),
			"Node: " . DOMCompat::getOuterHTML( $child ) );
		return $len < 0 ? null : $available;
	}

	/**
	 * Split `$text` into everything before its last `$len` bytes, and those
	 * last `$len` bytes.
	 *
	 * This uses an explicit offset rather than `substr( $text, -$len )`:
	 * since `-0 === 0`, the negative-offset form returns the *whole* string
	 * (instead of the empty string) as the tail when `$len` is 0.
	 *
	 * @param string $text
	 * @param int $len Number of trailing bytes; expected to be within
	 *  `0..strlen( $text )`.
	 * @return string[] Two elements: `[ $head, $tail ]`.
	 */
	private static function splitTail( string $text, int $len ): array {
		$offset = strlen( $text ) - $len;
		return [ substr( $text, 0, $offset ), substr( $text, $offset ) ];
	}

	/**
	 * List of types we attempt `fromSelSer` with. This should include all the
	 * concrete subclasses of `ConstrainedText` (`RegExpConstrainedText` is
	 * missing since it is an abstract class). We also include the
	 * `ConstrainedText` class itself as the first element (even though it
	 * is the generic base class) as a little bit of a hack: it simplifies
	 * `ConstrainedText::fromSelSer` by factoring some of its work into
	 * `ConstrainedText::fromSelSerImpl`, which acts as the catch-all.
	 * The list is searched from last to first.
	 * @var class-string[]
	 */
	private static $types = [
		// Base class is first, as a special case
		self::class,
		// All concrete subclasses of ConstrainedText
		WikiLinkText::class, ExtLinkText::class, AutoURLLinkText::class,
		MagicLinkText::class, LanguageVariantText::class
	];
}

File Metadata

Mime Type
text/x-php
Expires
Sat, May 16, 17:26 (10 h, 19 m)
Storage Engine
local-disk
Storage Format
Raw Data
Storage Handle
34/9d/b9f512488c7e1f8ee97532a7c7b7
Default Alt Text
ConstrainedText.php (12 KB)

Event Timeline