Page MenuHomePhorge

compareLanguageConverterOutput.php
No OneTemporary

Size
9 KB
Referenced Files
None
Subscribers
None

compareLanguageConverterOutput.php

<?php
/**
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* http://www.gnu.org/copyleft/gpl.html
*
* @file
* @ingroup Maintenance
*/
use MediaWiki\Config\ServiceOptions;
use MediaWiki\Content\TextContent;
use MediaWiki\Language\Language;
use MediaWiki\Parser\ParserOptions;
use MediaWiki\Parser\ParserOutput;
use MediaWiki\Rest\Handler\Helper\PageRestHelperFactory;
use MediaWiki\Revision\SlotRecord;
use MediaWiki\Title\Title;
use MediaWiki\User\User;
use Wikimedia\Bcp47Code\Bcp47Code;
use Wikimedia\Diff\ArrayDiffFormatter;
use Wikimedia\Diff\ComplexityException;
use Wikimedia\Diff\Diff;
use Wikimedia\Stats\NullStatsdDataFactory;
use Wikimedia\Stats\StatsFactory;
// @codeCoverageIgnoreStart
// Load Maintenance.php from this script's own directory; the class below
// extends Maintenance (presumably declared by that file).
require_once __DIR__ . '/Maintenance.php';
// @codeCoverageIgnoreEnd
/**
* Maintenance script that compares variant conversion output between Parser and
* HtmlOutputRendererHelper.
*
* @ingroup Maintenance
*/
class CompareLanguageConverterOutput extends Maintenance {
/**
 * Registers the script description and its two required positional
 * arguments: the page title and the target variant code.
 */
public function __construct() {
	parent::__construct();
	$this->addDescription( 'Compares variant conversion output between Parser and HtmlOutputRendererHelper' );

	// Positional arguments, listed in the order they are registered.
	$requiredArgs = [
		'page-title' => 'Name of the page to be parsed and compared',
		'target-variant' => 'Target variant language code to transform the content to',
	];
	foreach ( $requiredArgs as $argName => $argDescription ) {
		$this->addArg( $argName, $argDescription, true );
	}
}
/**
 * Validates the command line arguments, renders the page through both
 * the Parser path and the HtmlOutputRendererHelper path, and prints a
 * comparison of the two results.
 *
 * @return bool Always true when the comparison has been printed
 */
public function execute() {
	$services = $this->getServiceContainer();

	$requestedTitle = $this->getArg( 'page-title' );
	$title = Title::newFromText( $requestedTitle );
	if ( !( $title && $title->exists() ) ) {
		$this->fatalError( "Title with name {$requestedTitle} not found" );
	}

	$variantCode = $this->getArg( 'target-variant' );
	if ( !$services->getLanguageNameUtils()->isValidBuiltInCode( $variantCode ) ) {
		$this->fatalError( "{$variantCode} is not a supported variant" );
	}
	$variantLanguage = $services->getLanguageFactory()->getLanguage( $variantCode );

	$scriptUser = User::newSystemUser( User::MAINTENANCE_SCRIPT_USER, [ 'steal' => true ] );

	// Render the same page through both pipelines.
	$legacyOutput = $this->getParserOutput( $title, $title->getPageLanguage(), $variantLanguage );
	$parsoidOutput = $this->getParsoidOutput( $title, $variantLanguage, $scriptUser );
	$converterLabel = $this->getConverterUsed( $parsoidOutput );

	$this->compareOutput(
		$legacyOutput->getContentHolderText(),
		$parsoidOutput->getText( [ 'deduplicateStyles' => false ] ),
		$converterLabel
	);

	return true;
}
/**
 * Builds a PageRestHelperFactory by hand from the service container,
 * using no-op stats collectors.
 *
 * @return PageRestHelperFactory
 */
private function newPageRestHelperFactory(): PageRestHelperFactory {
	$sc = $this->getServiceContainer();
	$factoryOptions = new ServiceOptions(
		PageRestHelperFactory::CONSTRUCTOR_OPTIONS,
		$sc->getMainConfig()
	);

	return new PageRestHelperFactory(
		$factoryOptions,
		$sc->getRevisionLookup(),
		$sc->getRevisionRenderer(),
		$sc->getTitleFormatter(),
		$sc->getPageStore(),
		$sc->getParsoidOutputStash(),
		// Statsd metrics are not collected by this script.
		new NullStatsdDataFactory(),
		$sc->getParserOutputAccess(),
		$sc->getParsoidSiteConfig(),
		$sc->getHtmlTransformFactory(),
		$sc->getContentHandlerFactory(),
		$sc->getLanguageFactory(),
		$sc->getRedirectStore(),
		$sc->getLanguageConverterFactory(),
		$sc->getTitleFactory(),
		$sc->getConnectionProvider(),
		$sc->getChangeTagsStore(),
		// Same for the newer stats API.
		StatsFactory::newNull()
	);
}
/**
 * Creates anonymous-user parser options targeting the given language,
 * with the content- and title-conversion "disable" switches set to false.
 *
 * @param Language $language Language to set as the parse target
 * @return ParserOptions
 */
private function getParserOptions( Language $language ): ParserOptions {
	$options = ParserOptions::newFromAnon();
	// Pass false explicitly so neither kind of conversion is switched off.
	$options->disableContentConversion( false );
	$options->disableTitleConversion( false );
	$options->setTargetLanguage( $language );

	return $options;
}
/**
 * Parses the current revision of the page with the Parser service and runs
 * the result through the default output pipeline.
 *
 * Side effect: overwrites the global $wgDefaultLanguageVariant with the
 * target variant code and does not restore it.
 *
 * @param Title $pageTitle Page to parse
 * @param Language $baseLanguage Page language of $pageTitle
 * @param Language $targetVariant Variant the Parser should convert to
 * @return ParserOutput
 */
private function getParserOutput(
	Title $pageTitle,
	Language $baseLanguage,
	Language $targetVariant
): ParserOutput {
	// We update the default language variant because we want Parser to
	// perform variant conversion to it.
	global $wgDefaultLanguageVariant;
	$wgDefaultLanguageVariant = $targetVariant->getCode();

	$mwInstance = $this->getServiceContainer();
	$languageFactory = $mwInstance->getLanguageFactory();
	$parser = $mwInstance->getParser();

	// getParentLanguage() is nullable (no parent when the language has no
	// variants); getParserOptions() requires a Language, so fall back to the
	// page language itself instead of failing with a TypeError.
	$parserLanguage = $languageFactory->getParentLanguage( $baseLanguage ) ?? $baseLanguage;
	$parserOptions = $this->getParserOptions( $parserLanguage );

	$revision = $mwInstance->getRevisionLookup()->getRevisionByTitle( $pageTitle );
	if ( $revision === null ) {
		// The lookup can come back empty even for a title reported as existing;
		// stop with a readable message rather than a null dereference.
		$this->fatalError( 'No revision found for ' . $pageTitle->getPrefixedText() );
	}

	// Non-text (or inaccessible) main-slot content is compared as empty wikitext.
	$content = $revision->getContent( SlotRecord::MAIN );
	$wikiContent = ( $content instanceof TextContent ) ? $content->getText() : '';

	$po = $parser->parse( $wikiContent, $pageTitle, $parserOptions );

	// TODO T371008 consider if using the Content framework makes sense instead of creating the pipeline
	$pipeline = $mwInstance->getDefaultOutputPipeline();
	$options = [ 'deduplicateStyles' => false ];

	return $pipeline->run( $po, $parserOptions, $options );
}
/**
 * Renders the page through HtmlOutputRendererHelper with variant
 * conversion to the given language.
 *
 * @param Title $pageTitle Page to render
 * @param Bcp47Code $targetVariant Variant to convert the HTML to
 * @param User $user User on whose behalf the helper is created
 * @return ParserOutput
 */
private function getParsoidOutput(
	Title $pageTitle,
	Bcp47Code $targetVariant,
	User $user
): ParserOutput {
	// Request the 'view' flavor and do not stash the rendering.
	$helperParams = [
		'stash' => false,
		'flavor' => 'view',
	];

	$helper = $this->newPageRestHelperFactory()
		->newHtmlOutputRendererHelper( $pageTitle, $helperParams, $user );
	$helper->setVariantConversionLanguage( $targetVariant );

	return $helper->getHtml();
}
/**
 * Strips markup from the given text and splits the remainder into
 * white-space separated words.
 *
 * @param string $output HTML or plain text
 * @return string[] Non-empty words in document order; empty array if none
 */
private function getWords( string $output ): array {
	$text = trim( strip_tags( $output ) );

	// The 'u' modifier makes \s match Unicode white space too (for example the
	// ideographic space U+3000 or a no-break space), not only ASCII blanks.
	$words = preg_split( '/\s+/u', $text, -1, PREG_SPLIT_NO_EMPTY );
	if ( $words === false ) {
		// UTF-8 mode rejects malformed input; retry byte-wise instead of
		// returning false from a method declared to return an array.
		$words = preg_split( '/\s+/', $text, -1, PREG_SPLIT_NO_EMPTY );
	}

	return $words ?: [];
}
/**
 * Returns the text content of the <body> element of an HTML document or
 * fragment.
 *
 * @param string $output UTF-8 encoded HTML
 * @return string Text content of the body; the input itself if no body
 *  element could be found
 */
private function getBody( string $output ): string {
	if ( $output === '' ) {
		// DOMDocument::loadHTML() rejects empty input; there is no text anyway.
		return '';
	}

	$dom = new DOMDocument();

	// Collect libxml parse warnings internally instead of silencing them with
	// the @ operator, and restore the previous setting afterwards.
	$previousSetting = libxml_use_internal_errors( true );
	try {
		// Without an encoding declaration loadHTML() decodes the input as
		// ISO-8859-1, which garbles UTF-8 fragments. Declare UTF-8 up front.
		$dom->loadHTML(
			'<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">' . $output
		);
	} finally {
		libxml_clear_errors();
		libxml_use_internal_errors( $previousSetting );
	}

	$body = $dom->getElementsByTagName( 'body' )->item( 0 );
	if ( $body === null ) {
		// Body element not present
		return $output;
	}

	return $body->textContent;
}
/**
 * Prints word counts, a similarity summary and a word-level diff table
 * for the two renderings.
 *
 * The Parsoid text is reduced to its body text before being split into
 * words; the Parser text is split directly.
 *
 * @param string $parserText HTML produced through the Parser path
 * @param string $parsoidText HTML produced through HtmlOutputRendererHelper
 * @param string $converterUsed Label shown in the diff table header
 */
private function compareOutput(
	string $parserText,
	string $parsoidText,
	string $converterUsed
): void {
	$fromParsoid = $this->getWords( $this->getBody( $parsoidText ) );
	$fromParser = $this->getWords( $parserText );

	$this->output(
		'Word count: Parsoid: ' . count( $fromParsoid ) .
		'; Parser: ' . count( $fromParser ) . "\n"
	);

	$this->outputSimilarity( $fromParsoid, $fromParser );
	$this->output( "\n" );
	$this->outputDiff( $fromParsoid, $fromParser, $converterUsed );
}
/**
 * Reports which language converter produced the Parsoid output, based on
 * whether the raw text contains the core LanguageConverter marker string.
 *
 * @param ParserOutput $parsoidOutput
 * @return string 'Core LanguageConverter' or 'Parsoid LanguageConverter'
 */
private function getConverterUsed( ParserOutput $parsoidOutput ): string {
	$markerPosition = strpos(
		$parsoidOutput->getRawText(),
		'Variant conversion performed using the core LanguageConverter'
	);

	// strpos() returns 0 for a match at the very start of the text, so the
	// result must be compared against false rather than tested for truthiness.
	return $markerPosition !== false
		? 'Core LanguageConverter'
		: 'Parsoid LanguageConverter';
}
/**
 * sprintf() wrapper that keeps "%Ns" / "%-Ns" columns aligned for multibyte
 * text.
 *
 * sprintf() pads by bytes. Each width is therefore enlarged by the
 * difference between the byte length and the display width of the matching
 * argument. Display width (mb_strwidth) is used rather than character
 * count so that double-width characters such as CJK ideographs, which the
 * caller trims with mb_strimwidth(), still line up.
 *
 * Only width-carrying string conversions are adjusted, and arguments are
 * matched to them in order; the format is expected to consist of such
 * conversions only.
 *
 * Inspired from: https://stackoverflow.com/a/55927237/903324
 *
 * @param string $format sprintf() format string
 * @param mixed ...$args Values for the conversions in $format
 * @return string
 */
private function mb_sprintf( string $format, ...$args ): string {
	$pending = $args;

	$byteAwareFormat = preg_replace_callback(
		'/(?<=%|%-)\d+(?=s)/',
		static function ( array $matches ) use ( &$pending ): string {
			$value = (string)array_shift( $pending );
			// Bytes that do not occupy a display column must be added to
			// the byte-based width understood by sprintf().
			$invisibleBytes = strlen( $value ) - mb_strwidth( $value, 'UTF-8' );
			return (string)( (int)$matches[0] + $invisibleBytes );
		},
		$format
	);

	return sprintf( $byteAwareFormat, ...$args );
}
/**
 * Prints the length of both word lists (joined with single spaces) and
 * their similarity as computed by similar_text().
 *
 * Note: strlen() and similar_text() both work on bytes, so the figures
 * reported as "characters" are byte counts for multibyte text.
 *
 * @param string[] $parsoidWords Words from the Parsoid rendering
 * @param string[] $parserWords Words from the Parser rendering
 */
private function outputSimilarity( array $parsoidWords, array $parserWords ): void {
	$parsoidText = implode( ' ', $parsoidWords );
	$parserText = implode( ' ', $parserWords );

	$this->output( sprintf(
		"Total characters: Parsoid: %d; Parser: %d\n",
		strlen( $parsoidText ),
		strlen( $parserText )
	) );

	// similar_text() writes the percentage into its third argument.
	$percent = 0;
	$matching = similar_text( $parsoidText, $parserText, $percent );
	$rounded = round( $percent, 2 );

	$this->output(
		'Similarity via similar_text(): ' . $rounded . '%; Similar characters: ' . $matching
	);
}
/**
 * Prints a table with one row per difference between the two word lists.
 *
 * @param string[] $parsoidWords Words from the Parsoid rendering ("old" side)
 * @param string[] $parserWords Words from the Parser rendering ("new" side)
 * @param string $converterUsed Label printed under the Parsoid column header
 */
private function outputDiff( array $parsoidWords, array $parserWords, string $converterUsed ): void {
	try {
		$diff = new Diff( $parsoidWords, $parserWords );
	} catch ( ComplexityException $e ) {
		$this->output( $e->getMessage() );
		// Without a diff there is nothing to print. Abort here instead of
		// continuing with an undefined $diff; fatalError() ends the script,
		// which execute() already relies on for its argument checks.
		$this->fatalError( 'Encountered ComplexityException while computing diff' );
	}

	$rowFormat = "| %5s | %-35s | %-35s | %-8s |\n";
	$separator = str_repeat( '-', 96 ) . "\n";

	// Header.
	$out = $separator;
	$out .= sprintf( $rowFormat, 'Line', 'Parsoid', 'Parser', 'Diff' );
	$out .= sprintf( $rowFormat, '', "($converterUsed)", '', '' );
	$out .= $separator;

	// Print the difference between the words
	$wordDiffFormat = ( new ArrayDiffFormatter() )->format( $diff );
	foreach ( $wordDiffFormat as $index => $wordDiff ) {
		$action = $wordDiff['action'];
		// One side is absent for pure additions or removals.
		$old = $wordDiff['old'] ?? null;
		$new = $wordDiff['new'] ?? null;

		$out .= $this->mb_sprintf(
			$rowFormat,
			str_pad( (string)( $index + 1 ), 5, ' ', STR_PAD_LEFT ),
			mb_strimwidth( $old ?? '- N/A -', 0, 35, '…' ),
			mb_strimwidth( $new ?? '- N/A -', 0, 35, '…' ),
			$action
		);
	}

	// Print the footer.
	$out .= $separator;
	$this->output( "\n" . $out );
}
}
// @codeCoverageIgnoreStart
// Name this class as the maintenance class, then include the file named by
// RUN_MAINTENANCE_IF_MAIN (a constant defined outside this file; presumably
// it runs the script only when this file is the entry point).
$maintClass = CompareLanguageConverterOutput::class;
require_once RUN_MAINTENANCE_IF_MAIN;
// @codeCoverageIgnoreEnd

File Metadata

Mime Type
text/x-php
Expires
Wed, Sep 10, 15:09 (16 h, 45 m)
Storage Engine
local-disk
Storage Format
Raw Data
Storage Handle
d3/45/3ba0660ca72ec4c005ee1f2cd691
Default Alt Text
compareLanguageConverterOutput.php (9 KB)

Event Timeline