WikiTextStructure.php
No OneTemporary
Actions

Size

7 KB

Referenced Files

None

Subscribers

None

WikiTextStructure.php
View Options

	<?php

	namespace MediaWiki\Content;

	use HtmlFormatter\HtmlFormatter;
	use MediaWiki\Parser\ParserOutput;
	use MediaWiki\Parser\Sanitizer;

	/**
	 * Class allowing to explore the structure of parsed wikitext.
	 *
	 * Wraps a ParserOutput and exposes the parts of it that search cares about:
	 * headings, opening text, main text, auxiliary text and the default sort key.
	 * The text parts are extracted lazily on first access and then cached on the
	 * instance.
	 */
	class WikiTextStructure {

		/** Text before the first heading; null when there is none or nothing was extracted. */
		private ?string $openingText = null;
		/** Main text without excluded/auxiliary elements; null until extraction has run. */
		private ?string $allText = null;
		/** @var string[] Plain text of each auxiliary element removed from the main text */
		private array $auxText = [];
		private ParserOutput $parserOutput;

		/**
		 * Selectors to elements that are excluded entirely from search
		 */
		private const EXCLUDED_ELEMENT_SELECTORS = [
			// "it looks like you don't have javascript enabled..." – do not need to index
			'audio', 'video',
			// CSS stylesheets aren't content
			'style',
			// The [1] for references from Cite
			'sup.reference',
			// The ↑ next to references in the references section from Cite
			'.mw-cite-backlink',
			// Headings are already indexed in their own field.
			'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
			// Collapsed fields are hidden by default, so we don't want them showing up.
			'.autocollapse',
			// Content explicitly decided to be not searchable by editors such
			// as custom navigation templates.
			'.navigation-not-searchable',
			// User-facing interface code prompting the user to act from WikibaseMediaInfo
			'.wbmi-entityview-emptyCaption',
		];

		/**
		 * Selectors to elements that are considered auxiliary to the article text for search
		 */
		private const AUXILIARY_ELEMENT_SELECTORS = [
			// Thumbnail captions aren't really part of the text proper
			'.thumbcaption',
			'figcaption',
			// Neither are tables
			'table',
			// Common style for "See also:".
			'.rellink',
			// Common style for calling out helpful links at the top of the article.
			'.dablink',
			// New class users can use to mark stuff as auxiliary to searches.
			'.searchaux',
		];

		/**
		 * @param ParserOutput $parserOutput Parsed page whose structure is explored
		 */
		public function __construct( ParserOutput $parserOutput ) {
			$this->parserOutput = $parserOutput;
		}

		/**
		 * Gets headings from the page.
		 *
		 * Each heading line from the table-of-contents data is cleaned of reference
		 * markers and HTML tags. Headings listed in the 'search-ignored-headings'
		 * message are left out. The heading level is not taken into account.
		 *
		 * @return string[] Empty when the parser output has no TOC data
		 */
		public function headings() {
			$headings = [];
			$tocData = $this->parserOutput->getTOCData();
			if ( $tocData === null ) {
				return $headings;
			}

			$ignoredHeadings = $this->getIgnoredHeadings();
			foreach ( $tocData->getSections() as $section ) {
				$heading = self::stripReferenceMarkers( (string)$section->line );

				// Strip tags from the heading or else we'll display them (escaped) in search results
				$heading = trim( Sanitizer::stripAllTags( $heading ) );

				// Note that we don't take the level of the heading into account - all headings
				// are equal. Except the ones we ignore. Strict comparison keeps numeric-looking
				// headings ("010" vs "10") from being treated as the same heading.
				if ( !in_array( $heading, $ignoredHeadings, true ) ) {
					$headings[] = $heading;
				}
			}

			return $headings;
		}

		/**
		 * Remove things that look like references from a heading's HTML.
		 *
		 * We can't use HTML filtering because the references come back as <sup> tags
		 * without a class. To keep from breaking stuff like
		 * ==Applicability of the strict mass–energy equivalence formula, ''E'' = ''mc''<sup>2</sup>==
		 * we don't remove the whole <sup> tag.
		 *
		 * We also don't want to strip the <sup> tag and remove everything that looks like [2]
		 * because there might be, say, a band named "Word [2] Foo" or something.
		 *
		 * So we only strip things that look like <sup> tags wrapping a reference. And since the
		 * data looks like:
		 *      Reference in heading <sup>[1]</sup><sup>[2]</sup>
		 * we can not really use HtmlFormatter as we have no suitable selector.
		 *
		 * @param string $heading Heading HTML
		 *
		 * @return string Heading HTML without reference markers
		 */
		private static function stripReferenceMarkers( string $heading ): string {
			// Some wikis wrap the brackets in a span:
			// https://en.wikipedia.org/wiki/MediaWiki:Cite_reference_link
			// preg_replace() returns null on a PCRE error; keep the input in that case.
			$heading = preg_replace( '/<\/?span>/', '', $heading ) ?? $heading;

			// Brackets may arrive entity-encoded; decode them so the pattern below can match.
			$heading = str_replace( [ '&#91;', '&#93;' ], [ '[', ']' ], $heading );

			// Whitespace around the brackets and the number is optional.
			return preg_replace( '/<sup>\s*\[\s*\d+\s*\]\s*<\/sup>/i', '', $heading ) ?? $heading;
		}

		/**
		 * Parse a message content into an array. This function is generally used to
		 * parse settings stored as i18n messages (see search-ignored-headings).
		 *
		 * One setting per line; everything from a '#' to the end of the line is a
		 * comment. Surrounding whitespace is trimmed and empty lines are dropped.
		 * The keys of the returned array are the original line indexes, so they may
		 * have gaps.
		 *
		 * @param string $message
		 *
		 * @return string[]
		 */
		public static function parseSettingsInMessage( $message ) {
			$lines = explode( "\n", $message );
			// Remove comments
			$lines = preg_replace( '/#.*$/', '', $lines );
			// Remove extra spaces
			$lines = array_map( 'trim', $lines );

			// Remove empty lines
			return array_filter( $lines );
		}

		/**
		 * Gets a list of heading to ignore.
		 *
		 * The list comes from the 'search-ignored-headings' message in the content
		 * language and is kept in a static variable after the first lookup.
		 *
		 * @return string[]
		 */
		private function getIgnoredHeadings(): array {
			static $ignoredHeadings = null;
			if ( $ignoredHeadings === null ) {
				$source = wfMessage( 'search-ignored-headings' )->inContentLanguage();
				// A disabled message means nothing is ignored; otherwise every
				// remaining line of the message is a heading.
				$ignoredHeadings = $source->isDisabled()
					? []
					: self::parseSettingsInMessage( $source->plain() );
			}

			return $ignoredHeadings;
		}

		/**
		 * Extract parts of the text - opening, main and auxiliary.
		 *
		 * Runs at most once per instance: a non-null $allText marks the work as done.
		 */
		private function extractWikitextParts(): void {
			if ( $this->allText !== null ) {
				return;
			}
			$text = $this->parserOutput->getRawText();
			if ( $text === '' ) {
				// empty text - nothing to seek here
				$this->allText = '';

				return;
			}

			$this->openingText = $this->extractTextBeforeFirstHeading( $text );

			$formatter = new HtmlFormatter( $text );

			// Strip elements from the page that we never want in the search text.
			$formatter->remove( self::EXCLUDED_ELEMENT_SELECTORS );
			$formatter->filterContent();

			// Strip elements from the page that are auxiliary text. These will still be
			// searched, but matches will be ranked lower and non-auxiliary matches will be
			// preferred in highlighting.
			$formatter->remove( self::AUXILIARY_ELEMENT_SELECTORS );
			$auxiliaryElements = $formatter->filterContent();

			$this->allText = trim( Sanitizer::stripAllTags( $formatter->getText() ) );
			foreach ( $auxiliaryElements as $auxiliaryElement ) {
				$this->auxText[] =
					trim( Sanitizer::stripAllTags( $formatter->getText( $auxiliaryElement ) ) );
			}
		}

		/**
		 * Get text before first heading.
		 *
		 * @param string $text HTML of the page
		 *
		 * @return string|null Plain text, or null when there is no heading or no
		 *  text is left before it
		 */
		private function extractTextBeforeFirstHeading( $text ): ?string {
			$matches = [];
			if ( !preg_match( '/<h[123456]\b/', $text, $matches, PREG_OFFSET_CAPTURE ) ) {
				// There isn't a first heading, so we interpret this as the article
				// being entirely without heading.
				return null;
			}

			// $matches[0][1] is the byte offset of the first heading tag.
			$leadHtml = (string)substr( $text, 0, $matches[0][1] );
			if ( $leadHtml === '' ) {
				// There isn't any text before the first heading, so we declare there isn't
				// a first heading.
				return null;
			}

			$formatter = new HtmlFormatter( $leadHtml );
			$formatter->remove( self::EXCLUDED_ELEMENT_SELECTORS );
			$formatter->remove( self::AUXILIARY_ELEMENT_SELECTORS );
			$formatter->filterContent();
			$leadText = trim( Sanitizer::stripAllTags( $formatter->getText() ) );

			// Compare against '' rather than relying on truthiness, so that a lead
			// consisting of just "0" is not mistaken for an empty one.
			if ( $leadText === '' ) {
				// There isn't any text after filtering before the first heading, so we declare
				// that there isn't a first heading.
				return null;
			}

			return $leadText;
		}

		/**
		 * Get the text before the first heading.
		 *
		 * @return string|null
		 */
		public function getOpeningText() {
			$this->extractWikitextParts();

			return $this->openingText;
		}

		/**
		 * Get the page text with excluded and auxiliary elements removed.
		 *
		 * @return string
		 */
		public function getMainText() {
			$this->extractWikitextParts();

			return $this->allText;
		}

		/**
		 * Get the text of each auxiliary element (captions, tables, ...).
		 *
		 * @return string[]
		 */
		public function getAuxiliaryText() {
			$this->extractWikitextParts();

			return $this->auxText;
		}

		/**
		 * Get the "defaultsort" property
		 *
		 * @return string|null
		 */
		public function getDefaultSort() {
			$sort = $this->parserOutput->getPageProperty( 'defaultsort' );
			if ( $sort === false ) {
				// Normalize the false value to null for callers.
				return null;
			}

			return $sort;
		}
	}

	/**
	 * Also register the class under the global name 'WikiTextStructure'.
	 *
	 * @deprecated class alias since 1.43
	 */
	class_alias( WikiTextStructure::class, 'WikiTextStructure' );

File Metadata

Mime Type: text/x-php
Expires: Fri, Jul 3, 16:45 (11 h, 25 m)
Storage Engine: local-disk
Storage Format: Raw Data
Storage Handle: c1/b7/b2844e2540229e37d32db8d6d62f
Default Alt Text: WikiTextStructure.php (7 KB)

WikiTextStructure.phpNo OneTemporaryActions

WikiTextStructure.phpView Options

File Metadata

Event Timeline

WikiTextStructure.php
No OneTemporary
Actions

WikiTextStructure.php
View Options