Page Menu
Home
WickedGov Phorge
Search
Configure Global Search
Log In
Files
F2750566
WikiTextStructure.php
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Flag For Later
Award Token
Size
7 KB
Referenced Files
None
Subscribers
None
WikiTextStructure.php
View Options
<?php
namespace
MediaWiki\Content
;
use
HtmlFormatter\HtmlFormatter
;
use
MediaWiki\Parser\ParserOutput
;
use
MediaWiki\Parser\Sanitizer
;
/**
 * Class allowing to explore the structure of parsed wikitext.
 *
 * Wraps a ParserOutput and lazily derives the pieces of text used for search:
 * the headings, the opening text (text before the first heading), the main
 * text and the auxiliary text.
 */
class WikiTextStructure {

	/** Filtered text found before the first heading, or null if there is none. */
	private ?string $openingText = null;

	/** Main text of the page; stays null until extractWikitextParts() has run. */
	private ?string $allText = null;

	/** @var string[] Text of each auxiliary element removed from the main text. */
	private array $auxText = [];

	private ParserOutput $parserOutput;

	/**
	 * Selectors to elements that are excluded entirely from search
	 */
	private const EXCLUDED_ELEMENT_SELECTORS = [
		// "it looks like you don't have javascript enabled..." – do not need to index
		'audio',
		'video',
		// CSS stylesheets aren't content
		'style',
		// The [1] for references from Cite
		'sup.reference',
		// The ↑ next to references in the references section from Cite
		'.mw-cite-backlink',
		// Headings are already indexed in their own field.
		'h1',
		'h2',
		'h3',
		'h4',
		'h5',
		'h6',
		// Collapsed fields are hidden by default, so we don't want them showing up.
		'.autocollapse',
		// Content explicitly decided to be not searchable by editors such
		// as custom navigation templates.
		'.navigation-not-searchable',
		// User-facing interface code prompting the user to act from WikibaseMediaInfo
		'.wbmi-entityview-emptyCaption',
	];

	/**
	 * Selectors to elements that are considered auxiliary to the article text for search
	 */
	private const AUXILIARY_ELEMENT_SELECTORS = [
		// Thumbnail captions aren't really part of the text proper
		'.thumbcaption',
		'figcaption',
		// Neither are tables
		'table',
		// Common style for "See also:".
		'.rellink',
		// Common style for calling out helpful links at the top of the article.
		'.dablink',
		// New class users can use to mark stuff as auxiliary to searches.
		'.searchaux',
	];

	/**
	 * @param ParserOutput $parserOutput Parser output whose text and TOC data are explored.
	 */
	public function __construct( ParserOutput $parserOutput ) {
		$this->parserOutput = $parserOutput;
	}

	/**
	 * Gets headings from the page.
	 *
	 * Headings are read from the TOC data of the parser output, cleaned of
	 * reference markers and HTML tags, and those listed as ignored are dropped.
	 *
	 * @return string[]
	 */
	public function headings() {
		$tocData = $this->parserOutput->getTOCData();
		if ( $tocData === null ) {
			return [];
		}

		$ignoredHeadings = $this->getIgnoredHeadings();
		$headings = [];
		foreach ( $tocData->getSections() as $section ) {
			$heading = $section->line;

			// First strip out things that look like references. We can't use HTML filtering
			// because the references come back as <sup> tags without a class. To keep from
			// breaking stuff like
			// ==Applicability of the strict mass–energy equivalence formula, ''E'' = ''mc''<sup>2</sup>==
			// we don't remove the whole <sup> tag.
			//
			// We also don't want to strip the <sup> tag and remove everything that looks like
			// [2] because, I don't know, maybe there is a band named Word [2] Foo or something.
			//
			// So we only strip things that look like <sup> tags wrapping a reference. And
			// since the data looks like:
			//   Reference in heading <sup>[1]</sup><sup>[2]</sup>
			// we can not really use HtmlFormatter as we have no suitable selector.

			// Some wikis wrap the brackets in a span:
			// https://en.wikipedia.org/wiki/MediaWiki:Cite_reference_link
			$heading = preg_replace( '/<\/?span>/', '', $heading );
			// Normalize entity-encoded brackets to literal [] so the following regexp works.
			$heading = preg_replace( [ '/&#91;/', '/&#93;/' ], [ '[', ']' ], $heading );
			$heading = preg_replace( '/<sup>\s*\[\s*\d+\s*\]\s*<\/sup>/i', '', $heading );

			// Strip tags from the heading or else we'll display them (escaped) in search results
			$heading = trim( Sanitizer::stripAllTags( $heading ) );

			// Note that we don't take the level of the heading into account - all headings
			// are equal. Except the ones we ignore. The comparison is strict so that
			// numeric-looking headings (e.g. "1e3" vs "1000") are not treated as equal.
			if ( !in_array( $heading, $ignoredHeadings, true ) ) {
				$headings[] = $heading;
			}
		}

		return $headings;
	}

	/**
	 * Parse a message content into an array. This function is generally used to
	 * parse settings stored as i18n messages (see search-ignored-headings).
	 *
	 * Each line is one entry; everything from a "#" to the end of the line is a
	 * comment; surrounding whitespace is trimmed and empty lines are dropped.
	 * The original line indexes are kept as array keys.
	 *
	 * @param string $message
	 *
	 * @return string[]
	 */
	public static function parseSettingsInMessage( $message ) {
		$lines = explode( "\n", $message );
		// Remove comments
		$lines = preg_replace( '/#.*$/', '', $lines );
		// Remove extra spaces
		$lines = array_map( 'trim', $lines );
		// Remove empty lines. Compare against '' explicitly: a bare array_filter()
		// would also discard a line whose whole content is "0".
		return array_filter(
			$lines,
			static fn ( string $line ): bool => $line !== ''
		);
	}

	/**
	 * Gets a list of heading to ignore.
	 *
	 * The list comes from the 'search-ignored-headings' message in the content
	 * language and is cached in a static local after the first call.
	 *
	 * @return string[]
	 */
	private function getIgnoredHeadings(): array {
		static $ignoredHeadings = null;

		if ( $ignoredHeadings !== null ) {
			return $ignoredHeadings;
		}

		$ignoredHeadings = [];
		$source = wfMessage( 'search-ignored-headings' )->inContentLanguage();
		if ( !$source->isDisabled() ) {
			// After parsing, every remaining line is a heading to ignore.
			$ignoredHeadings = self::parseSettingsInMessage( $source->plain() );
		}

		return $ignoredHeadings;
	}

	/**
	 * Extract parts of the text - opening, main and auxiliary.
	 *
	 * Runs at most once per instance: a non-null $allText marks the work as done.
	 */
	private function extractWikitextParts(): void {
		if ( $this->allText !== null ) {
			return;
		}

		$text = $this->parserOutput->getRawText();
		if ( $text === '' ) {
			// empty text - nothing to seek here
			$this->allText = "";
			return;
		}

		$this->openingText = $this->extractTextBeforeFirstHeading( $text );

		$formatter = new HtmlFormatter( $text );

		// Strip elements from the page that we never want in the search text.
		$formatter->remove( self::EXCLUDED_ELEMENT_SELECTORS );
		$formatter->filterContent();

		// Strip elements from the page that are auxiliary text. These will still be
		// searched, but matches will be ranked lower and non-auxiliary matches will be
		// preferred in highlighting.
		$formatter->remove( self::AUXILIARY_ELEMENT_SELECTORS );
		$auxiliaryElements = $formatter->filterContent();

		$this->allText = trim( Sanitizer::stripAllTags( $formatter->getText() ) );
		foreach ( $auxiliaryElements as $auxiliaryElement ) {
			$this->auxText[] = trim(
				Sanitizer::stripAllTags( $formatter->getText( $auxiliaryElement ) )
			);
		}
	}

	/**
	 * Get text before first heading.
	 *
	 * @param string $text HTML to search for the first <h1>..<h6> tag.
	 *
	 * @return string|null Filtered, tag-free text before the first heading; null when
	 *  there is no heading or no text is left before it.
	 */
	private function extractTextBeforeFirstHeading( $text ): ?string {
		$matches = [];
		if ( !preg_match( '/<h[123456]\b/', $text, $matches, PREG_OFFSET_CAPTURE ) ) {
			// There isn't a first heading, so we interpret this as the article
			// being entirely without heading.
			return null;
		}

		// $matches[0][1] is the byte offset of the first heading tag.
		$text = substr( $text, 0, $matches[0][1] );
		if ( $text === '' ) {
			// There isn't any text before the first heading, so we declare there isn't
			// a first heading.
			return null;
		}

		$formatter = new HtmlFormatter( $text );
		$formatter->remove( self::EXCLUDED_ELEMENT_SELECTORS );
		$formatter->remove( self::AUXILIARY_ELEMENT_SELECTORS );
		$formatter->filterContent();
		$text = trim( Sanitizer::stripAllTags( $formatter->getText() ) );

		if ( $text === '' ) {
			// There isn't any text after filtering before the first heading, so we declare
			// that there isn't a first heading.
			return null;
		}

		return $text;
	}

	/**
	 * Get the filtered text that precedes the first heading.
	 *
	 * @return string|null
	 */
	public function getOpeningText() {
		$this->extractWikitextParts();
		return $this->openingText;
	}

	/**
	 * Get the page text with excluded and auxiliary elements removed and tags stripped.
	 *
	 * @return string
	 */
	public function getMainText() {
		$this->extractWikitextParts();
		return $this->allText;
	}

	/**
	 * Get the text of each auxiliary element that was removed from the main text.
	 *
	 * @return string[]
	 */
	public function getAuxiliaryText() {
		$this->extractWikitextParts();
		return $this->auxText;
	}

	/**
	 * Get the "defaultsort" property
	 *
	 * @return string|null
	 */
	public function getDefaultSort() {
		$sort = $this->parserOutput->getPageProperty( 'defaultsort' );
		// A false value is reported to callers as null.
		return $sort === false ? null : $sort;
	}
}
/**
 * Registers the un-namespaced name 'WikiTextStructure' as an alias of this class.
 *
 * @deprecated class alias since 1.43
 */
class_alias( WikiTextStructure::class, 'WikiTextStructure' );
File Metadata
Details
Attached
Mime Type
text/x-php
Expires
Fri, Jul 3, 16:45 (11 h, 25 m)
Storage Engine
local-disk
Storage Format
Raw Data
Storage Handle
c1/b7/b2844e2540229e37d32db8d6d62f
Default Alt Text
WikiTextStructure.php (7 KB)
Attached To
Mode
rMWPROD MediaWiki Production
Attached
Detach File
Event Timeline
Log In to Comment