Page Menu
Home
WickedGov Phorge
Search
Configure Global Search
Log In
Files
F1429625
HtmlFormatter.php
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Flag For Later
Award Token
Size
12 KB
Referenced Files
None
Subscribers
None
HtmlFormatter.php
View Options
<?php
/**
* Performs transformations of HTML by wrapping around libxml2 and working
* around its countless bugs.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* http://www.gnu.org/copyleft/gpl.html
*
* @file
*/
namespace
HtmlFormatter
;
use
DOMDocument
;
use
DOMElement
;
use
DOMNodeList
;
use
DOMXPath
;
use
InvalidArgumentException
;
/**
 * Wraps a chunk of HTML in a DOMDocument and offers helpers to remove elements
 * (by tag, id or class), strip comments and flatten tags before turning the
 * result back into an HTML string.
 */
class HtmlFormatter
{
    /**
     * Whether a libxml_disable_entity_loader() call is needed.
     * True when the linked libxml2 is older than 2.9.0.
     */
    private const DISABLE_LOADER = LIBXML_VERSION < 20900;

    /**
     * Parsed document; created lazily by getDoc().
     * @var ?DOMDocument
     */
    private ?DOMDocument $doc = null;

    /**
     * The HTML handed to the constructor, untouched.
     * @var string
     */
    private string $html;

    /**
     * Selectors registered through remove().
     * @var string[]
     */
    private array $itemsToRemove = [];

    /**
     * Tag names / regex fragments registered through flatten().
     * @var string[]
     */
    private array $elementsToFlatten = [];

    /**
     * When true, img/audio/video tags are added to the removal list.
     * @var bool
     */
    protected bool $removeMedia = false;

    /**
     * When true, getText() strips HTML comments.
     * @var bool
     */
    protected bool $removeComments = false;

    /**
     * @param string $html Text to process
     */
    public function __construct(string $html)
    {
        $this->html = $html;
    }

    /**
     * Turns a chunk of HTML into a proper document
     * @param string $html HTML to wrap
     * @return string
     */
    public static function wrapHTML(string $html): string
    {
        return '<!doctype html><html><head><meta charset="UTF-8"/></head><body>'
            . $html
            . '</body></html>';
    }

    /**
     * Override this in descendant class to modify HTML after it has been converted from DOM tree.
     * getText() calls it before comments are stripped and before tags are flattened.
     * @param string $html HTML to process
     * @return string Processed HTML
     */
    #[\ReturnTypeWillChange]
    protected function onHtmlReady(string $html): string
    {
        return $html;
    }

    /**
     * Returns the DOM for the HTML given to the constructor, parsing it on first use.
     * @return DOMDocument DOM to manipulate
     */
    #[\ReturnTypeWillChange]
    public function getDoc(): DOMDocument
    {
        if ($this->doc) {
            return $this->doc;
        }

        $html = $this->html;
        if (!str_starts_with($html, '<!doctype html>')) {
            // DOMDocument::loadHTML defaults to ASCII for partial html
            // Parse as full html with encoding
            $html = self::wrapHTML($html);
        }

        // Workaround for bug that caused spaces after references
        // to disappear during processing (T55086, T348402).
        // The lone space between two tags is turned into a character reference
        // so the replacement actually changes the input handed to the parser.
        $html = str_replace('> <', '>&#32;<', $html);

        \libxml_use_internal_errors(true);
        $loader = false;
        if (self::DISABLE_LOADER) {
            // @codeCoverageIgnoreStart
            $loader = \libxml_disable_entity_loader();
            // @codeCoverageIgnoreEnd
        }

        $this->doc = new DOMDocument();
        $this->doc->strictErrorChecking = false;
        $this->doc->loadHTML($html);

        if (self::DISABLE_LOADER) {
            // @codeCoverageIgnoreStart
            \libxml_disable_entity_loader($loader);
            // @codeCoverageIgnoreEnd
        }
        \libxml_use_internal_errors(false);

        return $this->doc;
    }

    /**
     * Sets whether comments should be removed from output
     * @param bool $flag Whether to remove or not
     */
    public function setRemoveComments(bool $flag = true): void
    {
        $this->removeComments = $flag;
    }

    /**
     * Sets whether images/videos/sounds should be removed from output
     * @param bool $flag Whether to remove or not
     */
    public function setRemoveMedia(bool $flag = true): void
    {
        $this->removeMedia = $flag;
    }

    /**
     * Adds one or more selector of content to remove. A subset of CSS selector
     * syntax is supported:
     *
     * <tag>
     * <tag>.class
     * .<class>
     * #<id>
     *
     * @param string[]|string $selectors Selector(s) of stuff to remove
     */
    public function remove($selectors): void
    {
        $this->itemsToRemove = array_merge($this->itemsToRemove, (array)$selectors);
    }

    /**
     * Adds one or more element name to the list to flatten (remove tag, but not its content)
     * Can accept non-delimited regexes
     *
     * Note this interface may fail in surprising unexpected ways due to usage of regexes,
     * so should not be relied on for HTML markup security measures.
     *
     * @param string[]|string $elements Name(s) of tag(s) to flatten
     */
    public function flatten($elements): void
    {
        $this->elementsToFlatten = array_merge($this->elementsToFlatten, (array)$elements);
    }

    /**
     * Instructs the formatter to flatten all tags, and remove comments
     */
    public function flattenAllTags(): void
    {
        $this->flatten('[?!]?[a-z0-9]+');
        $this->setRemoveComments(true);
    }

    /**
     * Removes content we've chosen to remove. The text of the removed elements can be
     * extracted with the getText method.
     *
     * The document is only parsed (via getDoc()) when at least one removal is registered.
     *
     * @return DOMElement[] Array of removed DOMElements
     */
    #[\ReturnTypeWillChange]
    public function filterContent(): array
    {
        $removals = $this->parseItemsToRemove();

        // Bail out early if nothing to do (every selector bucket is empty)
        if (!\array_filter($removals)) {
            return [];
        }

        $doc = $this->getDoc();

        // Remove tags
        // You can't remove DOMNodes from a DOMNodeList as you're iterating
        // over them in a foreach loop. It will seemingly leave the internal
        // iterator on the foreach out of wack and results will be quite
        // strange. Though, making a queue of items to remove seems to work.
        $queue = [];
        foreach ($removals['TAG'] as $tagToRemove) {
            foreach ($doc->getElementsByTagName($tagToRemove) as $tagNode) {
                if ($tagNode) {
                    $queue[] = $tagNode;
                }
            }
        }
        $removed = $this->removeElements($queue);

        // Elements with named IDs
        $queue = [];
        foreach ($removals['ID'] as $idToRemove) {
            $idNode = $doc->getElementById($idToRemove);
            if ($idNode) {
                $queue[] = $idNode;
            }
        }
        $removed = array_merge($removed, $this->removeElements($queue));

        // CSS Classes
        $queue = [];
        $xpath = new DOMXPath($doc);
        foreach ($removals['CLASS'] as $classToRemove) {
            // Use spaces to avoid matching for unrelated classnames (T231160)
            // https://stackoverflow.com/a/1604480/319266
            $elements = $xpath->query(
                '//*[contains(concat(" ", @class, " "), " ' . $classToRemove . ' ")]'
            );
            // Quote the class name so characters such as "/" or "+" are matched
            // literally instead of corrupting the pattern.
            $pattern = '/\b' . \preg_quote($classToRemove, '/') . '\b/';
            /** @var $element DOMElement */
            foreach ($elements as $element) {
                $classes = $element->getAttribute('class');
                if (\preg_match($pattern, $classes) && $element->parentNode) {
                    $queue[] = $element;
                }
            }
        }
        $removed = \array_merge($removed, $this->removeElements($queue));

        // Tags with CSS Classes.
        // NOTE(review): the XPath compares the whole class attribute, so an element
        // carrying additional classes is not matched - confirm this is intended.
        $tagClassRemoved = [];
        foreach ($removals['TAG_CLASS'] as $tagClassToRemove) {
            $parts = explode('.', $tagClassToRemove);
            $elements = $xpath->query('//' . $parts[0] . '[@class="' . $parts[1] . '"]');
            $tagClassRemoved[] = $this->removeElements($elements);
        }

        return array_merge(array_merge(...$tagClassRemoved), $removed);
    }

    /**
     * Removes a list of elements from DOMDocument
     * @param DOMElement[]|DOMNodeList $elements
     * @return DOMElement[] Array of removed elements
     */
    private function removeElements($elements): array
    {
        // Copy a DOMNodeList into a plain array first so that detaching nodes
        // does not disturb the iteration.
        $list = $elements instanceof DOMNodeList
            ? \iterator_to_array($elements, false)
            : $elements;

        /** @var $element DOMElement */
        foreach ($list as $element) {
            // Nodes already detached (e.g. a descendant of a removed node) are skipped
            if ($element->parentNode) {
                $element->parentNode->removeChild($element);
            }
        }

        return $list;
    }

    /**
     * Performs final transformations and returns resulting HTML. Note that if you want to call this
     * both without an element and with an element you should call it without an element first. If you
     * specify the $element in the method it'll change the underlying dom and you won't be able to get
     * it back.
     *
     * If the document has not been parsed yet (getDoc() was never called), the HTML
     * passed to the constructor is used as-is and $element is ignored.
     *
     * @param DOMElement|string|null $element ID of element to get HTML from or
     *   null to get it from the whole tree
     * @return string Processed HTML
     */
    #[\ReturnTypeWillChange]
    public function getText($element = null): string
    {
        if ($this->doc) {
            if ($element !== null && !($element instanceof DOMElement)) {
                $element = $this->doc->getElementById($element);
            }
            if (!$element) {
                $element = $this->doc->getElementsByTagName('body')->item(0);
            }
            $html = $this->doc->saveHTML($element);
            if (PHP_EOL === "\r\n") {
                // Cleanup for CRLF mis-processing of unknown origin on Windows.
                // Only the carriage-return entity is dropped; ordinary spaces stay.
                $html = str_replace('&#13;', '', $html);
            }
        } else {
            $html = $this->html;
        }

        // Remove stuff added by wrapHTML()
        $html = self::removeBeforeIncluding($html, '<body>');
        $html = self::removeAfterIncluding($html, '</body>');

        $html = $this->onHtmlReady($html);

        if ($this->removeComments) {
            $html = self::removeBetweenIncluding($html, '<!--', '-->');
        }

        if ($this->elementsToFlatten) {
            $elements = \implode('|', $this->elementsToFlatten);
            $html = \preg_replace("#</?(?:$elements)\\b[^>]*>#is", '', $html);
        }

        return $html;
    }

    /**
     * Removes everything from the beginning of the string up to and including the
     * last occurrence of $needle. Returns $haystack unchanged when $needle is absent.
     *
     * Equivalent to the (greedy) regex /^.*<body>/s when $needle = '<body>'
     */
    public static function removeBeforeIncluding(string $haystack, string $needle): string
    {
        $pos = strrpos($haystack, $needle);
        if ($pos === false) {
            return $haystack;
        }

        return substr($haystack, $pos + strlen($needle));
    }

    /**
     * Removes everything from the first occurrence of $needle to the end of the string,
     * including $needle. Returns $haystack unchanged when $needle is absent.
     *
     * Equivalent to the regex /<\/body>.*$/s when $needle = '</body>'
     */
    public static function removeAfterIncluding(string $haystack, string $needle): string
    {
        $pos = strpos($haystack, $needle);
        if ($pos === false) {
            return $haystack;
        }

        return substr($haystack, 0, $pos);
    }

    /**
     * Removes everything between $open and $close, including $open and $close.
     * An $open without a following $close is left in place.
     */
    public static function removeBetweenIncluding(string $haystack, string $open, string $close): string
    {
        $pieces = [];
        $offset = 0;

        while (true) {
            $openPos = strpos($haystack, $open, $offset);
            // Strict comparison: offset 0 is a valid match and must not be read as "not found"
            if ($openPos === false) {
                break;
            }

            $closePos = strpos($haystack, $close, $openPos);
            if ($closePos === false) {
                break;
            }

            // Keep the text preceding the opener and continue after the closer
            $pieces[] = substr($haystack, $offset, $openPos - $offset);
            $offset = $closePos + strlen($close);
        }

        $pieces[] = substr($haystack, $offset);

        return implode('', $pieces);
    }

    /**
     * Helper function for parseItemsToRemove(). This function extracts the selector type
     * and the raw name of a selector from a CSS-style selector string and assigns those
     * values to parameters passed by reference. For example, if given '#toc' as the
     * $selector parameter, it will assign 'ID' as the $type and 'toc' as the $rawName.
     * @param string $selector CSS selector to parse
     * @param string &$type The type of selector (ID, CLASS, TAG_CLASS, or TAG)
     * @param string &$rawName The raw name of the selector
     * @return bool Whether the selector was successfully recognised
     * @throws InvalidArgumentException When the selector contains '[' or ']' and is
     *   not an id, class or tag.class selector
     */
    protected function parseSelector(string $selector, string &$type, string &$rawName): bool
    {
        $firstChar = substr($selector, 0, 1);

        if ($firstChar === '.') {
            $type = 'CLASS';
            $rawName = substr($selector, 1);
        } elseif ($firstChar === '#') {
            $type = 'ID';
            $rawName = substr($selector, 1);
        } elseif (strpos($selector, '.') > 0) {
            // A dot somewhere after the first character: tag.class
            $type = 'TAG_CLASS';
            $rawName = $selector;
        } elseif (!str_contains($selector, '[') && !str_contains($selector, ']')) {
            $type = 'TAG';
            $rawName = $selector;
        } else {
            throw new InvalidArgumentException(__METHOD__ . "(): unrecognized selector '$selector'");
        }

        return true;
    }

    /**
     * Transforms CSS-style selectors into an internal representation suitable for
     * processing by filterContent()
     * @return array Map with the keys ID, TAG, CLASS and TAG_CLASS, each holding a list of raw names
     */
    #[\ReturnTypeWillChange]
    protected function parseItemsToRemove(): array
    {
        $removals = [
            'ID' => [],
            'TAG' => [],
            'CLASS' => [],
            'TAG_CLASS' => [],
        ];

        foreach ($this->itemsToRemove as $itemToRemove) {
            $type = '';
            $rawName = '';
            if ($this->parseSelector($itemToRemove, $type, $rawName)) {
                $removals[$type][] = $rawName;
            }
        }

        if ($this->removeMedia) {
            $removals['TAG'][] = 'img';
            $removals['TAG'][] = 'audio';
            $removals['TAG'][] = 'video';
        }

        return $removals;
    }
}
File Metadata
Details
Attached
Mime Type
text/x-php
Expires
Sat, May 16, 17:50 (8 h, 47 m)
Storage Engine
local-disk
Storage Format
Raw Data
Storage Handle
3f/ff/baaabb9060cdaec245324b7da359
Default Alt Text
HtmlFormatter.php (12 KB)
Attached To
Mode
rMWPROD MediaWiki Production
Attached
Detach File
Event Timeline
Log In to Comment