Page Menu
Home
WickedGov Phorge
Search
Configure Global Search
Log In
Files
F1433121
Tokenizer.php
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Flag For Later
Award Token
Size
53 KB
Referenced Files
None
Subscribers
None
Tokenizer.php
View Options
<?php
namespace
Wikimedia\RemexHtml\Tokenizer
;
use
Wikimedia\RemexHtml\HTMLData
;
use
Wikimedia\RemexHtml\PropGuard
;
/**
* HTML 5 tokenizer
*
* Based on the W3C recommendation as published 01 November 2016:
* https://www.w3.org/TR/2016/REC-html51-20161101/
*/
class
Tokenizer
{
use
PropGuard
;
// Tokenizer states. These are the values accepted by the 'state' option of
// execute() and by switchState(), and dispatched on by executeInternal().
// STATE_START is the initial state; executeInternal() moves from it straight
// to STATE_DATA.
public const STATE_START = 1;
public const STATE_DATA = 2;
public const STATE_RCDATA = 3;
public const STATE_RAWTEXT = 4;
public const STATE_SCRIPT_DATA = 5;
public const STATE_PLAINTEXT = 6;
// Reaching STATE_EOF makes executeInternal() emit endDocument() and stop.
public const STATE_EOF = 7;
// NOTE(review): STATE_CURRENT is not handled by executeInternal() in the
// visible code; presumably a "keep the current state" marker - confirm usage.
public const STATE_CURRENT = 8;
// Match indices for the data state regex. Each value is the number of the
// capture group in the pattern built in dataState(); the numbering must be
// kept in sync with that pattern.
private const MD_END_TAG_OPEN = 1; // optional "/" of an end tag
private const MD_TAG_NAME = 2; // tag name
private const MD_TAG_AFTER_LOWERCASE = 3; // part of the name after the all-lowercase prefix
private const MD_COMMENT = 4; // comment match detector
private const MD_COMMENT_INNER = 5; // comment contents
private const MD_COMMENT_END = 6; // comment close (may be empty at EOF)
private const MD_DOCTYPE = 7; // whole DOCTYPE
private const MD_DT_NAME_WS = 8; // whitespace before the DOCTYPE name
private const MD_DT_NAME = 9; // DOCTYPE name
private const MD_DT_PUBLIC_WS = 10; // whitespace after PUBLIC
private const MD_DT_PUBLIC_DQ = 11; // double-quoted public identifier
private const MD_DT_PUBLIC_SQ = 12; // single-quoted public identifier
private const MD_DT_PUBSYS_WS = 13; // whitespace between public and system identifiers
private const MD_DT_PUBSYS_DQ = 14; // double-quoted system identifier after a public one
private const MD_DT_PUBSYS_SQ = 15; // single-quoted system identifier after a public one
private const MD_DT_SYSTEM_WS = 16; // whitespace after SYSTEM
private const MD_DT_SYSTEM_DQ = 17; // double-quoted system identifier
private const MD_DT_SYSTEM_SQ = 18; // single-quoted system identifier
private const MD_DT_BOGUS = 19; // bogus DOCTYPE remainder
private const MD_DT_END = 20; // closing ">" of the DOCTYPE (empty at EOF)
private const MD_CDATA = 21; // "![CDATA[" opener
private const MD_BOGUS_COMMENT = 22; // bogus comment contents
// Match indices for the character reference regex. Each value is the number
// of the capture group in CHARREF_REGEX, as consumed by handleCharRefs().
private const MC_PREFIX = 1; // literal text before the "&"
private const MC_DECIMAL = 2; // decimal digits of a numeric reference
private const MC_HEXDEC = 3; // hexadecimal digits of a numeric reference
private const MC_SEMICOLON = 4; // ";" terminating a numeric reference
private const MC_HASH = 5; // bare "#" with no digits after it
private const MC_NAMED = 6; // known named reference
private const MC_SUFFIX = 7; // attribute suffix character following a named reference
private const MC_INVALID = 8; // unknown name terminated by ";"
// Match indices for the attribute regex.
// NOTE(review): the attribute regex is not defined in this part of the file;
// these are presumably its capture group numbers - keep in sync with it.
private const MA_SLASH = 1;
private const MA_NAME = 2;
private const MA_SIMPLE_NAME = 3;
private const MA_DQUOTED = 4;
private const MA_DQUOTED_CHARREF = 5;
private const MA_DQUOTED_UNSIMPLE = 6;
private const MA_SQUOTED = 7;
private const MA_SQUOTED_CHARREF = 8;
private const MA_SQUOTED_UNSIMPLE = 9;
private const MA_UNQUOTED = 10;
private const MA_UNQUOTED_UNSIMPLE = 11;
// Characters, as raw UTF-8 byte sequences. Each literal must stay on a single
// line: a line break inside it would become part of the string and break the
// \x escape sequences.

/** UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER */
protected const REPLACEMENT_CHAR = "\xef\xbf\xbd";

/** UTF-8 encoding of U+FEFF, the byte order mark */
protected const BYTE_ORDER_MARK = "\xef\xbb\xbf";
/**
 * A list of "common well-behaved entities", used to optimize fast paths.
 *
 * Maps the literal entity text, including the leading "&" and trailing ";",
 * to its UTF-8 expansion. The set of names mirrors the alternation used by
 * the fast path in dataState() (amp, apos, lt, gt, quot, nbsp).
 *
 * @var array<string,string>
 */
private static $commonEntities = [
    '&amp;' => '&',
    '&apos;' => "'",
    '&lt;' => '<',
    '&gt;' => '>',
    '&quot;' => '"',
    '&nbsp;' => "\u{00A0}",
];
/** @var bool True to suppress error reporting (constructor option) */
protected $ignoreErrors;
/** @var bool True to leave character references unexpanded (constructor option) */
protected $ignoreCharRefs;
/** @var bool True to pass NULL bytes through unchanged (constructor option) */
protected $ignoreNulls;
/** @var bool True to skip input preprocessing (constructor option) */
protected $skipPreprocess;
/** @var bool The scripting flag; defaults to true in the constructor */
protected $scriptingFlag;
/** @var string|null The "appropriate end tag" set by execute() or switchState() */
protected $appropriateEndTag;
/** @var TokenHandler The object which receives token events */
protected $listener;
/** @var int|null One of the STATE_* constants; null until execute()/beginStepping() */
protected $state;
/** @var bool Initialised to false by the constructor; checked by preprocess() */
protected $preprocessed;
/** @var string The input text (line endings normalized by preprocess()) */
protected $text;
/** @var int Current byte offset within $text */
protected $pos;
/** @var int Byte length of $text */
protected $length;
/** @var callable|null Invoked with no arguments when "<![CDATA[" is seen */
protected $enableCdataCallback;
/** @var string|null Namespace of the fragment context element, if any */
protected $fragmentNamespace;
/** @var string|null Tag name of the fragment context element, if any */
protected $fragmentName;
/**
 * Create a tokenizer for the given input.
 *
 * @param TokenHandler $listener The object which receives token events
 * @param string $text The text to tokenize
 * @param array $options Associative array of options, including:
 *   - ignoreErrors: True to improve performance by ignoring errors. The
 *     token stream should still be the same, except that error() won't be
 *     called.
 *   - ignoreCharRefs: True to ignore character references. Character
 *     tokens will contain the unexpanded character references, and no
 *     errors related to invalid character references will be raised.
 *     Performance will be improved. This is not compliant behaviour.
 *   - ignoreNulls: True to ignore NULL bytes in the input stream, instead
 *     of raising errors and converting them to U+FFFD as is usually
 *     required by the spec.
 *   - skipPreprocess: True to skip the "preprocessing the input stream"
 *     stage, which normalizes line endings and raises errors on certain
 *     control characters. Advisable if the input stream is already
 *     appropriately normalized.
 *   - scriptingFlag: True if the scripting flag is enabled. Default true.
 *     Setting this to false causes the contents of <noscript> elements to
 *     be processed as normal content. The scriptingFlag option in the
 *     TreeBuilder should be set to the same value.
 */
public function __construct(TokenHandler $listener, $text, $options = [])
{
    // Boolean switches: anything "empty" (unset, false, 0, '') disables them.
    foreach (['ignoreErrors', 'ignoreCharRefs', 'ignoreNulls', 'skipPreprocess'] as $flag) {
        $this->$flag = !empty($options[$flag]);
    }
    // Unlike the switches above, scripting is on unless explicitly provided.
    $this->scriptingFlag = $options['scriptingFlag'] ?? true;

    $this->listener = $listener;
    $this->text = $text;
    $this->length = strlen($text);
    $this->pos = 0;
    $this->preprocessed = false;
}
/**
 * Set the callback which decides whether "<![CDATA[" opens a CDATA section.
 *
 * dataState() invokes the callback with no arguments each time it sees
 * "<![CDATA[". A truthy result makes the section be emitted as CDATA; a
 * falsy result, or no callback at all, makes it a bogus comment.
 *
 * @param callable|null $cb
 */
public function setEnableCdataCallback($cb)
{
    $this->enableCdataCallback = $cb;
}
/**
 * Tokenize the whole input stream in one call. This is the normal entry
 * point.
 *
 * @param array $options An associative array of options:
 *   - state : One of the STATE_* constants, a state in which to start.
 *     Defaults to STATE_START.
 *   - appropriateEndTag : The "appropriate end tag", which needs to be set
 *     if entering one of the raw text states.
 *   - fragmentNamespace : The fragment namespace
 *   - fragmentName : The fragment tag name
 */
public function execute($options = [])
{
    $this->state = $options['state'] ?? self::STATE_START;

    if (!isset($options['fragmentNamespace'])) {
        $this->fragmentNamespace = null;
        $this->fragmentName = null;
    } else {
        // May override the start state chosen above, depending on the
        // context element.
        $this->setFragmentContext($options['fragmentNamespace'], $options['fragmentName']);
    }

    $this->appropriateEndTag = $options['appropriateEndTag'] ?? null;

    $this->preprocess();
    $this->listener->startDocument($this, $this->fragmentNamespace, $this->fragmentName);
    $this->executeInternal(true);
}
/**
 * Get the preprocessed input text. Source offsets in event parameters are
 * relative to this string. If skipPreprocess was specified, this will be
 * the same as the input string.
 *
 * Calls preprocess() first, so it is safe to call before tokenizing.
 *
 * @return string
 */
public function getPreprocessedText()
{
    $this->preprocess();
    return $this->text;
}
/**
 * Change the state of the tokenizer during parsing. This is for use by the
 * tree builder to switch the tokenizer into one of the raw text states.
 * dataState() re-reads the state after each tag event so that the switch
 * takes effect immediately.
 *
 * @param int $state One of the STATE_* constants
 * @param string $appropriateEndTag The appropriate end tag
 */
public function switchState($state, $appropriateEndTag)
{
    $this->state = $state;
    $this->appropriateEndTag = $appropriateEndTag;
}
/**
 * Initialize the tokenizer for fragment parsing.
 *
 * Records the context element and, when it is an HTML element (empty
 * namespace or HTMLData::NS_HTML) whose content is not ordinary markup,
 * selects the matching start state. Other context elements leave the
 * current state alone.
 *
 * @param string $namespace The namespace of the context element
 * @param string $tagName The name of the context element
 */
public function setFragmentContext($namespace, $tagName)
{
    $this->fragmentNamespace = $namespace;
    $this->fragmentName = $tagName;

    $isHtmlElement = strval($namespace) === '' || $namespace === HTMLData::NS_HTML;
    if (!$isHtmlElement) {
        return;
    }

    // null means "leave the state as it is"
    $startState = null;
    switch ($tagName) {
        case 'title':
        case 'textarea':
            $startState = self::STATE_RCDATA;
            break;

        case 'style':
        case 'xmp':
        case 'iframe':
        case 'noembed':
        case 'noframes':
            $startState = self::STATE_RAWTEXT;
            break;

        case 'script':
            $startState = self::STATE_SCRIPT_DATA;
            break;

        case 'noscript':
            // Only raw text when scripting is enabled
            $startState = $this->scriptingFlag ? self::STATE_RAWTEXT : null;
            break;

        case 'plaintext':
            $startState = self::STATE_PLAINTEXT;
            break;
    }

    if ($startState !== null) {
        $this->state = $startState;
    }
}
/**
 * Notify the tokenizer that the document will be tokenized by repeated step()
 * calls. This must be called once only, before the first call to step().
 *
 * Resets the state to STATE_START, preprocesses the input and emits
 * startDocument() with no fragment context.
 */
public function beginStepping()
{
    $this->state = self::STATE_START;
    $this->preprocess();
    $this->listener->startDocument($this, null, null);
}
/**
 * Tokenize a minimum amount of text from the input stream, and emit the
 * resulting events.
 *
 * A null state means beginStepping() has not been called; this is reported
 * through fatal().
 *
 * @return bool True if the input continues and step() should be called
 *   again, false on EOF
 */
public function step()
{
    if ($this->state === null) {
        $this->fatal("beginStepping() must be called before step()");
    }
    return $this->executeInternal(false);
}
/**
 * Preprocess the input text, if it hasn't been done already.
 *
 * Normalizes CR and CRLF line endings to LF and, unless errors are ignored,
 * reports an error for each disallowed control character or noncharacter.
 * The work is done at most once per tokenizer, so calling
 * getPreprocessedText() before execute() does not report errors twice.
 */
protected function preprocess()
{
    if ($this->preprocessed || $this->skipPreprocess) {
        return;
    }
    // Mark as done up front so that repeated calls become no-ops.
    $this->preprocessed = true;

    // Normalize line endings
    if (strcspn($this->text, "\r") !== strlen($this->text)) {
        $normalized = preg_replace('/\r\n?/', "\n", $this->text);
        if ($normalized === null) {
            // preg_replace() returns null on PCRE failure; don't let that
            // silently replace the input.
            $this->throwPregError();
        } else {
            $this->text = $normalized;
            $this->length = strlen($this->text);
        }
    }

    if ($this->ignoreErrors) {
        // The scan below only reports errors; nothing more to do.
        return;
    }

    // Raise parse errors for any control characters
    static $re;
    if ($re === null) {
        // Note that we deliberately do not use the 'u' flag on the
        // regexp below, as that bypasses the PCRE JIT. Instead we
        // rewrite character classes containing codepoints which
        // require more than one UTF-8 byte as alternations.
        $re = '/[' .
            // "C0 controls" (u0000 - u001F) but not
            // "ASCII whitespace" (u0009, u000A, u000C, u000D, u0020) or
            // NULL (u0000)
            // https://infra.spec.whatwg.org/#c0-control
            // https://infra.spec.whatwg.org/#ascii-whitespace
            '\x{0001}-\x{0008}' .
            '\x{000B}' .
            '\x{000E}-\x{001F}' .
            // "Controls" other than C0 controls (u007F - u009F)
            // https://infra.spec.whatwg.org/#control
            '\x{007F}]|' .
            // (We can't use character classes above u007F)
            "\u{0080}|\u{0081}|\u{0082}|\u{0083}|" .
            "\u{0084}|\u{0085}|\u{0086}|\u{0087}|" .
            "\u{0088}|\u{0089}|\u{008A}|\u{008B}|" .
            "\u{008C}|\u{008D}|\u{008E}|\u{008F}|" .
            "\u{0090}|\u{0091}|\u{0092}|\u{0093}|" .
            "\u{0094}|\u{0095}|\u{0096}|\u{0097}|" .
            "\u{0098}|\u{0099}|\u{009A}|\u{009B}|" .
            "\u{009C}|\u{009D}|\u{009E}|\u{009F}|" .
            // HTML spec calls these "noncharacters"
            // https://infra.spec.whatwg.org/#noncharacter
            "\u{FDD0}|\u{FDD1}|\u{FDD2}|\u{FDD3}|" .
            "\u{FDD4}|\u{FDD5}|\u{FDD6}|\u{FDD7}|" .
            "\u{FDD8}|\u{FDD9}|\u{FDDA}|\u{FDDB}|" .
            "\u{FDDC}|\u{FDDD}|\u{FDDE}|\u{FDDF}|" .
            "\u{FDE0}|\u{FDE1}|\u{FDE2}|\u{FDE3}|" .
            "\u{FDE4}|\u{FDE5}|\u{FDE6}|\u{FDE7}|" .
            "\u{FDE8}|\u{FDE9}|\u{FDEA}|\u{FDEB}|" .
            "\u{FDEC}|\u{FDED}|\u{FDEE}|\u{FDEF}|" .
            "\u{FFFE}|\u{FFFF}|" .
            "\u{1FFFE}|\u{1FFFF}|" .
            "\u{2FFFE}|\u{2FFFF}|" .
            "\u{3FFFE}|\u{3FFFF}|" .
            "\u{4FFFE}|\u{4FFFF}|" .
            "\u{5FFFE}|\u{5FFFF}|" .
            "\u{6FFFE}|\u{6FFFF}|" .
            "\u{7FFFE}|\u{7FFFF}|" .
            "\u{8FFFE}|\u{8FFFF}|" .
            "\u{9FFFE}|\u{9FFFF}|" .
            "\u{AFFFE}|\u{AFFFF}|" .
            "\u{BFFFE}|\u{BFFFF}|" .
            "\u{CFFFE}|\u{CFFFF}|" .
            "\u{DFFFE}|\u{DFFFF}|" .
            "\u{EFFFE}|\u{EFFFF}|" .
            "\u{FFFFE}|\u{FFFFF}|" .
            "\u{10FFFE}|\u{10FFFF}/S";
    }

    $pos = 0;
    while ($pos < $this->length) {
        $count = preg_match($re, $this->text, $m, PREG_OFFSET_CAPTURE, $pos);
        if ($count === false) {
            $this->throwPregError();
        } elseif (!$count) {
            break;
        }
        // Report at the match offset, then resume just past the match
        $pos = $m[0][1];
        $this->error("disallowed control character", $pos);
        $pos += strlen($m[0][0]);
    }
}
/**
 * The main state machine, the common implementation of step() and execute().
 *
 * Each pass dispatches on the current state and stores the state returned by
 * the handler. STATE_EOF emits endDocument() and ends the run.
 *
 * @param bool $loop Set to true to loop until finished, false to step once.
 * @return bool True if the input continues, false on EOF
 */
protected function executeInternal($loop)
{
    for (;;) {
        switch ($this->state) {
            case self::STATE_DATA:
                $this->state = $this->dataState($loop);
                break;

            case self::STATE_RCDATA:
                $this->state = $this->textElementState(false);
                break;

            case self::STATE_RAWTEXT:
                $this->state = $this->textElementState(true);
                break;

            case self::STATE_SCRIPT_DATA:
                $this->state = $this->scriptDataState();
                break;

            case self::STATE_PLAINTEXT:
                $this->state = $this->plaintextState();
                break;

            case self::STATE_START:
                $this->state = self::STATE_DATA;
                break;

            case self::STATE_EOF:
                $this->listener->endDocument($this->length);
                return false;

            default:
                $this->fatal('invalid state');
        }

        if (!$loop) {
            // Single-step mode: one dispatch per call
            return true;
        }
    }
}
/**
 * Consume input text starting from the "data state".
 *
 * Each pass emits the character data up to the next markup construct, then
 * handles that construct: a tag, comment, DOCTYPE, CDATA section or bogus
 * comment. If no construct is found, the rest of the input is emitted as
 * character data and STATE_EOF is returned.
 *
 * @param bool $loop True to loop while still in the data state, false to
 *   process a single less-than sign.
 * @return int The next state index
 */
protected function dataState($loop)
{
    // Capture group numbers must stay in sync with the MD_* constants.
    // Note that under the "x" flag whitespace is still literal inside
    // character classes, so the spaces within [...] below are significant.
    $re = "~ <
        (?:
            ( /? )                        # 1. End tag open

            (                             # 2. Tag name
                # Try to match the ASCII letter required for the start of a start
                # or end tag. If this fails, a slash matched above can be
                # backtracked and then fed into the bogus comment alternative below.
                # As an optimization, notice if the tag is all-lowercase;
                # we can skip strtolower and null handling in that case.
                (?:
                    [a-z]++
                    # Portion after initial all-lowercase prefix
                    ( [^\t\n\f />]*+ ) |
                    # Capture initial uppercase letter
                    [A-Z]
                    # Then capture the rest of the tag name
                    [^\t\n\f />]*+
                )
            )
        |
            # Comment
            !--
            (                             # 4. Comment match detector
                > | -> | # Invalid short close
                (                         # 5. Comment contents
                    (?:
                        (?! --> )
                        (?! --!> )
                        (?! --! \\z )
                        (?! -- \\z )
                        (?! - \\z )
                        .
                    )*+
                )
                (                         # 6. Comment close
                    --> |   # Normal close
                    --!> |  # Comment end bang
                    --! |   # EOF in comment end bang state
                    -- |    # EOF in comment end state
                    - |     # EOF in comment end dash state
                            # EOF in comment state
                )
            ) |
            ( (?i)                        # 7. Doctype
                ! DOCTYPE

                # There must be at least one whitespace character to suppress
                # a parse error, but if there isn't one, this is still a
                # DOCTYPE. There is no way for the DOCTYPE string to end up
                # as a character node, the DOCTYPE subexpression must always
                # wholly match if we matched up to this point.

                ( [\t\n\f ]*+ )           # 8. Required whitespace
                ( [^\t\n\f >]*+ )         # 9. DOCTYPE name
                [\t\n\f ]*+
                (?:
                    # After DOCTYPE name state
                    PUBLIC
                    ( [\t\n\f ]* )            # 10. Required whitespace
                    (?:
                        \" ( [^\">]* ) \"? |  # 11. Double-quoted identifier
                        ' ( [^'>]* ) '? |     # 12. Single-quoted identifier
                        # Non-match: bogus
                    )
                    (?:
                        # After DOCTYPE public identifier state
                        # Assert quoted identifier before here
                        (?<= \" | ' )
                        ( [\t\n\f ]* )            # 13. Required whitespace
                        (?:
                            \" ( [^\">]* ) \"? |  # 14. Double-quoted identifier
                            ' ( [^'>]* ) '? |     # 15. Single-quoted identifier
                            # Non-match: no system ID
                        )
                    )?
                |
                    SYSTEM
                    ( [\t\n\f ]* )            # 16. Required whitespace
                    (?:
                        \" ( [^\">]* ) \"? |  # 17. Double-quoted identifier
                        ' ( [^'>]* ) '? |     # 18. Single-quoted identifier
                        # Non-match: bogus
                    )
                |  # No keyword is OK
                )
                [\t\n\f ]*
                ( [^>]*+ )                # 19. Bogus DOCTYPE
                ( >? )                    # 20. End of DOCTYPE
            ) |
            ( ! \[ CDATA \[ ) |           # 21. CDATA section
            ( [!?/] [^>]*+ ) >?           # 22. Bogus comment

            # Anything else: parse error and emit literal less-than sign.
            # We will let the match fail at this position and later check
            # for less-than signs in the resulting text node.
        )
        ~xs";

    $nextState = self::STATE_DATA;
    do {
        # As an optimization, quick scan ahead to the first "difficult"
        # character
        $npos = strcspn($this->text, "<&\0", $this->pos) + $this->pos;
        # While the "difficult" section is in fact a simple entity, keep
        # skipping ahead.
        $mpos = $npos;
        while (preg_match('/&(?:amp|apos|lt|gt|quot|nbsp);/A', $this->text, $m, 0, $mpos) === 1) {
            $mpos += strlen($m[0]);
            $mpos += strcspn($this->text, "<&\0", $mpos);
        }

        $count = preg_match($re, $this->text, $m, PREG_OFFSET_CAPTURE, $mpos);
        if ($count === false) {
            $this->throwPregError();
        } elseif (!$count) {
            // Text runs to end
            $dataIsSimple = ($npos === $this->length);
            $dataHasSimpleRefs = ($mpos === $this->length);
            $this->emitDataRange(
                $this->pos,
                $this->length - $this->pos,
                $dataIsSimple,
                $dataHasSimpleRefs
            );
            $this->pos = $this->length;
            $nextState = self::STATE_EOF;
            break;
        }

        $startPos = $m[0][1];
        $dataIsSimple = ($npos === $startPos);
        $dataHasSimpleRefs = ($mpos === $startPos);
        $tagName = isset($m[self::MD_TAG_NAME]) ? $m[self::MD_TAG_NAME][0] : '';

        // Emit the character data preceding the markup
        $this->emitDataRange($this->pos, $startPos - $this->pos, $dataIsSimple, $dataHasSimpleRefs);
        $this->pos = $startPos;
        $nextPos = $m[0][1] + strlen($m[0][0]);

        if (strlen($tagName)) {
            // Tag
            $isEndTag = (bool)strlen($m[self::MD_END_TAG_OPEN][0]);
            // Group 3 is only present in $m when the lowercase alternative
            // matched: preg_match() drops trailing unmatched groups.
            $isAllLower = isset($m[self::MD_TAG_AFTER_LOWERCASE])
                && $m[self::MD_TAG_AFTER_LOWERCASE][0] === '';
            if (!$isAllLower) {
                // As an optimization, we skip these steps for the common
                // case of an all-lowercase tag name
                if (!$this->ignoreNulls) {
                    $tagName = $this->handleNulls($tagName, $m[self::MD_TAG_NAME][1]);
                }
                $tagName = strtolower($tagName);
            }
            $this->pos = $nextPos;

            if ($nextPos < $this->length && $this->text[$nextPos] === '>') {
                // "Simple tag" optimization; skip attribute parsing and
                // stay in this state
                $this->pos = ++$nextPos;
                if ($isEndTag) {
                    $this->listener->endTag($tagName, $startPos, $nextPos - $startPos);
                } else {
                    $this->listener->startTag(
                        $tagName,
                        new PlainAttributes(),
                        false,
                        $startPos,
                        $nextPos - $startPos
                    );
                }
                // Respect any state switch imposed by the parser
                $nextState = $this->state;
                continue;
            }

            $nextState = $this->handleAttribsAndClose(self::STATE_DATA, $tagName, $isEndTag, $startPos);
            $nextPos = $this->pos;
            if ($nextState === self::STATE_EOF) {
                break;
            }
            // Respect any state switch imposed by the parser
            $nextState = $this->state;
        } elseif (isset($m[self::MD_COMMENT]) && $m[self::MD_COMMENT][1] >= 0) {
            // Comment
            $this->interpretCommentMatches($m);
        } elseif (isset($m[self::MD_DOCTYPE]) && $m[self::MD_DOCTYPE][1] >= 0) {
            // DOCTYPE
            $this->interpretDoctypeMatches($m);
        } elseif (isset($m[self::MD_CDATA]) && $m[self::MD_CDATA][1] >= 0) {
            // CDATA: ask the callback (once) whether CDATA sections are
            // permitted at this point. No callback means they are not.
            $isCdata = $this->enableCdataCallback
                && call_user_func($this->enableCdataCallback);
            if ($isCdata) {
                // Skip "<" plus the matched "![CDATA["
                $this->pos += strlen($m[self::MD_CDATA][0]) + 1;
                $endPos = strpos($this->text, ']]>', $this->pos);
                if ($endPos === false) {
                    // Unterminated section runs to the end of the input
                    $this->emitCdataRange(
                        $this->pos,
                        $this->length - $this->pos,
                        $startPos,
                        $this->length - $startPos
                    );
                    $this->pos = $this->length;
                    $nextState = self::STATE_EOF;
                    break;
                } else {
                    $outerEndPos = $endPos + strlen(']]>');
                    $this->emitCdataRange(
                        $this->pos,
                        $endPos - $this->pos,
                        $startPos,
                        $outerEndPos - $startPos
                    );
                    $nextPos = $outerEndPos;
                }
            } else {
                // Bogus comment
                $this->error("unexpected CDATA interpreted as bogus comment");
                $endPos = strpos($this->text, '>', $this->pos);
                // Contents start after "<!"
                $bogusPos = $this->pos + 2;
                if ($endPos === false) {
                    $nextPos = $this->length;
                    $contents = substr($this->text, $bogusPos);
                } else {
                    $nextPos = $endPos + 1;
                    $contents = substr($this->text, $bogusPos, $endPos - $bogusPos);
                }
                $contents = $this->handleNulls($contents, $bogusPos);
                // The source range runs from "<" up to and including ">" (or
                // to EOF). $endPos must not be used here: it is false when
                // there is no ">", and excludes the ">" when there is one.
                $this->listener->comment($contents, $startPos, $nextPos - $startPos);
            }
        } elseif (isset($m[self::MD_BOGUS_COMMENT]) && $m[self::MD_BOGUS_COMMENT][1] >= 0) {
            // Bogus comment
            $contents = $m[self::MD_BOGUS_COMMENT][0];
            $bogusPos = $m[self::MD_BOGUS_COMMENT][1];
            if ($m[0][0] === '</>') {
                $this->error("empty end tag");
                // No token emitted
            } elseif ($m[0][0] === '</') {
                $this->error('EOF in end tag');
                $sourceStart = $m[0][1];
                '@phan-var int $sourceStart';
                /** @var int $sourceStart */
                $this->listener->characters('</', 0, 2, $sourceStart, 2);
            } else {
                $this->error("unexpected <{$contents[0]} interpreted as bogus comment");
                if ($contents[0] !== '?') {
                    // For starting types other than <?, the initial character is
                    // not in the tag contents
                    $contents = substr($contents, 1);
                    $bogusPos++;
                }
                $contents = $this->handleNulls($contents, $bogusPos);
                $this->listener->comment($contents, $startPos, $nextPos - $startPos);
            }
        } else {
            $this->fatal('unexpected data state match');
        }
        $this->pos = $nextPos;
    } while ($loop && $nextState === self::STATE_DATA);

    return $nextState;
}
/**
 * Interpret the data state match results for a detected comment, and emit
 * events as appropriate.
 *
 * Error positions are computed against the raw comment text from the input,
 * not the NUL-replaced text passed to the listener, because replacing NUL
 * with U+FFFD changes byte offsets.
 *
 * @param array $m The match array
 */
protected function interpretCommentMatches($m)
{
    $outerStart = $m[0][1];
    $outerLength = strlen($m[0][0]);
    $innerStart = $outerStart + strlen('<!--');
    $rawContents = isset($m[self::MD_COMMENT_INNER]) ? $m[self::MD_COMMENT_INNER][0] : '';
    $innerLength = strlen($rawContents);

    if ($m[0][0] === '<!-->' || $m[0][0] === '<!--->') {
        // These are special cases in the comment start state
        $this->error('not enough dashes in empty comment', $outerStart);
        $this->listener->comment('', $outerStart, $outerLength);
        return;
    }

    $contents = $rawContents;
    if (!$this->ignoreNulls) {
        $contents = $this->handleNulls($rawContents, $innerStart);
    }
    $close = $m[self::MD_COMMENT_END][0];
    $closePos = $m[self::MD_COMMENT_END][1];

    if (!$this->ignoreErrors) {
        if ($close === '--!>') {
            $this->error('invalid comment end bang', $closePos);
        } elseif ($close === '-' || $close === '--' || $close === '--!') {
            $this->error('EOF part way through comment close', $closePos);
        } elseif ($close === '') {
            $this->error('EOF in comment', $closePos);
        }

        // Report each dash directly preceding the comment close
        $dashSearchLength = $innerLength;
        while ($dashSearchLength > 0 && $rawContents[$dashSearchLength - 1] === '-') {
            $this->error('invalid extra dash at comment end', $innerStart + $dashSearchLength - 1);
            $dashSearchLength--;
        }

        // Report "--" sequences in the remainder. Matches that start within
        // the trailing dashes were already reported above.
        $offset = 0;
        while ($offset < $dashSearchLength) {
            $offset = strpos($rawContents, '--', $offset);
            if ($offset === false || $offset >= $dashSearchLength) {
                break;
            }
            $this->error('bare "--" found in comment', $innerStart + $offset);
            $offset += 2;
        }
    }

    $this->listener->comment($contents, $outerStart, $outerLength);
}
/**
 * Interpret the data state match results for a detected DOCTYPE token,
 * and emit events as appropriate.
 *
 * Derives the name, public identifier, system identifier and quirks flag
 * from the DOCTYPE capture groups (MD_DT_*), reports parse errors unless
 * errors are ignored, and finally emits a single doctype() event covering
 * the whole match.
 *
 * @param array $m The match array
 */
protected function interpretDoctypeMatches($m)
{
    // Local shorthand; most error() calls below are skipped when it is true
    $igerr = $this->ignoreErrors;
    $name = null;
    $public = null;
    $system = null;
    $quirks = false;

    // Missing ">" can only be caused by EOF
    $eof = !strlen($m[self::MD_DT_END][0]);

    if (strlen($m[self::MD_DT_BOGUS][0])) {
        // Bogus DOCTYPE state
        if (!$igerr) {
            $this->error('invalid DOCTYPE contents', $m[self::MD_DT_BOGUS][1]);
        }
        // Set quirks mode unless there was a properly quoted SYSTEM identifier
        // NOTE(review): only the SYSTEM-keyword groups are checked here, not
        // the PUBSYS groups for a system identifier following a public one -
        // confirm that this is intended.
        $haveDq = isset($m[self::MD_DT_SYSTEM_DQ]) && $m[self::MD_DT_SYSTEM_DQ][1] >= 0;
        $haveSq = isset($m[self::MD_DT_SYSTEM_SQ]) && $m[self::MD_DT_SYSTEM_SQ][1] >= 0;
        if (!$haveDq && !$haveSq) {
            $quirks = true;
        }
        // EOF in the bogus state does not set quirks mode (but it is a parse error)
        if ($eof && !$igerr) {
            $this->error('unterminated DOCTYPE');
        }
    } elseif ($eof) {
        if (!$igerr) {
            $this->error('unterminated DOCTYPE');
        }
        $quirks = true;
    }

    // Whitespace between "DOCTYPE" and the name is required
    if (!$igerr && !$eof && !strlen($m[self::MD_DT_NAME_WS][0])) {
        $this->error('missing whitespace', $m[self::MD_DT_NAME_WS][1]);
    }

    if (strlen($m[self::MD_DT_NAME][0])) {
        // DOCTYPE name
        $name = $this->handleNulls(strtolower($m[self::MD_DT_NAME][0]), $m[self::MD_DT_NAME][1]);
    } else {
        if (!$eof && !$igerr) {
            $this->error('missing DOCTYPE name', $m[self::MD_DOCTYPE][1] + strlen('!DOCTYPE'));
        }
        $quirks = true;
    }

    if (isset($m[self::MD_DT_PUBLIC_WS]) && $m[self::MD_DT_PUBLIC_WS][1] >= 0) {
        // PUBLIC keyword found
        $public = $this->interpretDoctypeQuoted($m, self::MD_DT_PUBLIC_DQ, self::MD_DT_PUBLIC_SQ, $quirks);
        if ($public === null) {
            $quirks = true;
            if (!$eof && !$igerr) {
                $this->error('missing public identifier', $m[self::MD_DT_PUBLIC_WS][1]);
            }
        } elseif (!$igerr && !$eof && !strlen($m[self::MD_DT_PUBLIC_WS][0])) {
            $this->error('missing whitespace', $m[self::MD_DT_PUBLIC_WS][1]);
        }

        // Check for a system ID after the public ID
        $haveDq = isset($m[self::MD_DT_PUBSYS_DQ]) && $m[self::MD_DT_PUBSYS_DQ][1] >= 0;
        $haveSq = isset($m[self::MD_DT_PUBSYS_SQ]) && $m[self::MD_DT_PUBSYS_SQ][1] >= 0;
        if ($haveDq || $haveSq) {
            if (!$igerr && !strlen($m[self::MD_DT_PUBSYS_WS][0])) {
                $this->error('missing whitespace', $m[self::MD_DT_PUBSYS_WS][1]);
            }
            $system = $this->interpretDoctypeQuoted($m, self::MD_DT_PUBSYS_DQ, self::MD_DT_PUBSYS_SQ, $quirks);
        }
    } elseif (isset($m[self::MD_DT_SYSTEM_WS]) && $m[self::MD_DT_SYSTEM_WS][1] >= 0) {
        // SYSTEM keyword found
        $system = $this->interpretDoctypeQuoted($m, self::MD_DT_SYSTEM_DQ, self::MD_DT_SYSTEM_SQ, $quirks);
        if ($system === null) {
            $quirks = true;
            $this->error('missing system identifier', $m[self::MD_DT_SYSTEM_WS][1]);
        } elseif (!$igerr && !strlen($m[self::MD_DT_SYSTEM_WS][0])) {
            $this->error('missing whitespace', $m[self::MD_DT_SYSTEM_WS][1]);
        }
    }

    // Source range: offset and byte length of the whole DOCTYPE match
    $this->listener->doctype($name, $public, $system, $quirks, $m[0][1], strlen($m[0][0]));
}
/**
 * DOCTYPE helper which interprets a quoted string (or lack thereof).
 *
 * Looks at the double-quoted group first, then the single-quoted one. Sets
 * $quirks when the identifier runs to the end of the input, or when it was
 * cut short by ">" (which is also reported as an error).
 *
 * @param array $m The match array, captured with offsets
 * @param int $dq Index of the double-quoted capture group
 * @param int $sq Index of the single-quoted capture group
 * @param bool &$quirks Set to true if the identifier was not properly closed
 * @return string|null The quoted value, with nulls replaced, or null if
 *   neither group matched.
 */
protected function interpretDoctypeQuoted($m, $dq, $sq, &$quirks)
{
    $capture = null;
    foreach ([$dq, $sq] as $groupIndex) {
        if (isset($m[$groupIndex]) && $m[$groupIndex][1] >= 0) {
            $capture = $m[$groupIndex];
            break;
        }
    }
    if ($capture === null) {
        return null;
    }

    $value = $capture[0];
    $startPos = $capture[1];
    $endPos = $startPos + strlen($value);

    if ($endPos >= $this->length) {
        // This is a parse error, but we already emitted a generic EOF error
        $quirks = true;
    } elseif ($this->text[$endPos] === '>') {
        $this->error('DOCTYPE identifier terminated by ">"', $endPos);
        $quirks = true;
    }

    return $this->handleNulls($value, $startPos);
}
/**
 * Generic helper for all those points in the spec where U+0000 needs to be
 * replaced with U+FFFD with a parse error issued.
 *
 * Returns the text unchanged when the ignoreNulls option is set. Otherwise
 * one error is reported per NUL byte (unless errors are ignored) and every
 * NUL is replaced.
 *
 * @param string $text The text to be converted
 * @param int $sourcePos The input byte offset from which $text was
 *   extracted, for error position reporting.
 * @return string The converted text
 */
protected function handleNulls($text, $sourcePos)
{
    if ($this->ignoreNulls) {
        return $text;
    }

    if (!$this->ignoreErrors) {
        $offset = 0;
        // $nullPos + 1 never exceeds strlen($text), so it is always a valid
        // strpos() offset.
        while (($nullPos = strpos($text, "\0", $offset)) !== false) {
            $this->error("replaced null character", $sourcePos + $nullPos);
            $offset = $nullPos + 1;
        }
    }

    return str_replace("\0", self::REPLACEMENT_CHAR, $text);
}
/**
 * Generic helper for points in the spec which say that an error should
 * be issued when certain ASCII characters are seen, with no other action
 * taken.
 *
 * Bytes below 0x20 or at or above 0x7f are reported as "U+00XX"; other
 * characters are reported literally, in double quotes.
 *
 * @param string $mask Mask for strcspn
 * @param string $text The input text
 * @param int $offset The start of the range within $text to search
 * @param int $length The length of the range within $text to search
 * @param int $sourcePos The offset within the input text corresponding
 *   to $text, for error position reporting.
 */
protected function handleAsciiErrors($mask, $text, $offset, $length, $sourcePos)
{
    while ($length > 0) {
        // Skip the run of characters that are not in the mask
        $validLength = strcspn($text, $mask, $offset, $length);
        $offset += $validLength;
        $length -= $validLength;
        if ($length <= 0) {
            break;
        }

        $char = $text[$offset];
        $codepoint = ord($char);
        if ($codepoint < 0x20 || $codepoint >= 0x7f) {
            $message = sprintf('unexpected U+00%02X', $codepoint);
        } else {
            $message = "unexpected \"$char\"";
        }
        $this->error($message, $offset + $sourcePos);

        // Step over the offending character
        $offset++;
        $length--;
    }
}
// This string isn't used directly: it's input to GenerateDataFiles.php
// which will substitute in the named entities and create a
// compile-time constant string in HtmlData::$charRefRegex
// Only compile-time constants are handled efficiently in the
// regexp cache; otherwise we pay for a 26k strcmp each time we
// fetch the regexp from the cache.
//
// The capture group numbers correspond to the MC_* constants. The
// {{NAMED_ENTITY_REGEX}} placeholder is the part that gets substituted.
public const CHARREF_REGEX = '~
( .*? ) # 1. prefix
&
(?:
\#
(?:
0*(
\d
+) | # 2. decimal
[xX]0*([0-9A-Fa-f]+) # 3. hexadecimal
)
( ; ) ? # 4. semicolon
|
(
\#
) # 5. bare hash
|
({{NAMED_ENTITY_REGEX}}) # 6. known named
(?:
(?<! ; ) # Assert no semicolon prior
( [=a-zA-Z0-9] ) # 7. attribute suffix
)?
|
( [a-zA-Z0-9]+ ; ) # 8. invalid named
)
# S = study, for efficient knownNamed
# A = anchor, to avoid unnecessary movement of the whole pattern on failure
~xAsS';
/**
 * Expand character references in some text, and emit errors as appropriate.
 *
 * The text is matched repeatedly against HTMLData::$charRefRegex. Each match
 * yields a literal prefix followed by one reference; text after the last
 * match is copied through unchanged. Returns $text untouched when the
 * ignoreCharRefs option is set.
 *
 * @param string $text The text to expand
 * @param int $sourcePos The input position of $text
 * @param bool $inAttr True if the text is within an attribute value
 * @param string $additionalAllowedChar An unused string which the spec
 *   inexplicably spends a lot of space telling you how to derive. It
 *   suppresses errors in a place where no errors are emitted anyway.
 * @return string The expanded text
 */
protected function handleCharRefs($text, $sourcePos, $inAttr = false, $additionalAllowedChar = '')
{
    if ($this->ignoreCharRefs) {
        return $text;
    }

    // Individual codepoints which are expanded but reported as errors.
    // Built once rather than for every reference.
    static $disallowedCodepoints = [
        0x000B => true,
        0xFFFE => true, 0xFFFF => true,
        0x1FFFE => true, 0x1FFFF => true,
        0x2FFFE => true, 0x2FFFF => true,
        0x3FFFE => true, 0x3FFFF => true,
        0x4FFFE => true, 0x4FFFF => true,
        0x5FFFE => true, 0x5FFFF => true,
        0x6FFFE => true, 0x6FFFF => true,
        0x7FFFE => true, 0x7FFFF => true,
        0x8FFFE => true, 0x8FFFF => true,
        0x9FFFE => true, 0x9FFFF => true,
        0xAFFFE => true, 0xAFFFF => true,
        0xBFFFE => true, 0xBFFFF => true,
        0xCFFFE => true, 0xCFFFF => true,
        0xDFFFE => true, 0xDFFFF => true,
        0xEFFFE => true, 0xEFFFF => true,
        0xFFFFE => true, 0xFFFFF => true,
        0x10FFFE => true, 0x10FFFF => true,
    ];

    $out = '';
    $pos = 0;
    $length = strlen($text);
    $matches = [];
    $count = preg_match_all(HTMLData::$charRefRegex, $text, $matches, PREG_SET_ORDER);
    if ($count === false) {
        $this->throwPregError();
    }

    foreach ($matches as $m) {
        $out .= $m[self::MC_PREFIX];
        // Input offset of the "&" of this reference
        $errorPos = $sourcePos + $pos + strlen($m[self::MC_PREFIX]);
        $pos += strlen($m[0]);

        if (isset($m[self::MC_HASH]) && strlen($m[self::MC_HASH])) {
            // Bare &#
            $this->error('Expected digits after &#', $errorPos);
            $out .= '&#';
            continue;
        }

        $knownNamed = $m[self::MC_NAMED] ?? '';
        $attributeSuffix = $m[self::MC_SUFFIX] ?? '';

        $haveSemicolon =
            (isset($m[self::MC_SEMICOLON]) && strlen($m[self::MC_SEMICOLON]))
            || (strlen($knownNamed) && $knownNamed[strlen($knownNamed) - 1] === ';')
            || (isset($m[self::MC_INVALID]) && strlen($m[self::MC_INVALID]));

        if ($inAttr && !$haveSemicolon) {
            if (strlen($attributeSuffix)) {
                // In an attribute, an unterminated named reference followed
                // by "=" or an alphanumeric is left as literal text.
                if (!$this->ignoreErrors && $attributeSuffix === '=') {
                    $this->error('invalid equals sign after named character reference', $errorPos);
                }
                $out .= '&' . $knownNamed . $attributeSuffix;
                continue;
            }
        }

        if (!$this->ignoreErrors && !$haveSemicolon) {
            $this->error('character reference missing semicolon', $errorPos);
        }

        if (isset($m[self::MC_DECIMAL]) && strlen($m[self::MC_DECIMAL])) {
            // Decimal
            if (strlen($m[self::MC_DECIMAL]) > 7) {
                // Too many digits to be a valid codepoint
                $this->error('invalid numeric reference', $errorPos);
                $out .= self::REPLACEMENT_CHAR;
                continue;
            }
            $codepoint = intval($m[self::MC_DECIMAL]);
        } elseif (isset($m[self::MC_HEXDEC]) && strlen($m[self::MC_HEXDEC])) {
            // Hexadecimal
            if (strlen($m[self::MC_HEXDEC]) > 6) {
                // Too many digits to be a valid codepoint
                $this->error('invalid numeric reference', $errorPos);
                $out .= self::REPLACEMENT_CHAR;
                continue;
            }
            $codepoint = intval($m[self::MC_HEXDEC], 16);
        } elseif ($knownNamed !== '') {
            $out .= HTMLData::$namedEntityTranslations[$knownNamed] . $attributeSuffix;
            continue;
        } elseif (isset($m[self::MC_INVALID]) && strlen($m[self::MC_INVALID])) {
            if (!$this->ignoreErrors) {
                $this->error('invalid named reference', $errorPos);
            }
            $out .= '&' . $m[self::MC_INVALID];
            continue;
        } else {
            $this->fatal('unable to identify char ref submatch');
            // @phan-suppress-next-line PhanPluginUnreachableCode False positive that var is not defined
            $codepoint = 0;
        }

        // Interpret $codepoint
        if ($codepoint === 0
            || ($codepoint >= 0xD800 && $codepoint <= 0xDFFF)
            || $codepoint > 0x10FFFF
        ) {
            // NUL, surrogates and out-of-range values become U+FFFD
            if (!$this->ignoreErrors) {
                $this->error('invalid numeric reference', $errorPos);
            }
            $out .= self::REPLACEMENT_CHAR;
        } elseif (isset(HTMLData::$legacyNumericEntities[$codepoint])) {
            if (!$this->ignoreErrors) {
                $this->error('invalid reference to non-ASCII control character', $errorPos);
            }
            $out .= HTMLData::$legacyNumericEntities[$codepoint];
        } else {
            if (!$this->ignoreErrors) {
                if (($codepoint >= 1 && $codepoint <= 8)
                    || ($codepoint >= 0x0d && $codepoint <= 0x1f)
                    || ($codepoint >= 0x7f && $codepoint <= 0x9f)
                    || ($codepoint >= 0xfdd0 && $codepoint <= 0xfdef)
                    || isset($disallowedCodepoints[$codepoint])
                ) {
                    $this->error('invalid numeric reference to control character', $errorPos);
                }
            }
            $out .= \UtfNormal\Utils::codepointToUtf8($codepoint);
        }
    }

    // Copy through whatever follows the last reference
    if ($pos < $length) {
        $out .= substr($text, $pos);
    }
    return $out;
}
/**
 * Emit a range of the input text as a character token, and emit related
 * errors, with validity rules as per the data state.
 *
 * If character references, nulls and errors are all configured to be
 * ignored, the range is handed to the listener untouched regardless of
 * the value of $isSimple.
 *
 * @param int $pos Offset within the input text
 * @param int $length The length of the range
 * @param bool $isSimple True if you know that the data range does not
 *   contain < \0 or &; false is safe if you're not sure
 * @param bool $hasSimpleRefs True if you know that any character
 *   references are semicolon terminated and in the list of $commonEntities;
 *   false is safe if you're not sure
 */
protected function emitDataRange($pos, $length, $isSimple = false, $hasSimpleRefs = false)
{
    if ($length === 0) {
        return;
    }
    if ($this->ignoreCharRefs && $this->ignoreNulls && $this->ignoreErrors) {
        // Pretend this data range doesn't contain < \0 or &
        $isSimple = true;
    }
    if ($isSimple) {
        // Nothing needs rewriting, so the listener reads straight from the input
        $this->listener->characters($this->text, $pos, $length, $pos, $length);
    } else {
        if (!$this->ignoreErrors) {
            // Any bare "<" in a data state text node is a parse error.
            // Uniquely to the data state, nulls are just flagged as errors
            // and passed through, they are not replaced.
            // The mask must hold exactly "<" and NUL; any stray byte in it
            // (for example a line feed) would presumably be reported as an
            // error as well.
            $this->handleAsciiErrors("<\0", $this->text, $pos, $length, 0);
        }
        $text = substr($this->text, $pos, $length);
        if ($hasSimpleRefs) {
            // Fast path: plain string replacement of the common entities
            $text = strtr($text, self::$commonEntities);
        } else {
            $text = $this->handleCharRefs($text, $pos);
        }
        // $text is now a private copy, so it is addressed from offset 0, while
        // the source position and length still describe the original input
        $this->listener->characters($text, 0, strlen($text), $pos, $length);
    }
}
/**
 * Emit the content of a CDATA section as a character token, with validity
 * rules as per the CDATA section state. The text is passed to the listener
 * exactly as it appears in the input; nothing is expanded or replaced here.
 *
 * @param int $innerPos The position after the <![CDATA[
 * @param int $innerLength The length of the string not including the terminating ]]>
 * @param int $outerPos The position of the start of the <![CDATA[
 * @param int $outerLength The length of the whole input region being emitted
 */
protected function emitCdataRange($innerPos, $innerLength, $outerPos, $outerLength)
{
    $listener = $this->listener;
    $listener->characters(
        $this->text,
        $innerPos,
        $innerLength,
        $outerPos,
        $outerLength
    );
}
/**
 * Emit a range of characters from the input text for one of the RCDATA,
 * RAWTEXT, script data or PLAINTEXT states. These states differ only in
 * whether character references are expanded, so that is a parameter.
 *
 * @param bool $ignoreCharRefs True to leave character references unexpanded
 *   for this range; the tokenizer-wide ignoreCharRefs setting also forces this
 * @param int $pos The input position
 * @param int $length The length of the range to be emitted
 */
protected function emitRawTextRange($ignoreCharRefs, $pos, $length)
{
    if ($length === 0) {
        return;
    }

    $skipRefs = $ignoreCharRefs || $this->ignoreCharRefs;

    if ($skipRefs && $this->ignoreNulls) {
        // Neither references nor nulls need processing: emit the input as-is
        $this->listener->characters($this->text, $pos, $length, $pos, $length);
        return;
    }

    $chunk = substr($this->text, $pos, $length);
    if (!$skipRefs) {
        $chunk = $this->handleCharRefs($chunk, $pos);
    }
    $chunk = $this->handleNulls($chunk, $pos);

    // The processed copy is addressed from offset 0; the last two arguments
    // still describe where the range sits in the original input
    $this->listener->characters($chunk, 0, strlen($chunk), $pos, $length);
}
/**
 * The entry point for the RCDATA and RAWTEXT states.
 *
 * Text is emitted up to the next appropriate end tag. If there is no
 * appropriate end tag, or none is found, everything up to the end of the
 * input is emitted as text and the tokenizer moves to the EOF state.
 *
 * @param bool $ignoreCharRefs True to ignore character references regardless
 *   of configuration, false to respect the configuration.
 * @return int The next state index
 */
protected function textElementState($ignoreCharRefs)
{
    if ($this->appropriateEndTag === null) {
        $this->emitRawTextRange($ignoreCharRefs, $this->pos, $this->length - $this->pos);
        $this->pos = $this->length;
        return self::STATE_EOF;
    }

    // The lookahead class is the HTML whitespace set (tab, LF, FF, space)
    // plus "/" and ">". The space must be present, otherwise an end tag
    // followed by a space would not be recognised.
    $re = "~</
        {$this->appropriateEndTag}
        # Assert that the end tag name state is exited appropriately,
        # since the anything else case leads to the tag being treated as
        # a literal
        (?=[\t\n\f />])
        ~ix";

    do {
        $count = preg_match($re, $this->text, $m, PREG_OFFSET_CAPTURE, $this->pos);
        if ($count === false) {
            $this->throwPregError();
        } elseif (!$count) {
            // Text runs to end
            $this->emitRawTextRange($ignoreCharRefs, $this->pos, $this->length - $this->pos);
            $this->pos = $this->length;
            return self::STATE_EOF;
        }
        $startPos = $m[0][1];

        // Emit text before tag
        $this->emitRawTextRange($ignoreCharRefs, $this->pos, $startPos - $this->pos);

        $matchLength = strlen($m[0][0]);
        $this->pos = $startPos + $matchLength;
        // STATE_RCDATA is passed for both RCDATA and RAWTEXT; it only serves
        // as the "stay in this text state" marker tested by the loop below.
        $nextState = $this->handleAttribsAndClose(
            self::STATE_RCDATA,
            $this->appropriateEndTag,
            true,
            $startPos
        );
    } while ($nextState === self::STATE_RCDATA);
    return $nextState;
}
/**
 * Advance $this->pos, consuming all tag attributes found at the current
 * position. The new position will be at the end of the tag or at the end
 * of the input string.
 *
 * To improve performance of consumers which don't need to read the
 * attribute array, interpretation of the PCRE match results is deferred.
 *
 * Whitespace throughout is the HTML tokenizer set: tab, LF, FF and space.
 *
 * - @todo: Make deferral configurable.
 * - @todo: Measure performance improvement, assess whether the LazyAttributes
 *   feature is warranted.
 *
 * @return Attributes A LazyAttributes wrapping the raw matches when at least
 *   one match was found, otherwise an empty PlainAttributes
 */
protected function consumeAttribs()
{
    // The pattern is constant, so it is built once and cached
    static $re;
    if ($re === null) {
        $re = '~
            [\t\n\f ]*+  # Ignored whitespace before attribute name
            (?! /> )     # Do not consume self-closing end of tag
            (?! > )      # Do not consume normal closing bracket
            (?:
                # Before attribute name state
                # A bare slash at this point, not part of a self-closing end tag, is
                # consumed and ignored (with a parse error), returning to the before
                # attribute name state.
                ( / ) |   # 1. Bare slash
                # Attribute name state
                # Note that the first character can be an equals sign, this is a parse error
                # but still generates an attribute called "=". Thus the only way the match
                # could fail here is due to EOF.
                ( # 2. Attribute name
                    (?:
                        ( [-a-z]++ ) | # 3. Optional "simple" prefix
                        [^\t\n\f />]
                    )
                    [^\t\n\f =/>]*+
                )
                # After attribute name state
                [\t\n\f ]*+
                (?:
                    =
                    # Before attribute value state
                    # Ignore whitespace
                    [\t\n\f ]*+
                    (?:
                        # If an end-quote is omitted, the attribute will run to the end of the
                        # string, leaving no closing bracket. So the caller will detect the
                        # unexpected EOF and will not emit the tag, which is correct.
                        " ( # 4. Double-quoted attribute values
                            [^"&\r\0]*+  # Simple prefix
                            ( # 5. Perhaps some simple entities
                                (?: &(?:amp|apos|lt|gt|quot|nbsp); | [^"&\r\0] )*+
                            )
                            ( [^"]*+ ) # 6. Unsimple suffix
                        ) "? |
                        \' ( # 7. Single-quoted attribute value
                            [^\'&\r\0]*+  # Simple prefix
                            ( # 8. Perhaps some simple entities
                                (?: &(?:amp|apos|lt|gt|quot|nbsp); | [^\'&\r\0] )*+
                            )
                            ( [^\']*+ ) # 9. Unsimple suffix
                        ) \'? |
                        ( # 10. Unquoted attribute value
                            [^\t\n\f >\r"&\'\0=<`]*+  # Simple prefix
                            ( [^\t\n\f >]*+ ) # 11. Unsimple suffix
                        )
                    )
                    # Or nothing: an attribute with an empty value. The attribute name was
                    # terminated by a slash, closing bracket or EOF
                    |
                )
            )
            # The /A modifier causes preg_match_all to give contiguous chunks
            ~xAS';
    }
    $count = preg_match_all(
        $re,
        $this->text,
        $m,
        PREG_SET_ORDER | PREG_OFFSET_CAPTURE,
        $this->pos
    );
    if ($count === false) {
        $this->throwPregError();
        // @phan-suppress-next-line PhanPluginUnreachableCode False positive that var is not defined
        $attribs = new PlainAttributes();
    } elseif ($count) {
        // Move to the end of the last match: its offset plus its length
        $this->pos = $m[$count - 1][0][1] + strlen($m[$count - 1][0][0]);
        $attribs = new LazyAttributes($m, function ($m) {
            return $this->interpretAttribMatches($m);
        });
    } else {
        $attribs = new PlainAttributes();
    }

    // Consume trailing whitespace. This is strictly part of the before attribute
    // name state, but we didn't consume it in the regex since we used a principle
    // of one match equals one attribute.
    $this->pos += strspn($this->text, "\t\n\f ", $this->pos);
    return $attribs;
}
/**
 * Interpret the results of the attribute preg_match_all(). Emit errors as
 * appropriate and return an associative array.
 *
 * Bare-slash matches only raise an error and produce no attribute. When the
 * same name occurs more than once, the first value is kept and an error is
 * raised for each later occurrence.
 *
 * @param array $matches Match sets captured with PREG_SET_ORDER |
 *   PREG_OFFSET_CAPTURE, so every group is a [text, offset] pair
 * @return array Attribute values indexed by attribute name
 */
protected function interpretAttribMatches($matches)
{
    $attributes = [];
    foreach ($matches as $m) {
        if (strlen($m[self::MA_SLASH][0])) {
            $this->error('unexpected bare slash', $m[self::MA_SLASH][1]);
            continue;
        }
        $name = $m[self::MA_NAME][0];
        // The name is "simple" when the simple-prefix group covers all of it
        $isSimple = isset($m[self::MA_SIMPLE_NAME])
            && (strlen($name) === strlen($m[self::MA_SIMPLE_NAME][0]));
        if (!$isSimple) {
            // We can skip these steps if we already know the name is simple
            if (!$this->ignoreErrors) {
                $this->handleAsciiErrors(
                    "\"'<=",
                    $name,
                    0,
                    strlen($name),
                    $m[self::MA_NAME][1]
                );
            }
            if (!$this->ignoreNulls) {
                $name = $this->handleNulls($m[self::MA_NAME][0], $m[self::MA_NAME][1]);
            }
            $name = strtolower($name);
        }

        $additionalAllowedChar = '';
        // From here on $isSimple describes the value rather than the name
        $isSimple = true;
        if (isset($m[self::MA_DQUOTED]) && $m[self::MA_DQUOTED][1] >= 0) {
            // Double-quoted attribute value
            $additionalAllowedChar = '"';
            $value = $m[self::MA_DQUOTED][0];
            $pos = $m[self::MA_DQUOTED][1];
            $isSimple = !strlen($m[self::MA_DQUOTED_UNSIMPLE][0]);
            if ($isSimple && strlen($m[self::MA_DQUOTED_CHARREF][0])
                && !$this->ignoreCharRefs
            ) {
                // Efficiently handle well-behaved character references
                $value = strtr($value, self::$commonEntities);
            }
        } elseif (isset($m[self::MA_SQUOTED]) && $m[self::MA_SQUOTED][1] >= 0) {
            // Single-quoted attribute value
            $additionalAllowedChar = "'";
            $value = $m[self::MA_SQUOTED][0];
            $pos = $m[self::MA_SQUOTED][1];
            $isSimple = !strlen($m[self::MA_SQUOTED_UNSIMPLE][0]);
            if ($isSimple && strlen($m[self::MA_SQUOTED_CHARREF][0])
                && !$this->ignoreCharRefs
            ) {
                // Efficiently handle well-behaved character references
                $value = strtr($value, self::$commonEntities);
            }
        } elseif (isset($m[self::MA_UNQUOTED]) && $m[self::MA_UNQUOTED][1] >= 0) {
            // Unquoted attribute value
            $value = $m[self::MA_UNQUOTED][0];
            $pos = $m[self::MA_UNQUOTED][1];
            $isSimple = !strlen($m[self::MA_UNQUOTED_UNSIMPLE][0]);
            // Search for parse errors
            if (!$this->ignoreErrors) {
                if ($value === '') {
                    // ">" in the before attribute value state is a parse error
                    $this->error('empty unquoted attribute', $pos);
                }
                if (!$isSimple) {
                    $this->handleAsciiErrors(
                        "\"'<=`",
                        $value,
                        0,
                        strlen($value),
                        $pos
                    );
                }
            }
        } else {
            $value = '';
            // reassure phan
            $pos = -1;
        }

        if ($additionalAllowedChar && !$this->ignoreErrors) {
            // After attribute value (quoted) state
            // Quoted attributes must be followed by a space, "/" or ">"
            $aavPos = $m[0][1] + strlen($m[0][0]);
            if ($aavPos < $this->length) {
                $aavChar = $this->text[$aavPos];
                // The class is HTML whitespace (tab, LF, FF, space) plus "/" and ">"
                if (!preg_match('~^[\t\n\f />]~', $aavChar)) {
                    $this->error('missing space between attributes', $aavPos);
                }
            }
        }

        if (!$isSimple && $value !== '') {
            if (!$this->ignoreNulls) {
                $value = $this->handleNulls($value, $pos);
            }
            if (!$this->ignoreCharRefs) {
                $value = $this->handleCharRefs($value, $pos, true, $additionalAllowedChar);
            }
        }
        if (isset($attributes[$name])) {
            $this->error("duplicate attribute", $m[0][1]);
        } else {
            $attributes[$name] = $value;
        }
    }
    return $attributes;
}
/**
 * Consume the attributes following a tag name together with the closing
 * bracket, then emit the matching tag event. In the text states, a tag
 * with broken attributes is emitted as literal characters instead.
 *
 * @param int $state The current state
 * @param string $tagName The normalized tag name
 * @param bool $isEndTag True if this is an end tag, false if it is a start tag
 * @param int $startPos The input position of the start of the current tag.
 * @return int The next state
 */
protected function handleAttribsAndClose($state, $tagName, $isEndTag, $startPos)
{
    $attrStart = $this->pos;
    $attribs = $this->consumeAttribs();
    $cursor = $this->pos;

    // Literal characters are emitted on EOF or "anything else" from the
    // end tag substates of the text states.
    // (spec ref 8.2.4 sections 11-19, 25-27)
    // That applies when nothing at all was consumed and we are not in the
    // data state.
    $isLiteral = ($state !== self::STATE_DATA) && ($cursor === $attrStart);

    if ($cursor >= $this->length) {
        $this->error('unexpected end of file inside tag');
        if ($isLiteral) {
            $restLength = $this->length - $startPos;
            $this->listener->characters(
                $this->text,
                $startPos,
                $restLength,
                $startPos,
                $restLength
            );
        }
        return self::STATE_EOF;
    }

    if ($isEndTag && !$this->ignoreErrors && $attribs->count()) {
        $this->error('end tag has an attribute');
    }

    // The two bracket forms start with different characters, so the order in
    // which they are tested does not matter
    if ($this->text[$cursor] === '>') {
        $selfClose = false;
        $cursor++;
    } elseif ($this->text[$cursor] === '/' && $this->text[$cursor + 1] === '>') {
        $selfClose = true;
        $cursor += 2;
    } elseif ($isLiteral) {
        // Emit everything from the start of the tag up to the attribute
        // position as text, and stay in the current state
        $literalLength = $attrStart - $startPos;
        $this->listener->characters(
            $this->text,
            $startPos,
            $literalLength,
            $startPos,
            $literalLength
        );
        return $state;
    } else {
        $this->fatal('failed to find an already-matched ">"');
        // @phan-suppress-next-line PhanPluginUnreachableCode False positive that var is not defined
        $selfClose = false;
    }

    $this->pos = $cursor;
    $tagLength = $cursor - $startPos;

    if (!$isEndTag) {
        $this->listener->startTag($tagName, $attribs, $selfClose, $startPos, $tagLength);
        return self::STATE_DATA;
    }

    if ($selfClose) {
        $this->error('self-closing end tag');
    }
    $this->listener->endTag($tagName, $startPos, $tagLength);
    return self::STATE_DATA;
}
/**
 * Process input text in the PLAINTEXT state.
 *
 * Everything from the current position to the end of the input is emitted
 * as raw text with character references left unexpanded, and the input
 * position is moved to the end of the input.
 *
 * @return int The next state index, always STATE_EOF
 */
protected function plaintextState()
{
    $this->emitRawTextRange(true, $this->pos, $this->length - $this->pos);
    // The whole remainder has been consumed, so record that in the position,
    // as the other states do before they return STATE_EOF.
    $this->pos = $this->length;
    return self::STATE_EOF;
}
/**
 * Process input text in the script data state.
 *
 * A single regular expression walks the script data, script data escaped
 * and script data double escaped states and stops either at the end of the
 * input or at an appropriate end tag. The text before that point is emitted
 * with character references left unexpanded.
 *
 * If no appropriate end tag is set, nothing can terminate the state, so the
 * whole remainder of the input is emitted as text.
 *
 * @return int The next state index
 */
protected function scriptDataState()
{
    if ($this->appropriateEndTag === null) {
        // Emit the remaining input rather than discarding it, matching what
        // textElementState() does in the same situation.
        $this->emitRawTextRange(true, $this->pos, $this->length - $this->pos);
        $this->pos = $this->length;
        return self::STATE_EOF;
    }

    $re = <<<REGEX
~
(?: # Outer loop start
# Script data state
# Stop iteration if we previously matched an appropriate end tag.
# This is a conditional subpattern: if capture 1 previously
# matched, then run the pattern /$./ which always fails.
(?(1) $. )
.*?
(?:
$ |
(
</ {$this->appropriateEndTag}
# If we hit the "anything else" case in the script data
# end tag name state, don't exit
(?= [\t\n\f />] )
) | # 1. Appropriate end tag
<!--
# Script data escaped dash dash state
# Hyphens at this point are consumed without a state transition
# and so are not part of a comment-end.
-*+
(?: # Inner loop start
# Script data escaped state
.*?
(?:
$ |
# Stop at, but do not consume, comment-close or end tag.
# This causes the inner loop to exit, since restarting the
# inner loop at this input position will cause the loop
# body to match zero characters. Repeating a zero-character
# match causes the repeat to terminate.
(?= --> ) |
(?= </ {$this->appropriateEndTag} [\t\n\f />] ) |
<script [\t\n\f />]
# Script data double escaped state
.*?
(?:
$ |
# Stop at, but do not consume, comment-close
(?= --> ) |
</script [\t\n\f />]
)
)
)*
# Consume the comment close which exited the inner loop, if any
(?: --> )?
)
)*+
~xsiA
REGEX;

    do {
        $count = preg_match($re, $this->text, $m, 0, $this->pos);
        if ($count === false) {
            $this->throwPregError();
        } elseif (!$count) {
            $this->fatal('unexpected regex failure: this pattern can match zero characters');
        }

        $startPos = $this->pos;
        $matchLength = strlen($m[0]);
        // Capture 1 is only present when the match stopped at an end tag
        $endTagLength = isset($m[1]) ? strlen($m[1]) : 0;
        $textLength = $matchLength - $endTagLength;
        $this->emitRawTextRange(true, $startPos, $textLength);
        $this->pos = $startPos + $matchLength;
        $tagStartPos = $startPos + $textLength;

        if ($endTagLength) {
            $nextState = $this->handleAttribsAndClose(
                self::STATE_SCRIPT_DATA,
                $this->appropriateEndTag,
                true,
                $tagStartPos
            );
        } else {
            // No end tag was matched, so the match ran to the end of the input
            $nextState = self::STATE_EOF;
        }
    } while ($nextState === self::STATE_SCRIPT_DATA);
    return $nextState;
}
/**
 * Report a parse error to the listener, unless errors are being ignored.
 *
 * @param string $text The error message
 * @param int|null $pos The error position, or null to use the current position
 */
protected function error($text, $pos = null)
{
    if ($this->ignoreErrors) {
        return;
    }
    $this->listener->error($text, $pos ?? $this->pos);
}
/**
 * Abort by throwing a TokenizerError whose message is prefixed with the
 * class name. This is used for API errors and assertion-like checks.
 *
 * @param string $text The error message
 * @throws TokenizerError
 * @return never
 */
protected function fatal($text)
{
    $message = __CLASS__ . ": " . $text;
    throw new TokenizerError($message);
}
/**
 * Translate preg_last_error() into a message and throw it as a
 * TokenizerError. Call this when preg_match() or a similar function has
 * returned false.
 *
 * Notes for users:
 *
 * - PCRE internal error: may be due to JIT stack space exhaustion prior
 *   to PHP 7, due to excessive recursion. Increase stack space.
 *
 * - pcre.backtrack_limit exhausted: The backtrack limit should be at least
 *   double the input size, the defaults are way too small. Increase it in
 *   configuration.
 *
 * @return never
 */
protected function throwPregError()
{
    // Error codes with a dedicated message. Every other code, including the
    // two bad-UTF-8 codes, gets the generic message below.
    $messages = [
        PREG_NO_ERROR => "PCRE returned false but gave PREG_NO_ERROR",
        PREG_INTERNAL_ERROR => "PCRE internal error",
        PREG_BACKTRACK_LIMIT_ERROR => "pcre.backtrack_limit exhausted",
        PREG_RECURSION_LIMIT_ERROR => "pcre.recursion_limit exhausted",
    ];
    // Only registered when the running PHP defines the constant
    if (defined('PREG_JIT_STACKLIMIT_ERROR')) {
        $messages[PREG_JIT_STACKLIMIT_ERROR] = "PCRE JIT stack space exhausted";
    }

    $code = preg_last_error();
    $msg = $messages[$code] ?? "PCRE unexpected error";

    throw new TokenizerError(__CLASS__ . ": $msg");
}
}
File Metadata
Details
Attached
Mime Type
text/x-php
Expires
Sat, May 16, 22:40 (2 h, 10 m)
Storage Engine
local-disk
Storage Format
Raw Data
Storage Handle
d4/18/166682e8bc7b34e96f92abfcde05
Default Alt Text
Tokenizer.php (53 KB)
Attached To
Mode
rMWPROD MediaWiki Production
Attached
Detach File
Event Timeline
Log In to Comment