Page Menu
Home
WickedGov Phorge
Search
Configure Global Search
Log In
Files
F2750745
TranslatablePageParser.php
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Flag For Later
Award Token
Size
6 KB
Referenced Files
None
Subscribers
None
TranslatablePageParser.php
View Options
<?php
declare
(
strict_types
=
1
);
namespace
MediaWiki\Extension\Translate\PageTranslation
;
use
MediaWiki\Extension\Translate\Utilities\ParsingPlaceholderFactory
;
/**
* Generates ParserOutput from text or removes all tags from a text.
*
* @author Niklas Laxström
* @license GPL-2.0-or-later
* @since 2020.08
*/
class
TranslatablePageParser
{
private
ParsingPlaceholderFactory
$placeholderFactory
;
/**
 * @param ParsingPlaceholderFactory $placeholderFactory Provides the placeholder strings
 *  (via make()) that this parser substitutes for nowiki blocks, translate tag pairs and
 *  translation units while it works on the text.
 */
public function __construct( ParsingPlaceholderFactory $placeholderFactory ) {
	$this->placeholderFactory = $placeholderFactory;
}
/**
 * Tell whether the text has an opening or closing translate tag outside of nowiki blocks.
 *
 * Only the presence of `<translate` or `</translate` followed by a space or `>` is
 * checked; the tags are not required to be balanced.
 */
public function containsMarkup( string $text ): bool {
	// Hide nowiki blocks so that tags quoted inside them are not counted
	$nowikiHolders = [];
	$armoured = $this->armourNowiki( $nowikiHolders, $text );

	$result = preg_match( '~</?translate[ >]~', $armoured );

	return $result !== 0;
}
/**
 * Strip every opening and closing translate tag and every translation unit marker from
 * the text, using the same whitespace rules as the regular parsing. No actual parsing is
 * attempted, so unbalanced tags are handled too. Content of nowiki blocks is left alone.
 */
public function cleanupTags( string $text ): string {
	$nowikiHolders = [];
	$cleaned = $this->armourNowiki( $nowikiHolders, $text );

	// Opening tags take one following newline with them, closing tags one preceding newline
	$cleaned = preg_replace( '~<translate( nowrap)?>\n?~s', '', $cleaned );
	$cleaned = preg_replace( '~\n?</translate>~s', '', $cleaned );

	// Markers: first the ones trailing a header line (the separating space goes too),
	// then all the rest together with one following newline
	$ic = preg_quote( TranslationUnit::UNIT_MARKER_INVALID_CHARS, '~' );
	$cleaned = preg_replace( "~(^=.*=) <!--T:[^$ic]+-->$~um", '\1', $cleaned );
	$cleaned = preg_replace( "~<!--T:[^$ic]+-->[\n]?~um", '', $cleaned );

	// Remove variables: treat the whole text as a single unit and take its translatable text
	$wholeText = new TranslationUnit( $cleaned );
	$cleaned = $wholeText->getTextForTrans();

	return $this->unarmourNowiki( $nowikiHolders, $cleaned );
}
/**
 * Parse the translate tags of a page and split their content into translation units.
 *
 * Nowiki blocks are armoured first, so translate tags inside them are ignored. Every
 * <translate>...</translate> pair is then replaced in the text with a placeholder, and the
 * content between the tags is handed to parseSection().
 *
 * @param string $text
 * @return ParserOutput Constructed from: the text with one placeholder per tag pair, the
 *  Section objects keyed by those placeholders, and the translation units returned by
 *  parseSection() keyed by their own placeholders.
 * @throws ParsingFailure If translate tags are nested or unbalanced, or if parsing the
 *  content of a tag pair fails.
 */
public function parse( string $text ): ParserOutput {
	$nowiki = [];
	$text = $this->armourNowiki( $nowiki, $text );

	$sections = [];
	$tagPlaceHolders = [];
	$re = '~(<translate(?: nowrap)?>)(.*?)</translate>~s';

	while ( true ) {
		$matches = [];
		$ok = preg_match( $re, $text, $matches, PREG_OFFSET_CAPTURE );

		if ( $ok === 0 || $ok === false ) {
			// No match or failure. Leftover tags, if any, are reported after the loop.
			break;
		}

		$contentWithTags = $matches[0][0];
		$contentWithoutTags = $matches[2][0];
		// Byte offsets in $text of the whole match, tags included
		$offsetStart = $matches[0][1];
		$offsetEnd = $offsetStart + strlen( $contentWithTags );

		// Replace the whole match with a placeholder
		$ph = $this->placeholderFactory->make();
		$text = substr( $text, 0, $offsetStart ) . $ph . substr( $text, $offsetEnd );

		// The check must run on the armoured content, so that an opening tag quoted inside
		// a nowiki block is not mistaken for nesting.
		if ( preg_match( '~<translate( nowrap)?>~', $contentWithoutTags ) !== 0 ) {
			throw new ParsingFailure(
				'Nested tags',
				// Restore nowiki blocks so that the error shows the text as it was written
				[ 'pt-parse-nested', $this->unarmourNowiki( $nowiki, $contentWithoutTags ) ]
			);
		}

		$openTag = $matches[1][0];
		$canWrap = $openTag !== '<translate nowrap>';

		// Parse the content inside the tags
		$contentWithoutTags = $this->unarmourNowiki( $nowiki, $contentWithoutTags );
		$parse = $this->parseSection( $contentWithoutTags, $canWrap );

		// Update list of sections and the template with the results
		$sections += $parse['sections'];
		$tagPlaceHolders[$ph] = new Section( $openTag, $parse['template'], '</translate>' );
	}

	// Every complete pair has been replaced by now, so any remaining tag is unmatched
	if ( preg_match( '~<translate( nowrap)?>~', $text ) !== 0 ) {
		throw new ParsingFailure(
			'Unmatched opening tag',
			[ 'pt-parse-open', $this->getTemplateForError( $text, $tagPlaceHolders, $nowiki ) ]
		);
	}

	if ( str_contains( $text, '</translate>' ) ) {
		throw new ParsingFailure(
			"Unmatched closing tag",
			[ 'pt-parse-close', $this->getTemplateForError( $text, $tagPlaceHolders, $nowiki ) ]
		);
	}

	$text = $this->unarmourNowiki( $nowiki, $text );

	return new ParserOutput( $text, $tagPlaceHolders, $sections );
}

/**
 * Make a readable version of the page template for error messages: each successfully
 * parsed tag pair is shown as "[...]" and nowiki blocks are restored to their original text.
 *
 * @param string $template Page text containing placeholders
 * @param array $tagPlaceHolders Map whose keys are the placeholders of parsed tag pairs
 * @param array $nowiki Map of placeholder => original nowiki block
 * @return string
 */
private function getTemplateForError(
	string $template,
	array $tagPlaceHolders,
	array $nowiki
): string {
	$pretty = str_replace( array_keys( $tagPlaceHolders ), '[...]', $template );

	return $this->unarmourNowiki( $nowiki, $pretty );
}
/**
 * Split the content of one <translate> tag pair into translation units. Units are
 * separated by two or more newlines. Leading, trailing and separating whitespace is kept
 * in the template and is not part of any translation unit.
 *
 * @internal
 * @param string $text Content found between the tags
 * @param bool $canWrap Passed on to every created unit with setCanWrap()
 * @return array Map with two keys: 'template' is the text where each unit is replaced by
 *  a placeholder, 'sections' is the list of units keyed by those placeholders.
 */
public function parseSection( string $text, bool $canWrap ): array {
	// The delimiters are captured too, so the whitespace pieces come back in order
	$pieces = preg_split(
		'~(^\s*|\s*\n\n\s*|\s*$)~',
		$text,
		-1,
		PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE
	);

	// Content without a single newline is flagged as inline on each of its units
	$isInline = preg_match( '~\n~', $text ) === 0;

	$units = [];
	$template = '';

	foreach ( $pieces as $piece ) {
		if ( trim( $piece ) === '' ) {
			// Whitespace only: belongs to the template as is
			$template .= $piece;
			continue;
		}

		$ph = $this->placeholderFactory->make();

		$unit = $this->parseUnit( $piece );
		$unit->setIsInline( $isInline );
		$unit->setCanWrap( $canWrap );

		$units[$ph] = $unit;
		$template .= $ph;
	}

	return [
		'template' => $template,
		'sections' => $units,
	];
}
/**
 * Build a TranslationUnit from one chunk of text, extracting its unit marker if present.
 *
 * A chunk without a `<!--T:id-->` marker gets TranslationUnit::NEW_UNIT_ID as its id.
 * (The earlier documentation of this method gives that value as -1 and says it is later
 * replaced with a real value; neither can be confirmed from this class alone.) When a
 * marker is present, it is removed from the content and its id is used for the unit.
 *
 * @internal
 * @param string $content
 * @return TranslationUnit
 * @throws ParsingFailure If there is more than one marker, if the marker is not in one of
 *  the two supported positions, or if only whitespace is left after removing the marker.
 */
public function parseUnit( string $content ): TranslationUnit {
	$markerRe = '~<!--T:(.*?)-->~';
	$found = [];
	$markerCount = preg_match_all( $markerRe, $content, $found, PREG_SET_ORDER );

	if ( $markerCount > 1 ) {
		throw new ParsingFailure(
			'Multiple translation unit markers',
			[ 'pt-shake-multiple', $content ]
		);
	}

	if ( $markerCount !== 1 ) {
		// If no id given in the source, default to a new section id
		return new TranslationUnit( $content, TranslationUnit::NEW_UNIT_ID );
	}

	// Exactly one match: index 1 holds the captured id
	$id = $found[0][1];

	// Currently handle only these two standard places.
	// Is this too strict?
	// Normal sections: marker at the very start, followed by a space or a newline
	$content = preg_replace( '~^<!--T:(.*?)-->( |\n)~', '', $content );
	// Sections with title: marker at the end of a line, preceding whitespace included
	$content = preg_replace( '~\s*<!--T:(.*?)-->$~m', '', $content );

	// A marker that survived both removals was somewhere else
	if ( preg_match( $markerRe, $content ) === 1 ) {
		throw new ParsingFailure(
			'Translation unit marker is in unsupported position',
			[ 'pt-shake-position', $content ]
		);
	}

	if ( trim( $content ) === '' ) {
		throw new ParsingFailure(
			'Translation unit has no content besides marker',
			[ 'pt-shake-empty', $id ]
		);
	}

	return new TranslationUnit( $content, $id );
}
/**
 * Replace every <nowiki>...</nowiki> block of the text with a placeholder, so that later
 * processing does not see what is inside. Identical blocks share a single placeholder.
 *
 * @internal
 * @param array &$holders Receives one entry per placeholder: placeholder => the original
 *  block, tags included. Give the same array to unarmourNowiki() to undo the replacement.
 * @param string $text
 * @return string The text with the blocks replaced
 */
public function armourNowiki( array &$holders, string $text ): string {
	$pattern = '~(<nowiki>)(.*?)(</nowiki>)~s';
	$found = [];

	// Each round removes all copies of the first remaining block, until none is left
	while ( preg_match( $pattern, $text, $found ) === 1 ) {
		$block = $found[0];
		$ph = $this->placeholderFactory->make();

		$holders[$ph] = $block;
		$text = str_replace( $block, $ph, $text );
	}

	return $text;
}
/**
 * Put back the text that was replaced with placeholders.
 *
 * @internal
 * @param array $holders Map of placeholder => original text, as filled by armourNowiki()
 * @param string $text
 * @return string The text with every placeholder found in $holders substituted
 */
public function unarmourNowiki( array $holders, string $text ): string {
	$restored = strtr( $text, $holders );

	return $restored;
}
}
File Metadata
Details
Attached
Mime Type
text/x-php
Expires
Fri, Jul 3, 17:03 (11 h, 34 m)
Storage Engine
local-disk
Storage Format
Raw Data
Storage Handle
f6/37/3806daa6689f4d2095d9ce6a17d4
Default Alt Text
TranslatablePageParser.php (6 KB)
Attached To
Mode
rMWPROD MediaWiki Production
Attached
Detach File
Event Timeline
Log In to Comment