Page Menu
Home
WickedGov Phorge
Search
Configure Global Search
Log In
Files
F1431897
ParserFileProcessingHookHandlers.php
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Flag For Later
Award Token
Size
14 KB
Referenced Files
None
Subscribers
None
ParserFileProcessingHookHandlers.php
View Options
<?php
namespace
PageImages\Hooks
;
use
Exception
;
use
FormatMetadata
;
use
MediaWiki\Config\Config
;
use
MediaWiki\Context\DerivativeContext
;
use
MediaWiki\FileRepo\File\File
;
use
MediaWiki\FileRepo\RepoGroup
;
use
MediaWiki\Hook\ParserAfterTidyHook
;
use
MediaWiki\Hook\ParserModifyImageHTMLHook
;
use
MediaWiki\Hook\ParserTestGlobalsHook
;
use
MediaWiki\Http\HttpRequestFactory
;
use
MediaWiki\Linker\LinksMigration
;
use
MediaWiki\MainConfigNames
;
use
MediaWiki\Page\PageReference
;
use
MediaWiki\Parser\Parser
;
use
MediaWiki\Parser\ParserOutput
;
use
MediaWiki\Title\TitleFactory
;
use
PageImages\PageImageCandidate
;
use
PageImages\PageImages
;
use
RuntimeException
;
use
Wikimedia\ObjectCache\WANObjectCache
;
use
Wikimedia\Rdbms\IConnectionProvider
;
/**
* Handlers for parser hooks.
*
* The ParserModifyImageHTML hook handler collects candidate images, and marks
* them with a temporary HTML comment in the parser output.
*
* The ParserAfterTidy hook handler processes the candidate images, identifying
* the best image and the best free image. If $wgPageImagesLeadSectionOnly is
* set, images following the first section header are discarded. It removes the
* temporary comments and saves the resulting best images to page_props.
*
* The various query interfaces will retrieve the lead image from page_props.
*
* @license WTFPL
* @author Max Semenik
* @author Thiemo Kreuz
*/
class
ParserFileProcessingHookHandlers
implements
ParserAfterTidyHook
,
ParserModifyImageHTMLHook
,
ParserTestGlobalsHook
{
/**
 * Matches the temporary marker comment that onParserModifyImageHTML() appends
 * after each candidate image; the capture group is the candidate's index.
 */
private const CANDIDATE_REGEX = '/<!--MW-PAGEIMAGES-CANDIDATE-([0-9]+)-->/';

/** @var Config Source of the PageImages* settings and core thumbnail settings */
protected Config $config;

/** @var RepoGroup Used by isImageFree() to look files up by name */
private RepoGroup $repoGroup;

/** @var WANObjectCache Cache that getDenylist() stores the compiled denylist in */
private WANObjectCache $mainWANObjectCache;

/** @var HttpRequestFactory Used by getUrlDenylist() to fetch remote denylist pages */
private HttpRequestFactory $httpRequestFactory;

/** @var IConnectionProvider Used by getDbDenylist() to obtain a replica connection */
private IConnectionProvider $connectionProvider;

/** @var TitleFactory Used to build titles for denylist pages and entries */
private TitleFactory $titleFactory;

/** @var LinksMigration Supplies the pagelinks field names and query info */
private LinksMigration $linksMigration;

/**
 * Stores the injected services; no other work is done here.
 *
 * @param Config $config
 * @param RepoGroup $repoGroup
 * @param WANObjectCache $mainWANObjectCache
 * @param HttpRequestFactory $httpRequestFactory
 * @param IConnectionProvider $connectionProvider
 * @param TitleFactory $titleFactory
 * @param LinksMigration $linksMigration
 */
public function __construct(
	Config $config,
	RepoGroup $repoGroup,
	WANObjectCache $mainWANObjectCache,
	HttpRequestFactory $httpRequestFactory,
	IConnectionProvider $connectionProvider,
	TitleFactory $titleFactory,
	LinksMigration $linksMigration
) {
	$this->config = $config;
	$this->repoGroup = $repoGroup;
	$this->mainWANObjectCache = $mainWANObjectCache;
	$this->httpRequestFactory = $httpRequestFactory;
	$this->connectionProvider = $connectionProvider;
	$this->titleFactory = $titleFactory;
	$this->linksMigration = $linksMigration;
}
/**
 * ParserTestGlobals hook handler. Adds values for this extension's settings
 * to the globals array.
 *
 * The array union (+=) only adds keys that are missing, so any value already
 * present in $globals is left untouched.
 *
 * @param array &$globals
 */
public function onParserTestGlobals( &$globals ) {
	$scoreTables = [
		'width' => [ 200 => 10, 1000 => 20 ],
		'position' => [],
		'ratio' => [],
		'galleryImageWidth' => [],
	];

	$defaults = [
		'wgPageImagesScores' => $scoreTables,
		'wgPageImagesLeadSectionOnly' => true,
	];

	$globals += $defaults;
}
/**
 * ParserModifyImageHTML hook handler. Records the image as a candidate in the
 * parser output and appends a marker comment carrying the candidate's index,
 * so that onParserAfterTidy() can later tell where in the text it appeared.
 *
 * Nothing happens when the parser has no page or the page's namespace is not
 * one this extension processes.
 *
 * @param Parser $parser
 * @param File $file
 * @param array $params
 * @param string &$html
 */
public function onParserModifyImageHTML(
	Parser $parser,
	File $file,
	array $params,
	string &$html
): void {
	$page = $parser->getPage();
	if ( !( $page && $this->processThisTitle( $page ) ) ) {
		return;
	}

	// Fill in $params['handler']['width'] when the wikitext did not give one.
	$this->calcWidth( $params, $file );

	$candidate = PageImageCandidate::newFromFileAndParams( $file, $params );
	$index = $this->addPageImageCandidateToParserOutput( $candidate, $parser->getOutput() );

	$html .= "<!--MW-PAGEIMAGES-CANDIDATE-$index-->";
}
/**
 * ParserAfterTidy hook handler. Strips the candidate marker comments from the
 * text, keeps only candidates that are in the lead section (when
 * PageImagesLeadSectionOnly is set), and stores the best image and best free
 * image as page properties.
 *
 * @param Parser $parser
 * @param string &$text
 */
public function onParserAfterTidy( $parser, &$text ) {
	$parserOutput = $parser->getOutput();
	$allImages = $parserOutput->getExtensionData( 'pageImages' );
	if ( !$allImages ) {
		return;
	}

	// Byte offset of the first section edit link, or false when the whole
	// text counts as lead (setting disabled, or no edit link found).
	$leadEndPos = $this->config->get( 'PageImagesLeadSectionOnly' )
		? strpos( $text, '<mw:editsection' )
		: false;

	// Find and remove our special comments
	$images = [];
	$collectAndStrip = static function ( $m ) use ( $allImages, &$images, $leadEndPos ) {
		// With PREG_OFFSET_CAPTURE every match entry is [ text, offset ].
		[ $marker, $idCapture ] = $m;
		$id = intval( $idCapture[0] );

		$afterLead = $leadEndPos !== false && $marker[1] >= $leadEndPos;
		if ( !$afterLead && isset( $allImages[$id] ) ) {
			$images[] = PageImageCandidate::newFromArray( $allImages[$id] );
		}

		return '';
	};
	$text = preg_replace_callback(
		self::CANDIDATE_REGEX,
		$collectAndStrip,
		$text,
		-1,
		$count,
		PREG_OFFSET_CAPTURE
	);

	[ $bestImageName, $freeImageName ] = $this->findBestImages( $images );

	if ( $freeImageName ) {
		$freeProp = PageImages::getPropName( true );
		$parserOutput->setPageProperty( $freeProp, $freeImageName );
	}

	// Only store the image if it's not free. Free image (if any) has already been stored above.
	if ( $bestImageName && $bestImageName !== $freeImageName ) {
		$bestProp = PageImages::getPropName( false );
		$parserOutput->setPageProperty( $bestProp, $bestImageName );
	}

	// Strip comments from indicators (T298930)
	foreach ( $parserOutput->getIndicators() as $id => $value ) {
		$stripped = preg_replace( self::CANDIDATE_REGEX, '', $value );
		if ( $stripped === $value ) {
			continue;
		}
		$parserOutput->setIndicator( $id, $stripped );
	}

	// We may have comments in TOC data - Parser::cleanupTocLine strips them for us.
}
/**
 * Pick the highest-scoring image and the highest-scoring free image from a
 * list of candidates.
 *
 * Each candidate is scored with its position in the list; when a file occurs
 * more than once its highest score is used. Only files with a score above
 * zero can win.
 *
 * @param PageImageCandidate[] $images
 * @return array{string|false,string|false} The best image, and the best free image
 */
private function findBestImages( array $images ) {
	if ( !$images ) {
		return [ false, false ];
	}

	// Determine the image scores, keyed by file name
	$scores = [];
	foreach ( array_values( $images ) as $position => $image ) {
		$score = $this->getScore( $image, $position );
		$fileName = $image->getFileName();
		$previous = $scores[$fileName] ?? -1;
		$scores[$fileName] = max( $previous, $score );
	}

	$bestImageName = false;
	$freeImageName = false;

	foreach ( $scores as $name => $score ) {
		if ( !( $score > 0 ) ) {
			continue;
		}

		if ( !$bestImageName || $score > $scores[$bestImageName] ) {
			$bestImageName = $name;
		}

		// The score comparison comes first so that isImageFree() is only
		// called for files that would actually replace the current pick.
		$beatsCurrentFree = !$freeImageName || $score > $scores[$freeImageName];
		if ( $beatsCurrentFree && $this->isImageFree( $name ) ) {
			$freeImageName = $name;
		}
	}

	return [ $bestImageName, $freeImageName ];
}
/**
 * Appends the serialized $image to the 'pageImages' extension data of
 * $parserOutput.
 *
 * @param PageImageCandidate $image
 * @param ParserOutput $parserOutput
 * @return int Number of stored candidates minus one after the append
 */
private function addPageImageCandidateToParserOutput(
	PageImageCandidate $image,
	ParserOutput $parserOutput
) {
	$stored = $parserOutput->getExtensionData( 'pageImages' );
	if ( !$stored ) {
		$stored = [];
	}

	$stored[] = $image->jsonSerialize();
	$parserOutput->setExtensionData( 'pageImages', $stored );

	$lastIndex = count( $stored ) - 1;
	return $lastIndex;
}
/**
 * Returns true if data for this title should be saved, i.e. if the page's
 * namespace is listed in the PageImagesNamespaces setting.
 *
 * @param PageReference $pageReference
 *
 * @return bool
 */
private function processThisTitle( PageReference $pageReference ) {
	// Function-level static: the lookup table is built on the first call and
	// reused by every later call, whatever the configuration is by then.
	static $flipped = null;
	if ( $flipped === null ) {
		$namespaces = $this->config->get( 'PageImagesNamespaces' );
		$flipped = array_flip( $namespaces );
	}

	$namespace = $pageReference->getNamespace();
	return isset( $flipped[$namespace] );
}
/**
 * Estimates the displayed image width when it was not given explicitly and
 * stores it in $params['handler']['width']. We don't follow the core size
 * calculation algorithm precisely because it's not required and editor's
 * intentions are more important than the precise number.
 *
 * Order of preference:
 *  - an existing handler width is left alone;
 *  - a handler height is scaled by the file's aspect ratio;
 *  - thumbnail/thumb/frameless frames use the default user thumbnail size,
 *    falling back to 250;
 *  - anything else uses the file's own width.
 *
 * @param array[] &$params
 * @param File $file
 */
private function calcWidth( array &$params, File $file ) {
	if ( isset( $params['handler']['width'] ) ) {
		return;
	}

	if ( isset( $params['handler']['height'] ) && $file->getHeight() > 0 ) {
		$scale = $params['handler']['height'] / $file->getHeight();
		$params['handler']['width'] = $file->getWidth() * $scale;
		return;
	}

	$usesThumbSize = isset( $params['frame']['thumbnail'] )
		|| isset( $params['frame']['thumb'] )
		|| isset( $params['frame']['frameless'] );

	if ( !$usesThumbSize ) {
		$params['handler']['width'] = $file->getWidth();
		return;
	}

	$thumbLimits = $this->config->get( MainConfigNames::ThumbLimits );
	$defaultUserOptions = $this->config->get( MainConfigNames::DefaultUserOptions );
	$params['handler']['width'] = $thumbLimits[$defaultUserOptions['thumbsize']] ?? 250;
}
/**
 * Returns score for image, the more the better, if it is less than zero,
 * the image shouldn't be used for anything.
 *
 * The score is the sum of a width score (handler width for standalone images,
 * full width for gallery images), an optional position score, a ratio score
 * and a bonus of 1000 for class="pageimage". Images with
 * class="notpageimage" or listed in the denylist always score -1000.
 *
 * @param PageImageCandidate $image Candidate image to score
 * @param int $position Image order on page
 *
 * @return float|int
 */
protected function getScore( PageImageCandidate $image, $position ) {
	// Split the frame class attribute on runs of whitespace. preg_split()
	// returns false on failure; treat that as "no classes" instead of passing
	// false on to in_array().
	$classes = preg_split( '/\s+/', $image->getFrameClass(), -1, PREG_SPLIT_NO_EMPTY ) ?: [];

	// Exclude images with class="notpageimage"
	if ( in_array( 'notpageimage', $classes, true ) ) {
		return -1000;
	}

	$pageImagesScores = $this->config->get( 'PageImagesScores' );

	$handlerWidth = $image->getHandlerWidth();
	if ( $handlerWidth ) {
		// Standalone image
		$score = $this->scoreFromTable( $handlerWidth, $pageImagesScores['width'] );
	} else {
		// From gallery
		$score = $this->scoreFromTable(
			$image->getFullWidth(),
			$pageImagesScores['galleryImageWidth']
		);
	}

	if ( isset( $pageImagesScores['position'][$position] ) ) {
		$score += $pageImagesScores['position'][$position];
	}

	// The ratio table is keyed by width/height multiplied by ten.
	$ratio = intval( $this->getRatio( $image ) * 10 );
	$score += $this->scoreFromTable( $ratio, $pageImagesScores['ratio'] );

	// T91683: Prefer images with class="pageimage". We're simply adding to the current score rather than returning
	// earlier, so that the algorithm still helps decide which image to use when multiple have this class.
	if ( in_array( 'pageimage', $classes, true ) ) {
		$score += 1000;
	}

	// A denylisted file overrides everything computed above.
	$denylist = $this->getDenylist();
	if ( isset( $denylist[$image->getFileName()] ) ) {
		$score = -1000;
	}

	return $score;
}
/**
 * Returns score based on table of ranges.
 *
 * The table is sorted by key, and the score of the first entry whose key is
 * greater than or equal to $value is returned. If $value exceeds every key,
 * the score of the largest key is returned; an empty table yields 0.
 *
 * @param int|float $value The number that the various bounds are compared against
 * to calculate the score
 * @param float[] $scores Table of scores for different ranges of $value
 *
 * @return float
 */
protected function scoreFromTable( $value, array $scores ) {
	// The loop stops at the *first* match, and therefore *requires* the input array keys to be
	// in increasing order.
	ksort( $scores, SORT_NUMERIC );

	$picked = 0;
	foreach ( $scores as $upperBoundary => $candidate ) {
		$picked = $candidate;
		if ( $upperBoundary >= $value ) {
			break;
		}
	}

	if ( is_numeric( $picked ) ) {
		return (float)$picked;
	}

	// Misconfigured table: warn, then still return the float cast.
	wfLogWarning( 'The PageImagesScores setting must only contain numeric values!' );
	return (float)$picked;
}
/**
 * Check whether image's copyright allows it to be used freely.
 *
 * A file that cannot be found is treated as free.
 *
 * @param string $fileName Name of the image file
 * @return bool
 */
protected function isImageFree( $fileName ) {
	$file = $this->repoGroup->findFile( $fileName );
	if ( !$file ) {
		return true;
	}

	// Process copyright metadata from CommonsMetadata, if present.
	// Image is considered free if the value is '0' or unset.
	$metadata = $this->fetchFileMetadata( $file );
	return empty( $metadata['NonFree']['value'] );
}
/**
 * Fetch the extended metadata of a file through FormatMetadata, using a
 * derived context whose language is fixed to English.
 *
 * @param File $file File to fetch metadata from
 * @return array
 */
protected function fetchFileMetadata( $file ) {
	$formatter = new FormatMetadata();
	$englishContext = new DerivativeContext( $formatter->getContext() );

	// we don't care about the language, so avoid splitting the cache by selecting English
	$englishContext->setLanguage( 'en' );

	// we don't care about the language, and specifying singleLanguage is slightly faster
	$formatter->setSingleLanguage( true );
	$formatter->setContext( $englishContext );

	return $formatter->fetchExtendedMetadata( $file );
}
/**
 * Returns the width/height ratio computed from the candidate's full width
 * and full height, or 0 when either dimension is not positive.
 *
 * @param PageImageCandidate $image
 *
 * @return float|int
 */
protected function getRatio( PageImageCandidate $image ) {
	$fullWidth = $image->getFullWidth();
	$fullHeight = $image->getFullHeight();

	if ( !( $fullWidth > 0 && $fullHeight > 0 ) ) {
		return 0;
	}

	return $fullWidth / $fullHeight;
}
/**
 * Returns a list of images denylisted from influencing this extension's output.
 *
 * The list is built from every source in the PageImagesDenylist setting
 * ('db' and 'url' types) and obtained through the WAN cache under the
 * 'pageimages-denylist' key, with PageImagesDenylistExpiry as the TTL.
 *
 * @return int[] Flipped associative array in format "image DB key" => int
 * @throws Exception When a configured source has an unrecognized type
 */
protected function getDenylist() {
	$cache = $this->mainWANObjectCache;
	$cacheKey = $cache->makeKey( 'pageimages-denylist' );
	$ttl = $this->config->get( 'PageImagesDenylistExpiry' );

	$buildDenylist = function () {
		$names = [];

		foreach ( $this->config->get( 'PageImagesDenylist' ) as $source ) {
			switch ( $source['type'] ) {
				case 'db':
					$fromDb = $this->getDbDenylist( $source['db'], $source['page'] );
					$names = array_merge( $names, $fromDb );
					break;
				case 'url':
					$fromUrl = $this->getUrlDenylist( $source['url'] );
					$names = array_merge( $names, $fromUrl );
					break;
				default:
					throw new RuntimeException(
						"unrecognized image denylist type '{$source['type']}'"
					);
			}
		}

		// Flip so that callers can test membership with isset().
		return array_flip( $names );
	};

	return $cache->getWithSetCallback( $cacheKey, $ttl, $buildDenylist );
}
/**
 * Returns list of images linked by the given denylist page.
 *
 * Looks up the page ID of $page on the given database, then selects the
 * titles of all File-namespace links from that page. Returns an empty list
 * when the title is invalid, cannot exist, or has no page row.
 *
 * @param string|false $dbName Database name or false for current database
 * @param string $page
 *
 * @return string[]
 */
private function getDbDenylist( $dbName, $page ) {
	$title = $this->titleFactory->newFromText( $page );
	if ( !$title || !$title->canExist() ) {
		return [];
	}

	$dbr = $this->connectionProvider->getReplicaDatabase( $dbName );

	$pageConds = [
		'page_namespace' => $title->getNamespace(),
		'page_title' => $title->getDBkey(),
	];
	$id = $dbr->newSelectQueryBuilder()
		->select( 'page_id' )
		->from( 'page' )
		->where( $pageConds )
		->caller( __METHOD__ )->fetchField();
	if ( !$id ) {
		return [];
	}

	// Field names and join info for pagelinks come from LinksMigration.
	[ $blNamespace, $blTitle ] = $this->linksMigration->getTitleFields( 'pagelinks' );
	$queryInfo = $this->linksMigration->getQueryInfo( 'pagelinks' );

	$linkConds = [
		'pl_from' => (int)$id,
		$blNamespace => NS_FILE,
	];

	return $dbr->newSelectQueryBuilder()
		->select( $blTitle )
		->tables( $queryInfo['tables'] )
		->joinConds( $queryInfo['joins'] )
		->where( $linkConds )
		->caller( __METHOD__ )->fetchFieldValues();
}
/**
 * Returns list of images on given remote denylist page.
 * Not quite 100% bulletproof due to localised namespaces and so on.
 * Though if you beat people if they add bad entries to the list... :)
 *
 * The page is fetched with a 3 second timeout and scanned for wikilinks of
 * the form [[:Name.ext where ext is one of the configured FileExtensions.
 * An empty or failed response yields an empty list.
 *
 * @param string $url
 *
 * @return string[] DB keys of the listed files
 */
private function getUrlDenylist( $url ) {
	$text = $this->httpRequestFactory->get( $url, [ 'timeout' => 3 ], __METHOD__ );
	if ( !$text ) {
		return [];
	}

	// Escape each extension before embedding it in the pattern, so that an
	// extension containing a regex metacharacter or the "/" delimiter cannot
	// change or break the expression.
	$quotedExtensions = array_map(
		static function ( $extension ) {
			return preg_quote( (string)$extension, '/' );
		},
		$this->config->get( 'FileExtensions' )
	);
	$regex = '/\[\[:([^|\#]*?\.(?:' . implode( '|', $quotedExtensions ) . '))/i';

	if ( !preg_match_all( $regex, $text, $matches ) ) {
		return [];
	}

	$list = [];
	foreach ( $matches[1] as $s ) {
		$t = $this->titleFactory->makeTitleSafe( NS_FILE, $s );
		if ( $t ) {
			$list[] = $t->getDBkey();
		}
	}

	return $list;
}
}
File Metadata
Details
Attached
Mime Type
text/x-php
Expires
Sat, May 16, 21:11 (1 d, 20 h)
Storage Engine
local-disk
Storage Format
Raw Data
Storage Handle
a0/a4/b6b8793301698d17db29b180e8e9
Default Alt Text
ParserFileProcessingHookHandlers.php (14 KB)
Attached To
Mode
rMWPROD MediaWiki Production
Attached
Detach File
Event Timeline
Log In to Comment