Page Menu
Home
WickedGov Phorge
Search
Configure Global Search
Log In
Files
F1429590
DataCollector.php
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Flag For Later
Award Token
Size
14 KB
Referenced Files
None
Subscribers
None
DataCollector.php
View Options
<?php
namespace
CommonsMetadata
;
use
File
;
use
ForeignAPIFile
;
use
InvalidArgumentException
;
use
LocalFile
;
use
LogicException
;
use
MediaWiki\Language\Language
;
use
MediaWiki\MediaWikiServices
;
use
MediaWiki\Parser\ParserOutput
;
use
WikiFilePage
;
/**
 * Class to handle metadata collection and formatting, and manage more specific data extraction
 * classes.
 *
 * The collaborators (language, template parser, license parser) are injected through the
 * setters below and must be set before collect() or verifyAttributionMetadata() is called.
 */
class DataCollector {

	/**
	 * Mapping of assessment levels to category name patterns. Array keys are the assessment
	 * types; array values are regexps which will be matched case-insensitively against
	 * category names. For each category the first matching pattern wins.
	 * @var array
	 */
	protected static $assessmentCategories = [
		'poty' => '/^pictures of the year \(.*\)/',
		'potd' => '/^pictures of the day \(.*\)/',
		'featured' => '/^featured (pictures|sounds) on wikimedia commons/',
		'quality' => '/^quality images/',
		'valued' => '/^valued images/',
	];

	/**
	 * Language in which data should be collected.
	 *
	 * NOTE: collect() passes this value to getDescriptionText(), whose parameter is typed as
	 * a non-nullable Language, so it has to be set via setLanguage() before collect() runs.
	 * @var Language
	 */
	protected $language;

	/**
	 * If true, ignore $language and collect metadata in all languages.
	 *
	 * NOTE(review): this flag is only stored by this class and never read in it; presumably
	 * it is consumed elsewhere - confirm before relying on it.
	 * @var bool
	 */
	protected $multiLang;

	/** @var TemplateParser Extracts structured template data from the description page HTML */
	protected $templateParser;

	/** @var LicenseParser Maps license names / category names to internal license data */
	protected $licenseParser;

	/**
	 * @param Language $language
	 */
	public function setLanguage( $language ) {
		$this->language = $language;
	}

	/**
	 * @param bool $multiLang
	 */
	public function setMultiLang( $multiLang ) {
		$this->multiLang = $multiLang;
	}

	/**
	 * @param TemplateParser $templateParser
	 */
	public function setTemplateParser( TemplateParser $templateParser ) {
		$this->templateParser = $templateParser;
	}

	/**
	 * @param LicenseParser $licenseParser
	 */
	public function setLicenseParser( LicenseParser $licenseParser ) {
		$this->licenseParser = $licenseParser;
	}

	/**
	 * Collects metadata from a file, and adds it to a metadata array.
	 * The array has the following format:
	 *
	 * '<metadata field name>' => array(
	 *     'value' => '<value>',
	 *     'source' => '<where did the data come from>',
	 * )
	 *
	 * For fields with multiple values and/or in multiple languages the format is more complex;
	 * see the documentation for the extmetadata API.
	 *
	 * Category-derived fields are merged in first, then template-derived fields, so on a key
	 * collision the template data overrides the category data, and both override what was
	 * passed in.
	 *
	 * @param array &$previousMetadata metadata collected so far;
	 *   new metadata will be added to this array
	 * @param File $file
	 */
	public function collect( array &$previousMetadata, File $file ) {
		$this->normalizeMetadataTimestamps( $previousMetadata );

		$descriptionText = $this->getDescriptionText( $file, $this->language );

		$categories = $this->getCategories( $file, $previousMetadata );
		$previousMetadata = array_merge(
			$previousMetadata,
			$this->getCategoryMetadata( $categories )
		);

		$templateData = $this->templateParser->parsePage( $descriptionText );
		$previousMetadata = array_merge(
			$previousMetadata,
			$this->getTemplateMetadata( $templateData )
		);
	}

	/**
	 * Checks for the presence of metadata needed for attributing the file (author, source, license)
	 * and returns a list of keys corresponding to problems.
	 * @param ParserOutput $parserOutput
	 * @param File $file
	 * @return array zero or more of the following keys:
	 *  - no-license - failed to detect a license
	 *  - no-description - failed to detect any image description
	 *  - no-author - failed to detect author name or a custom attribution text
	 *  - no-source - failed to detect the source of the image or a custom attribution text
	 *  - no-patent - the file has MIME type application/sla but the page does not use
	 *    the '3dpatent' template
	 */
	public function verifyAttributionMetadata( ParserOutput $parserOutput, File $file ) {
		// HTML code of the file description
		$descriptionText = $parserOutput->hasText() ? $parserOutput->getText() : '';

		$templateData = $this->templateParser->parsePage( $descriptionText );

		$licenseData = [];
		if ( isset( $templateData[TemplateParser::LICENSES_KEY] ) ) {
			$licenseData = $this->selectLicense( $templateData[TemplateParser::LICENSES_KEY] );
		}

		$informationData = [];
		if ( isset( $templateData[TemplateParser::INFORMATION_FIELDS_KEY] ) ) {
			$informationData = $this->selectInformationTemplate(
				$templateData[TemplateParser::INFORMATION_FIELDS_KEY]
			);
		}

		$problems = [];

		if ( $this->isFieldMissing( $licenseData, 'LicenseShortName' ) ) {
			$problems[] = 'no-license';
		}

		if ( $this->isFieldMissing( $informationData, 'ImageDescription' ) ) {
			$problems[] = 'no-description';
		}

		// A custom attribution text can stand in for both the author and the source.
		$attributionMissing = $this->isFieldMissing( $informationData, 'Attribution' );

		if ( $attributionMissing && $this->isFieldMissing( $informationData, 'Artist' ) ) {
			$problems[] = 'no-author';
		}

		if ( $attributionMissing && $this->isFieldMissing( $informationData, 'Credit' ) ) {
			$problems[] = 'no-source';
		}

		// Certain uploads (3D objects) need a patent license
		$templates = $parserOutput->getTemplates()[NS_TEMPLATE] ?? [];
		if (
			!array_key_exists( '3dpatent', $templates )
			&& $file->getMimeType() === 'application/sla'
		) {
			$problems[] = 'no-patent';
		}

		return $problems;
	}

	/**
	 * Tells whether a metadata field is absent, null or the empty string.
	 * @param array $data metadata fields in fieldname => value form
	 * @param string $key field name
	 * @return bool
	 */
	private function isFieldMissing( $data, $key ) {
		return !isset( $data[$key] ) || $data[$key] === '';
	}

	/**
	 * Builds the category-derived metadata fields (Categories and Assessments).
	 *
	 * Assessment and license categories are stripped from the list before the remaining
	 * names are joined with '|'.
	 *
	 * @param array $categories list of human-readable category names
	 * @return array
	 */
	protected function getCategoryMetadata( array $categories ) {
		$assessments = $this->getAssessmentsAndRemoveFromCategories( $categories );
		// Called only for its by-reference side effect: license categories are removed
		// from $categories. The returned license names are not used here.
		$this->getLicensesAndRemoveFromCategories( $categories );

		return [
			'Categories' => [
				'value' => implode( '|', $categories ),
				'source' => 'commons-categories',
			],
			'Assessments' => [
				'value' => implode( '|', $assessments ),
				'source' => 'commons-categories',
			],
		];
	}

	/**
	 * Flattens the data returned by the template parser into extmetadata fields.
	 *
	 * Later groups override earlier ones on field name collision; the order is coordinates,
	 * information template, license, deletion, restrictions.
	 *
	 * @param array $templateData
	 * @return array
	 */
	protected function getTemplateMetadata( $templateData ) {
		// GetExtendedMetadata does not handle multivalued fields,
		// we need to select one of everything
		$templateFields = [];

		if ( isset( $templateData[TemplateParser::COORDINATES_KEY] ) ) {
			$templateFields = array_merge(
				$templateFields,
				$this->selectFirst( $templateData[TemplateParser::COORDINATES_KEY] )
			);
		}

		if ( isset( $templateData[TemplateParser::INFORMATION_FIELDS_KEY] ) ) {
			$templateFields = array_merge(
				$templateFields,
				$this->selectInformationTemplate(
					$templateData[TemplateParser::INFORMATION_FIELDS_KEY]
				)
			);
		}

		if ( isset( $templateData[TemplateParser::LICENSES_KEY] ) ) {
			$templateFields = array_merge(
				$templateFields,
				$this->selectLicense( $templateData[TemplateParser::LICENSES_KEY] )
			);
		}

		if ( isset( $templateData[TemplateParser::DELETION_KEY] ) ) {
			$templateFields = array_merge(
				$templateFields,
				$this->selectFirst( $templateData[TemplateParser::DELETION_KEY] )
			);
		}

		if ( isset( $templateData[TemplateParser::RESTRICTIONS_KEY] ) ) {
			$templateFields = array_merge(
				$templateFields,
				$this->selectFirst( $templateData[TemplateParser::RESTRICTIONS_KEY] )
			);
		}

		$metadata = [];
		foreach ( $templateFields as $name => $value ) {
			$metadata[$name] = [
				'value' => $value,
				'source' => 'commons-desc-page'
			];
		}

		// use short name to generate internal name used in i18n
		if ( isset( $templateFields['LicenseShortName'] ) ) {
			$licenseData = $this->licenseParser->parseLicenseString(
				$templateFields['LicenseShortName']
			);
			if ( isset( $licenseData['name'] ) ) {
				$metadata['License'] = [
					'value' => $licenseData['name'],
					'source' => 'commons-templates',
				];
			}
		}

		return $metadata;
	}

	/**
	 * Gets the text of the file's description page.
	 * @param File $file
	 * @param Language $language
	 * @return string
	 */
	protected function getDescriptionText( File $file, Language $language ) {
		// Note: If this is a local file, there is no caching here.
		// However, the results of this module have longer caching for local
		// files to help compensate. For foreign files, this method is cached
		// via parser cache, and possibly a second cache depending on
		// descriptionCacheExpiry (disabled on Wikimedia).
		$text = $file->getDescriptionText( $language );

		// Exact class match on purpose (not instanceof): subclasses are left alone.
		if ( in_array( get_class( $file ), [ 'LocalFile', 'LocalFileMock' ], true ) ) {
			// LocalFile gets the text in a different way, and ends up with different output
			// (specifically, relative instead of absolute URLs), so transform local URLs
			// to absolute URLs after parse.
			$text = ( new ParserOutput( $text ) )->getText( [ 'absoluteURLs' => true ] );
		}

		return $text;
	}

	/**
	 * Returns the categories of the file's description page.
	 *
	 * @param File $file
	 * @param array $data metadata passed to the onGetExtendedMetadata hook
	 * @return string[] list of category names in human-readable format
	 */
	protected function getCategories( File $file, array $data ) {
		$categories = [];

		if ( is_a( $file, 'LocalFileMock' ) || is_a( $file, 'ForeignDBFileMock' ) ) {
			if ( !defined( 'MW_PHPUNIT_TEST' ) ) {
				throw new LogicException( 'Can only be called in tests' );
			}
			// with all the hard-coded dependencies, mocking category retrieval properly is
			// pretty much impossible
			// @phan-suppress-next-line PhanUndeclaredClassStaticProperty
			return ParserTestHelper::$mockedCategories[$file->getDescriptionText()] ?? [];
		} elseif ( $file instanceof LocalFile ) {
			// for local or shared DB files (which are also LocalFile subclasses)
			// categories can be queried directly from the database
			$page = MediaWikiServices::getInstance()->getWikiPageFactory()->newFromTitle(
				$file->getOriginalTitle()
			);
			if ( !$page instanceof WikiFilePage ) {
				throw new InvalidArgumentException(
					'Cannot instance WikiFilePage to get categories for ' . $file->getName()
					. ', got instance of ' . get_class( $page )
				);
			}
			$page->setFile( $file );

			$categoryTitles = $page->getForeignCategories();
			foreach ( $categoryTitles as $title ) {
				$categories[] = $title->getText();
			}
		} elseif ( $file instanceof ForeignAPIFile && isset( $data['Categories']['value'] ) ) {
			// getting categories for a ForeignAPIFile is not supported, but in case
			// CommonsMetadata is installed on the remote repository as well, its output
			// (including categories) is sent together with the extended file metadata,
			// when the file is loaded. onGetExtendedMetadata hooks get that metadata
			// when they are invoked.
			$joined = $data['Categories']['value'];
			// explode() on an empty string would yield a single empty category name
			$categories = $joined === '' ? [] : explode( '|', $joined );
		} else {
			// out of luck - file is probably from a ForeignAPIRepo
			// with CommonsMetadata not installed there
			wfDebug( 'CommonsMetadata: cannot read category data' );
		}

		return $categories;
	}

	/**
	 * Matches category names to a category => license mapping, removes the matching categories
	 * and returns the corresponding licenses.
	 * @param array &$categories a list of human-readable category names.
	 * @return array
	 */
	protected function getLicensesAndRemoveFromCategories( &$categories ) {
		$licenses = [];

		foreach ( $categories as $i => $category ) {
			$licenseData = $this->licenseParser->parseLicenseString( $category );
			if ( $licenseData ) {
				$licenses[] = $licenseData['name'];
				unset( $categories[$i] );
			}
		}

		// renumber to avoid holes in array
		$categories = array_values( $categories );

		return $licenses;
	}

	/**
	 * Matches category names to a category => assessment mapping, removes the matching categories
	 * and returns the corresponding assessments (valued image, picture of the day etc).
	 * @param array &$categories a list of human-readable category names.
	 * @return array list of distinct assessment types, in order of first appearance
	 */
	protected function getAssessmentsAndRemoveFromCategories( &$categories ) {
		$assessments = [];

		foreach ( $categories as $i => $category ) {
			foreach ( self::$assessmentCategories as $assessmentType => $regexp ) {
				if ( preg_match( $regexp . 'i', $category ) ) {
					$assessments[] = $assessmentType;
					unset( $categories[$i] );
					// the first matching pattern decides the assessment of a category
					break;
				}
			}
		}

		// renumber to avoid holes in array
		$categories = array_values( $categories );

		// potd/poty can happen multiple times; array_unique() keeps the original keys,
		// so renumber the result as well
		return array_values( array_unique( $assessments ) );
	}

	/**
	 * Receives a list of metadata arrays and selects the first one to use.
	 * @param array $arrays an array of arrays of metadata fields in fieldname => value form
	 * @return array an array of metadata fields in fieldname => value form
	 */
	protected function selectFirst( $arrays ) {
		// multiple metadata values for the same fields on the same image would not make much sense,
		// so use the first value
		return $arrays ? $arrays[0] : [];
	}

	/**
	 * Receives the list of information templates found by the template parser and selects which one
	 * to use. Also collects all the authors to make sure attribution requirements are honored.
	 *
	 * The first template is returned; when more than one template has an 'Artist' field, the
	 * number of such templates is added to the result as 'AuthorCount'.
	 *
	 * @param array $informationTemplates an array of information templates,
	 *   each is an array of metadata fields in fieldname => value form
	 * @return array an array of metadata fields in fieldname => value form
	 */
	protected function selectInformationTemplate( array $informationTemplates ) {
		if ( !$informationTemplates ) {
			return [];
		}

		$authorCount = 0;
		foreach ( $informationTemplates as $template ) {
			if ( isset( $template['Artist'] ) ) {
				$authorCount++;
			}
		}

		$selected = $informationTemplates[0];
		if ( $authorCount > 1 ) {
			$selected['AuthorCount'] = $authorCount;
		}

		return $selected;
	}

	/**
	 * Receives the list of licenses found by the template parser and selects which one to use.
	 * @param array $licenses an array of licenses, each is an array of metadata fields
	 *   in fieldname => value form
	 * @return array an array of metadata fields in fieldname => value form
	 */
	protected function selectLicense( array $licenses ) {
		if ( !$licenses ) {
			return [];
		}

		$sortedLicenses = $this->licenseParser->sortDataByLicensePriority(
			$licenses,
			static function ( $license ) {
				return $license['LicenseShortName'] ?? null;
			}
		);
		// sortDataByLicensePriority puts things in right order but also rearranges the keys
		// - we don't want that
		$sortedLicenses = array_values( $sortedLicenses );

		if ( !$sortedLicenses ) {
			return [];
		}

		// T131896 - if any license template is marked nonfree, the image is probably nonfree
		foreach ( $sortedLicenses as $license ) {
			if ( !empty( $license['NonFree'] ) ) {
				$sortedLicenses[0]['NonFree'] = $license['NonFree'];
				break;
			}
		}

		return $sortedLicenses[0];
	}

	/**
	 * Normalizes the metadata to wfTimestamp()'s TS_DB format
	 *
	 * Only the 'DateTime' and 'DateTimeOriginal' fields are touched, and a value is only
	 * replaced when wfTimestamp() returns something truthy for it.
	 *
	 * @param array &$metadata
	 */
	protected function normalizeMetadataTimestamps( array &$metadata ) {
		foreach ( [ 'DateTime', 'DateTimeOriginal' ] as $field ) {
			if ( !isset( $metadata[$field]['value'] ) ) {
				continue;
			}

			// Multilang values can get down here, which are arrays with
			// '_type' => 'lang'. We don't want to pass an array to
			// wfTimestamp: it won't work and will annoy PHP.
			// @phan-suppress-next-line PhanTypeArraySuspicious
			if ( isset( $metadata[$field]['value']['_type'] ) ) {
				continue;
			}

			$parsedTs = wfTimestamp( TS_DB, $metadata[$field]['value'] );
			if ( $parsedTs ) {
				$metadata[$field]['value'] = $parsedTs;
			}
		}
	}
}
File Metadata
Details
Attached
Mime Type
text/x-php
Expires
Sat, May 16, 17:47 (9 h, 32 m)
Storage Engine
local-disk
Storage Format
Raw Data
Storage Handle
7b/4d/8f7ec654433d188057b8d1f844bb
Default Alt Text
DataCollector.php (14 KB)
Attached To
Mode
rMWPROD MediaWiki Production
Attached
Detach File
Event Timeline
Log In to Comment