Page MenuHomeWickedGov Phorge

TemplateParser.php
No OneTemporary

Size
18 KB
Referenced Files
None
Subscribers
None

TemplateParser.php

<?php
namespace CommonsMetadata;
use DOMElement;
use DOMNode;
/**
* Class to parse metadata from commons formatted wiki pages.
* Relies on the attributes set by {{Information}} and similar templates - see
* https://commons.wikimedia.org/wiki/Commons:Machine-readable_data
*/
class TemplateParser {
/** Key under which parsePage() returns the result of parseCoordinates(). */
public const COORDINATES_KEY = 'coordinates';
/** Key under which parsePage() returns the result of parseLicenses(). */
public const LICENSES_KEY = 'licenses';
/** Key under which parsePage() returns the result of parseInformationFields(). */
public const INFORMATION_FIELDS_KEY = 'informationFields';
/** Key under which parsePage() returns the result of parseNuke(). */
public const DELETION_KEY = 'deletion';
/** Key under which parsePage() returns the result of parseRestrictions(). */
public const RESTRICTIONS_KEY = 'restrictions';
/**
* HTML element class name => metadata field name mapping for license data.
* For each class only the first matching element inside a license node is used
* (see parseLicenseNode()).
* @var array
*/
protected static $licenseFieldClasses = [
'licensetpl_short' => 'LicenseShortName',
'licensetpl_long' => 'UsageTerms',
'licensetpl_attr_req' => 'AttributionRequired',
'licensetpl_attr' => 'Attribution',
// 'licensetpl_link_req',
'licensetpl_link' => 'LicenseUrl',
'licensetpl_nonfree' => 'NonFree',
];
/**
* HTML element class/id => metadata field name mapping for information template data.
* If a method called 'parseField' + field name exists it is used to parse the field,
* otherwise parseContents() is used (see parseInformationField()).
* @var array
*/
protected static $informationFieldClasses = [
'fileinfotpl_desc' => 'ImageDescription',
# For date: Open question - should we parse the commons
# date field better to deal with templates like
# {{Taken on}} et al. along with extracting a time stamp
# from the human readable field?
'fileinfotpl_date' => 'DateTimeOriginal',
'fileinfotpl_aut' => 'Artist',
# For "source" field of {{information}} there are two closely
# related fields we could map it to. Credit (iptc 2:110) is
# "Identifies the provider of the media, not necessarily the
# owner/creator." Source (iptc 2:115) "Identifies the
# original owner of the intellectual content of the media. This
# could be an agency, a member of an agency or an individual."
# I think "Credit" fits much more closely to the commons notion
# of source than "Source" does.
'fileinfotpl_src' => 'Credit',
'fileinfotpl_art_title' => 'ObjectName',
'fileinfotpl_perm' => 'Permission',
'fileinfotpl_credit' => 'Attribution',
];
/**
* Classnames identifying {{Information}}-like templates, ordered from highest to lowest
* priority. Higher priority means that template is more likely to be about the image
* (as opposed to e.g. some object visible on the image), data in higher-priority templates
* will be preferred. The classes should be on the <table> element (for templates using the
* deprecated id-based fieldname markup) or on the same element which has the "fileinfotpl"
* class (for templates with the class-based markup).
* @var array
*/
protected static $infoTemplateClasses = [
'fileinfotpl-type-photograph',
'fileinfotpl-type-information',
'fileinfotpl-type-artwork',
];
/**
* List of templates which should not be handled like {{Information}} even if they have
* fields matching $informationFieldClasses. Elements of this array refer to the same kind of
* classnames as $infoTemplateClasses. Such templates are only dropped when other templates
* are present as well (see pruneInfoTemplateData()).
* @var array
*/
protected static $infoTemplateExclusion = [
'fileinfotpl-type-book',
];
/**
* preg_replace patterns (pattern => replacement) which will be used to clean up parsed HTML
* code. cleanedInnerHtml() applies them repeatedly until the text stops changing.
* @var array
*/
protected static $cleanupPatterns = [
// trim leading or trailing whitespace
'/^\s+|\s+$/' => '',
// clean paragraph with no styling - usually generated by MediaWiki
'/^<p>(.*)<\/p>$/' => '\1',
];
/** @var array language codes to prefer for multi-language text, most preferred first */
protected $priorityLanguages = [ 'en' ];
/** @var bool when true, parseContents() returns every language instead of selecting one */
protected $multiLanguage = false;
/** @var string glue placed between multiple hCard names of an artist or credit field */
protected $artistCreditSeparator = ' / ';
/**
 * Sets the preferred languages for multi-language text.
 *
 * When a text is available in several languages, the first language of this list that is
 * present is used (order matters); if none of them is present, the first language found in
 * the text is used. The list never makes the parser return all languages - use
 * setMultiLanguage() for that.
 *
 * @param array $priorityLanguages language codes, most preferred first. Any non-array value
 *   (e.g. false or null) is treated as an empty list.
 */
public function setPriorityLanguages( $priorityLanguages ) {
	// selectLanguage() iterates this property with foreach; a non-array value would only
	// raise a PHP warning there and then fall back to the first language anyway, so
	// normalize it to the equivalent empty list up front.
	$this->priorityLanguages = is_array( $priorityLanguages ) ? $priorityLanguages : [];
}
/**
 * Switches multi-language mode on or off.
 *
 * In multi-language mode parseContents() returns the text in every language it finds and
 * the priority language list is not consulted.
 *
 * @param bool $multiLanguage true to return all available languages
 */
public function setMultiLanguage( $multiLanguage ) {
	$this->multiLanguage = $multiLanguage;
}
/**
 * Sets the glue string which joins the names when an artist or credit field contains
 * several hCard (vcard) names.
 *
 * @param string $separator
 * @return void
 */
public function setArtistCreditSeparator( $separator ) {
	$this->artistCreditSeparator = $separator;
}
/**
 * Parse an html string for metadata.
 *
 * This is the main entry point to the class.
 *
 * @param string $html The html to parse
 * @return array The properties extracted from the page, keyed by the *_KEY class constants.
 *   Keys for which the corresponding parser returned nothing are left out; an empty input
 *   yields an empty array.
 */
public function parsePage( $html ) {
	// DOMDocument does not like empty strings. Compare explicitly instead of relying on
	// truthiness, so that the non-empty document "0" is parsed like any other text.
	if ( $html === null || $html === false || $html === '' ) {
		return [];
	}
	$domNavigator = new DomNavigator( $html );
	// array_filter() without a callback drops the sections whose parser returned an
	// empty array
	return array_filter( [
		self::COORDINATES_KEY => $this->parseCoordinates( $domNavigator ),
		self::INFORMATION_FIELDS_KEY => $this->parseInformationFields( $domNavigator ),
		self::LICENSES_KEY => $this->parseLicenses( $domNavigator ),
		self::DELETION_KEY => $this->parseNuke( $domNavigator ),
		self::RESTRICTIONS_KEY => $this->parseRestrictions( $domNavigator ),
	] );
}
/**
 * Parses geocoded coordinates.
 *
 * Looks at every <span class="geo"> element and expects its text to be
 * "<latitude>;<longitude>", both numeric.
 *
 * @param DomNavigator $domNavigator
 * @return array[] one entry per geo element, with GPSLatitude, GPSLongitude and GPSMapDatum
 *   keys. An element whose text cannot be parsed still contributes an (empty) entry.
 */
protected function parseCoordinates( DomNavigator $domNavigator ) {
	$data = [];
	foreach ( $domNavigator->findElementsWithClass( 'span', 'geo' ) as $geoNode ) {
		$coordinateData = [];
		// Trim before validating: is_numeric() only tolerates trailing whitespace since
		// PHP 8, so "12.5 ; 7.1" used to be rejected on older versions.
		$coords = array_map( 'trim', explode( ';', $geoNode->textContent ) );
		if ( count( $coords ) === 2 && is_numeric( $coords[0] ) && is_numeric( $coords[1] ) ) {
			$coordinateData['GPSLatitude'] = $coords[0];
			$coordinateData['GPSLongitude'] = $coords[1];
			$coordinateData['GPSMapDatum'] = 'WGS-84';
		}
		$data[] = $coordinateData;
	}
	return $data;
}
/**
 * Parses the {{Information}} templates (and anything using the same metadata notation,
 * like {{Artwork}}).
 *
 * Two kinds of markup are handled: label cells (td/th) whose id starts with "fileinfotpl_",
 * where the data is in the next sibling element, and elements with a "fileinfotpl_*" class
 * inside an element which has the "fileinfotpl" class.
 *
 * @param DomNavigator $domNavigator
 * @return array an array of information(-like) templates:
 *   array( 0 => array( 'ImageDescription' => ... ) ... )
 */
protected function parseInformationFields( DomNavigator $domNavigator ) {
	$prefix = 'fileinfotpl_';
	$data = [];

	// id-based markup: the id is on the label cell, the value is in the cell after it
	foreach ( $domNavigator->findElementsWithIdPrefix( [ 'td', 'th' ], $prefix ) as $labelCell ) {
		$valueCell = $domNavigator->nextElementSibling( $labelCell );
		if ( $valueCell ) {
			$fieldId = $labelCell->getAttribute( 'id' );
			$table = $domNavigator->closest( $valueCell, 'table' );
			$this->parseInformationField( $domNavigator, $valueCell, $table, $fieldId, $data );
		}
	}

	// class-based markup: the class is on the element holding the value
	foreach ( $domNavigator->findElementsWithClass( '*', 'fileinfotpl' ) as $templateNode ) {
		$fieldNodes = $domNavigator->findElementsWithClassPrefix( '*', $prefix, $templateNode );
		foreach ( $fieldNodes as $fieldNode ) {
			$fieldClass = $domNavigator->getFirstClassWithPrefix( $fieldNode, $prefix );
			$this->parseInformationField(
				$domNavigator, $fieldNode, $templateNode, $fieldClass, $data );
		}
	}

	$this->pruneInfoTemplateData( $data );
	$this->sortInformationGroups( $data );

	// the groups are keyed by node path, which is an internal detail - return a plain list
	return array_values( $data );
}
/**
 * Helper function for the inner loop of parseInformationFields: parses a single field and
 * stores it in $data under the group (template) it belongs to.
 *
 * @param DomNavigator $domNavigator
 * @param DOMElement $informationField the node holding the data
 * @param DOMElement|null $group the top node containing all fields of this template; expected
 *   (but not required) to have one of the $infoTemplateClasses. Fields without a group are
 *   collected under the '-' group.
 * @param string $idOrClass id or class identifying the field. The node is ignored if this is
 *   not a key of $informationFieldClasses (which includes null).
 * @param array[] &$data group name => [ field name => value, '_type' => group type ]
 */
protected function parseInformationField(
	DomNavigator $domNavigator, DOMElement $informationField, $group, $idOrClass, array &$data
) {
	if ( !isset( self::$informationFieldClasses[$idOrClass] ) ) {
		return;
	}
	$fieldName = self::$informationFieldClasses[$idOrClass];

	// fields coming from the same template are grouped by the node path of that template
	if ( $group ) {
		$groupName = $group->getNodePath() ?? '-';
		$groupType =
			$domNavigator->getFirstClassWithPrefix( $group, 'fileinfotpl-type-' ) ?: '-';
	} else {
		$groupName = '-';
		$groupType = '-';
	}

	// The first occurrence wins: a field which has both an id and a class is not parsed
	// twice, and a second field of the same type in the same template is ignored.
	if ( !isset( $data[$groupName][$fieldName] ) ) {
		// use the specialised parser for this field when there is one
		$specialised = 'parseField' . $fieldName;
		$parserMethod = method_exists( $this, $specialised ) ? $specialised : 'parseContents';
		$data[$groupName][$fieldName] = $this->{$parserMethod}( $domNavigator, $informationField );
		$data[$groupName]['_type'] = $groupType;
	}
}
/**
 * Sorts info template data groups according to $infoTemplateClasses, highest priority first.
 * Groups of a known type precede groups of an unknown type; groups which rank the same
 * (same known type, or both unknown) keep their original relative order.
 * Also removes the _type helper keys.
 *
 * @param array[] &$data info template data, as built by parseInformationField(); keys are
 *   preserved
 */
protected function sortInformationGroups( array &$data ) {
	// template type => priority, a smaller number sorts first
	$priorities = array_flip( self::$infoTemplateClasses );
	// all unknown types share one priority which is worse than that of any known type
	$unknownPriority = count( $priorities );

	// Build a [ priority, original position ] sort key for every group. The position
	// works as a tie-breaker, which keeps the order of equally ranked groups even on
	// PHP versions where sorting is not stable, and makes the comparison consistent
	// when both groups are of unknown type.
	$sortKeys = [];
	$position = 0;
	foreach ( $data as $groupName => $group ) {
		$type = $group['_type'] ?? '-';
		$sortKeys[$groupName] = [ $priorities[$type] ?? $unknownPriority, $position++ ];
	}

	uksort( $data, static function ( $groupName1, $groupName2 ) use ( $sortKeys ) {
		// arrays of equal length are compared element by element
		return $sortKeys[$groupName1] <=> $sortKeys[$groupName2];
	} );

	foreach ( array_keys( $data ) as $groupName ) {
		unset( $data[$groupName]['_type'] );
	}
}
/**
 * Prunes template data: removes the groups whose type is listed in $infoTemplateExclusion,
 * as long as the group is not the only one left.
 *
 * @param array[] &$data info template data; every group is expected to have a '_type' key
 */
protected function pruneInfoTemplateData( array &$data ) {
	foreach ( array_keys( $data ) as $groupName ) {
		$isExcluded = in_array( $data[$groupName]['_type'], self::$infoTemplateExclusion );
		// count() is re-evaluated after every removal, so the last remaining group
		// always survives, even if it is of an excluded type
		if ( $isExcluded && count( $data ) !== 1 ) {
			unset( $data[$groupName] );
		}
	}
}
/**
 * Parses the Artist field, which might contain one or more hCards.
 * Delegates to parseCreditOrArtist().
 *
 * @param DomNavigator $domNavigator
 * @param DOMNode $node the node holding the field
 * @return string|array
 */
protected function parseFieldArtist( DomNavigator $domNavigator, DOMNode $node ) {
	$artist = $this->parseCreditOrArtist( $domNavigator, $node );
	return $artist;
}
/**
 * Parses the Credit field, which might contain one or more hCards.
 * Delegates to parseCreditOrArtist().
 *
 * @param DomNavigator $domNavigator
 * @param DOMNode $node the node holding the field
 * @return string|array
 */
protected function parseFieldCredit( DomNavigator $domNavigator, DOMNode $node ) {
	$credit = $this->parseCreditOrArtist( $domNavigator, $node );
	return $credit;
}
/**
 * Shared implementation for the Artist and Credit fields.
 *
 * If the node contains hCards, returns the cleaned-up HTML of their "fn" properties, joined
 * with the artist/credit separator. Otherwise the node is parsed like any other field.
 *
 * @param DomNavigator $domNavigator
 * @param DOMNode $node the node holding the field
 * @return string|array
 */
protected function parseCreditOrArtist( DomNavigator $domNavigator, DOMNode $node ) {
	$nameNodes = $this->extractHCardProperty( $domNavigator, $node, 'fn' );
	if ( $nameNodes === [] ) {
		// no hCard - fall back to the generic parsing
		return $this->parseContents( $domNavigator, $node );
	}
	$names = array_map( [ $this, 'cleanedInnerHtml' ], $nameNodes );
	return implode( $this->artistCreditSeparator, $names );
}
/**
 * Parses the DateTimeOriginal field.
 *
 * Returns the value of the datetime attribute of the first <time> element which has one;
 * when there is no such element, the node is parsed like any other field.
 *
 * @param DomNavigator $domNavigator
 * @param DOMNode $node the node holding the field
 * @return string|array
 */
protected function parseFieldDateTimeOriginal( DomNavigator $domNavigator, DOMNode $node ) {
	$timestamp = null;
	$timeNodes = $domNavigator->findElementsWithAttribute( 'time', 'datetime', $node );
	foreach ( $timeNodes as $timeNode ) {
		// only the first <time> element is of interest
		$timestamp = $timeNode->getAttribute( 'datetime' );
		break;
	}
	if ( $timestamp === null ) {
		return $this->parseContents( $domNavigator, $node );
	}
	return $timestamp;
}
/**
 * Collects the elements holding an hCard property from a DOMNode that contains one or more
 * hCards.
 *
 * An hCard is any element with the "vcard" class under $node; the property elements are
 * the elements under it which have $property as a class.
 *
 * @param DomNavigator $domNavigator
 * @param DOMNode $node
 * @param string $property hCard property (class name) to be extracted
 * @return array the matching elements (not their text), in the order they were found
 */
protected function extractHCardProperty(
	DomNavigator $domNavigator, DOMNode $node, $property
) {
	$propertyNodes = [];
	$cards = $domNavigator->findElementsWithClass( '*', 'vcard', $node );
	foreach ( $cards as $card ) {
		$matches = $domNavigator->findElementsWithClass( '*', $property, $card );
		foreach ( $matches as $match ) {
			array_push( $propertyNodes, $match );
		}
	}
	return $propertyNodes;
}
/**
 * Parses the license templates, i.e. every element with the "licensetpl" class.
 *
 * When a license has usage terms, a Copyrighted field is derived from them: 'False' if the
 * terms are exactly 'Public domain', 'True' otherwise.
 *
 * @param DomNavigator $domNavigator
 * @return array an array of licenses: array( 0 => array( 'LicenseShortName' => ... ) ... )
 */
protected function parseLicenses( DomNavigator $domNavigator ) {
	$licenses = [];
	foreach ( $domNavigator->findElementsWithClass( '*', 'licensetpl' ) as $licenseNode ) {
		$license = $this->parseLicenseNode( $domNavigator, $licenseNode );
		if ( isset( $license['UsageTerms'] ) ) {
			if ( $license['UsageTerms'] === 'Public domain' ) {
				$license['Copyrighted'] = 'False';
			} else {
				$license['Copyrighted'] = 'True';
			}
		}
		$licenses[] = $license;
	}
	return $licenses;
}
/**
 * Parses a single license template.
 *
 * For every class in $licenseFieldClasses, takes the first element under the license node
 * which has that class and stores its cleaned-up inner HTML under the mapped field name.
 *
 * @param DomNavigator $domNavigator
 * @param DOMNode $licenseNode
 * @return array field name => HTML; fields without a matching element are missing
 */
protected function parseLicenseNode( DomNavigator $domNavigator, DOMNode $licenseNode ) {
	$license = [];
	foreach ( self::$licenseFieldClasses as $className => $field ) {
		$matches = $domNavigator->findElementsWithClass( '*', $className, $licenseNode );
		foreach ( $matches as $match ) {
			$license[$field] = $this->cleanedInnerHtml( $match );
			// any further element with the same class is ignored
			break;
		}
	}
	return $license;
}
/**
 * Parse and return deletion reason from the {{Nuke}} template
 * ( https://commons.wikimedia.org/wiki/Template:Nuke )
 *
 * The reason is read from the link which is the first child of an element with the "nuke"
 * class: its URL must have action=delete and a wpReason parameter in the query string.
 *
 * @param DomNavigator $domNavigator
 * @return array[] one array( 'DeletionReason' => ... ) entry per usable nuke element
 */
protected function parseNuke( DomNavigator $domNavigator ) {
	$deletions = [];
	foreach ( $domNavigator->findElementsWithClass( '*', 'nuke' ) as $nukeNode ) {
		$link = $nukeNode->firstChild;
		if ( !$link || !( $link instanceof DOMElement ) || !$link->hasAttribute( 'href' ) ) {
			continue;
		}

		$urlParts = wfParseUrl( $link->getAttribute( 'href' ) );
		if ( !isset( $urlParts['query'] ) ) {
			continue;
		}

		$query = wfCgiToArray( $urlParts['query'] );
		$isDeleteAction = isset( $query['action'] ) && $query['action'] === 'delete';
		if ( $isDeleteAction && isset( $query['wpReason'] ) ) {
			$deletions[] = [ 'DeletionReason' => $query['wpReason'] ];
		}
	}
	return $deletions;
}
/**
 * Parses file restrictions i.e. trademark, insignia, etc.
 *
 * A restriction is marked by a "restriction-<type>" class on any element; the result
 * contains every distinct <type>, in the order they were found.
 *
 * @param DomNavigator $domNavigator
 * @return array[] always a single entry: array( 'Restrictions' => 'type1|type2|...' ), with
 *   an empty string when there are no restrictions
 */
protected function parseRestrictions( DomNavigator $domNavigator ) {
	$restrictionPrefix = 'restriction-';
	$prefixLength = strlen( $restrictionPrefix );
	$restrictions = [];
	foreach (
		$domNavigator->findElementsWithClassPrefix( '*', $restrictionPrefix ) as $element
	) {
		// Class names may be separated by any run of whitespace (tabs and newlines too),
		// not just by a single space.
		$classes = preg_split(
			'/\s+/', $element->getAttribute( 'class' ), -1, PREG_SPLIT_NO_EMPTY
		) ?: [];
		foreach ( $classes as $class ) {
			if ( strpos( $class, $restrictionPrefix ) !== 0 ) {
				continue;
			}
			$restrictionType = substr( $class, $prefixLength );
			// a bare "restriction-" class names no restriction (substr() gives '' or,
			// before PHP 8, false)
			if ( $restrictionType !== '' && $restrictionType !== false ) {
				$restrictions[] = $restrictionType;
			}
		}
	}
	return [ [ 'Restrictions' => implode( '|', array_unique( $restrictions ) ) ] ];
}
/**
 * Get the text (cleaned-up inner HTML) of a node. The result might be a string, or an array
 * of strings if the node has multiple languages (resulting from {{en}} and similar
 * templates) and the parser is in multi-language mode.
 *
 * Language versions are the div elements with the "description" class and a lang attribute;
 * if the same language occurs more than once, the last occurrence is used.
 *
 * @param DomNavigator $domNavigator
 * @param DOMNode $node
 * @return string|array in multi-language mode: language code => text, plus a '_type' key
 *   with the value 'lang'
 */
protected function parseContents( DomNavigator $domNavigator, DOMNode $node ) {
	$languageNodes = $domNavigator->findElementsWithClassAndLang( 'div', 'description', $node );
	if ( !$languageNodes->length ) {
		// no language templates at all - use the node as it is
		return $this->cleanedInnerHtml( $node );
	}

	$nodesByLanguage = [];
	foreach ( $languageNodes as $languageNode ) {
		$stripped = $this->removeLanguageName( $domNavigator, $languageNode );
		$nodesByLanguage[$stripped->getAttribute( 'lang' )] = $stripped;
	}

	if ( $this->multiLanguage ) {
		$texts = array_map( [ $this, 'cleanedInnerHtml' ], $nodesByLanguage );
		$texts['_type'] = 'lang';
		return $texts;
	}
	return $this->cleanedInnerHtml( $this->selectLanguage( $nodesByLanguage ) );
}
/**
 * Language templates like {{en}} put the language name at the beginning of the text;
 * this function removes it. The language name is a span with the "language" class which is
 * a direct child of the node. The input node itself is left untouched.
 *
 * @param DomNavigator $domNavigator
 * @param DOMElement $node
 * @return DOMElement a clone of the input node, with the language name removed
 */
protected function removeLanguageName( DomNavigator $domNavigator, DOMElement $node ) {
	$clone = $node->cloneNode( true );
	foreach ( $domNavigator->findElementsWithClass( 'span', 'language', $clone ) as $nameNode ) {
		$parent = $nameNode->parentNode;
		// language names are direct children; matching spans nested deeper are kept
		if ( $parent === null || $clone->isSameNode( $parent ) ) {
			$clone->removeChild( $nameNode );
		}
	}
	// @phan-suppress-next-line PhanTypeMismatchReturnSuperType cloneNode returns `static`
	return $clone;
}
/**
 * Takes an array indexed with language codes, and returns the best match: the element of
 * the first priority language which is present, or the first element of the array if none
 * of the priority languages is present.
 *
 * @param array $languages language code => value
 * @return mixed the selected value; false if $languages is empty
 */
protected function selectLanguage( array $languages ) {
	foreach ( $this->priorityLanguages as $preferredCode ) {
		if ( !array_key_exists( $preferredCode, $languages ) ) {
			continue;
		}
		return $languages[$preferredCode];
	}
	// none of the preferred languages is available
	$firstAvailable = reset( $languages );
	return $firstAvailable;
}
/**
 * Turns a node into a HTML string (including the tag of the node itself), using the
 * document the node belongs to for the serialization.
 *
 * @param DOMNode $node
 * @return string
 */
protected function toHtml( DOMNode $node ) {
	$document = $node->ownerDocument;
	return $document->saveHTML( $node );
}
/**
 * Turns a node into plain text: the text content of the node with the surrounding
 * whitespace trimmed.
 *
 * @param DOMNode $node
 * @return string
 */
protected function toText( DOMNode $node ) {
	$text = $node->textContent;
	return trim( $text );
}
/**
 * Turns a node into HTML, except for the enclosing tag: for an element, the HTML of all of
 * its children concatenated. A node which is not an element has no enclosing tag, so it is
 * serialized as a whole.
 *
 * @param DOMNode $node
 * @return string
 */
protected function innerHtml( DOMNode $node ) {
	if ( $node instanceof DOMElement ) {
		$parts = [];
		foreach ( $node->childNodes as $childNode ) {
			$parts[] = $this->toHtml( $childNode );
		}
		return implode( '', $parts );
	}
	return $this->toHtml( $node );
}
/**
 * Turns a node into HTML, except for the enclosing tag.
 * Cleans up the contents by removing enclosing whitespace and some HTML elements: the
 * $cleanupPatterns are applied over and over until the text does not change anymore.
 *
 * @param DOMNode $node
 * @return string
 */
protected function cleanedInnerHtml( DOMNode $node ) {
	$html = $this->innerHtml( $node );
	do {
		$oldHtml = $html;
		foreach ( static::$cleanupPatterns as $pattern => $replacement ) {
			$cleaned = preg_replace( $pattern, $replacement, $html );
			// preg_replace() returns null when PCRE fails (e.g. the backtrack limit is
			// hit on a huge input); keep the text as it was instead of losing it
			if ( $cleaned !== null ) {
				$html = $cleaned;
			}
		}
	} while ( $oldHtml !== $html );
	return $html;
}
/**
 * Switch rows and columns. Usually it is easier to collect data grouped by source template,
 * but the extmetadata API needs grouping by field name; this function turns the grouping
 * around: $data[group][field] becomes $result[field][group].
 *
 * @param array $data group name => [ field name => value ]
 * @return array field name => [ group name => value ]
 */
protected function arrayTranspose( array $data ) {
	$byField = [];
	foreach ( $data as $groupName => $fields ) {
		foreach ( $fields as $fieldName => $fieldValue ) {
			$byField[$fieldName][$groupName] = $fieldValue;
		}
	}
	return $byField;
}
}

File Metadata

Mime Type
text/x-php
Expires
Sat, May 16, 20:31 (1 d, 6 h)
Storage Engine
local-disk
Storage Format
Raw Data
Storage Handle
21/37/2766194fff7431fc365c19cd2bb9
Default Alt Text
TemplateParser.php (18 KB)

Event Timeline