Page Menu
Home
WickedGov Phorge
Search
Configure Global Search
Log In
Files
F2751115
BaseDump.php
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Flag For Later
Award Token
Size
6 KB
Referenced Files
None
Subscribers
None
BaseDump.php
View Options
<?php
/**
* Helper class for the --prefetch option of dumpTextPass.php
*
* Copyright © 2005 Brooke Vibber <bvibber@wikimedia.org>
* https://www.mediawiki.org/
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc.,
* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
* http://www.gnu.org/copyleft/gpl.html
*
* @file
* @ingroup Maintenance
*/
use
MediaWiki\Revision\SlotRecord
;
/**
 * Readahead helper for making large MediaWiki data dumps;
 * reads in a previous XML dump to sequentially prefetch text
 * records already normalized and decompressed.
 *
 * This can save load on the external database servers, hopefully.
 *
 * Assumes that dumps will be recorded in the canonical order:
 * - ascending by page_id
 * - ascending by rev_id within each page
 * - text contents are immutable and should not change once
 *   recorded, so the previous dump is a reliable source
 *
 * The reader only ever moves forward, so requests must be made in that
 * same order; anything already passed is reported as unavailable (null).
 *
 * @ingroup Maintenance
 */
class BaseDump {
	/** @var XMLReader|null Forward-only reader over the current input file */
	protected $reader = null;

	/** @var bool True once no more input can be read (open failure or end of last file) */
	protected $atEnd = false;

	/** @var bool True once the closing tag of the current page was hit while looking for a revision */
	protected $atPageEnd = false;

	/** @var int ID of the page the reader is currently positioned in */
	protected $lastPage = 0;

	/** @var int ID of the last revision read within the current page */
	protected $lastRev = 0;

	/** @var string[]|null Input files that have not been opened yet */
	protected $infiles = null;

	/**
	 * @param string $infile One or more dump file names separated by ';'.
	 *   The files are opened one after another, in the order given.
	 */
	public function __construct( $infile ) {
		$this->infiles = explode( ';', $infile );
		$this->reader = new XMLReader();
		$this->openNextFile( __METHOD__ );
	}

	/**
	 * Attempts to fetch the text of a particular page revision
	 * from the dump stream. May return null if the page is
	 * unavailable.
	 *
	 * @param int $page ID number of page to read
	 * @param int $rev ID number of revision to read
	 * @param string $slot Role name of the slot to read
	 * @return string|null
	 */
	public function prefetch( $page, $rev, $slot = SlotRecord::MAIN ) {
		$page = intval( $page );
		$rev = intval( $rev );

		// Move forward page by page until the wanted page is reached or passed.
		while ( $this->lastPage < $page && !$this->atEnd ) {
			$this->debug( "BaseDump::prefetch at page $this->lastPage, looking for $page" );
			$this->nextPage();
		}
		if ( $this->lastPage > $page || $this->atEnd ) {
			$this->debug( "BaseDump::prefetch already past page $page or failed to open/read input file, "
				. "looking for rev $rev [$this->lastPage, $this->lastRev]" );
			return null;
		}

		// Same idea for revisions, but never read beyond the end of the page.
		while ( $this->lastRev < $rev && !$this->atEnd && !$this->atPageEnd ) {
			$this->debug( "BaseDump::prefetch at page $this->lastPage, rev $this->lastRev, "
				. "looking for $page, $rev" );
			$this->nextRev();
		}
		if ( $this->lastRev !== $rev || $this->atEnd ) {
			$this->debug( "BaseDump::prefetch already past rev $rev on page $page "
				. "[$this->lastPage, $this->lastRev]" );
			return null;
		}

		$this->debug( "BaseDump::prefetch hit on $page, $rev [$this->lastPage, $this->lastRev]" );

		if ( $slot !== SlotRecord::MAIN ) {
			// Walk the <content> elements of this revision until one whose
			// <role> equals the requested slot; give up at </revision>.
			$lastSlot = SlotRecord::MAIN;
			while ( $lastSlot !== $slot ) {
				if ( !$this->skipTo( 'content', 'revision' ) ||
					!$this->skipTo( 'role', 'revision' )
				) {
					return null;
				}
				$lastSlot = $this->nodeContents();
			}
		}

		return $this->nextText();
	}

	/**
	 * Forward a diagnostic message to wfDebug().
	 *
	 * @param string $str
	 */
	protected function debug( $str ) {
		wfDebug( $str );
	}

	/**
	 * Take the next file name off the queue and open it on the reader.
	 *
	 * Sets atEnd to false when the file was opened and to true otherwise.
	 *
	 * @param string $caller Method name used as prefix of the debug message on failure
	 * @return bool Whether a file was opened
	 */
	private function openNextFile( $caller ) {
		$infile = array_shift( $this->infiles );

		// An empty name (empty argument or a stray ';' in the list) is handled
		// here as a plain open failure: XMLReader::open() throws a ValueError
		// for an empty URI on PHP 8 instead of returning false.
		$opened = is_string( $infile ) && $infile !== ''
			&& $this->reader->open( $infile, null, LIBXML_PARSEHUGE );

		if ( !$opened ) {
			$this->debug( $caller . ' was unable to open xml' );
		}
		$this->atEnd = !$opened;

		return $opened;
	}

	/**
	 * Advance to the next <page> and record its ID. When the current file
	 * has no further pages, switch to the next queued input file, if any.
	 */
	private function nextPage() {
		if ( $this->skipTo( 'page', 'mediawiki' ) ) {
			if ( $this->skipTo( 'id' ) ) {
				$this->lastPage = intval( $this->nodeContents() );
				$this->lastRev = 0;
				$this->atPageEnd = false;
			}
			return;
		}

		// Nothing left in this file; close() also flags atEnd, which
		// openNextFile() resets if another file can be opened.
		$this->close();
		if ( count( $this->infiles ) ) {
			$this->openNextFile( __METHOD__ );
		}
	}

	/**
	 * Advance to the next <revision> of the current page and record its ID,
	 * or flag the end of the page when there is none.
	 */
	private function nextRev() {
		if ( !$this->skipTo( 'revision' ) ) {
			$this->atPageEnd = true;
			return;
		}
		if ( $this->skipTo( 'id' ) ) {
			$this->lastRev = intval( $this->nodeContents() );
		}
	}

	/**
	 * Read the next <text> element of the current revision.
	 *
	 * @return string|null Null when </revision> or the end of input comes first
	 */
	private function nextText() {
		if ( !$this->skipTo( 'text', 'revision' ) ) {
			return null;
		}
		return strval( $this->nodeContents() );
	}

	/**
	 * Read forward until an opening tag called $name is found.
	 *
	 * @param string $name Element to look for
	 * @param string $parent Element whose closing tag ends the search
	 * @return bool|null True when positioned on <$name>; false when already
	 *   at the end or when </$parent> came first; null when the input ran
	 *   out (the reader is closed in that case)
	 */
	private function skipTo( $name, $parent = 'page' ) {
		if ( $this->atEnd ) {
			return false;
		}
		while ( $this->reader->read() ) {
			$type = $this->reader->nodeType;
			$tag = $this->reader->name;
			if ( $type === XMLReader::ELEMENT && $tag === $name ) {
				return true;
			}
			if ( $type === XMLReader::END_ELEMENT && $tag === $parent ) {
				$this->debug( "BaseDump::skipTo found </$parent> searching for <$name>" );
				return false;
			}
		}
		// read() failed or hit the end of the input
		return $this->close();
	}

	/**
	 * Shouldn't something like this be built-in to XMLReader?
	 * Fetches text contents of the current element, assuming
	 * no sub-elements or such scary things.
	 *
	 * @return string|null Null when at the end of input or when the input
	 *   runs out before the closing tag
	 */
	private function nodeContents() {
		if ( $this->atEnd ) {
			return null;
		}
		if ( $this->reader->isEmptyElement ) {
			return "";
		}
		$buffer = "";
		while ( $this->reader->read() ) {
			switch ( $this->reader->nodeType ) {
				// XMLReader::WHITESPACE nodes are intentionally not collected
				case XMLReader::TEXT:
				case XMLReader::SIGNIFICANT_WHITESPACE:
					$buffer .= $this->reader->value;
					break;
				case XMLReader::END_ELEMENT:
					return $buffer;
			}
		}
		return $this->close();
	}

	/**
	 * Close the reader and mark the input as exhausted.
	 *
	 * @return null
	 */
	public function close() {
		$this->reader->close();
		$this->atEnd = true;
		return null;
	}
}
File Metadata
Details
Attached
Mime Type
text/x-php
Expires
Fri, Jul 3, 17:53 (19 h, 47 m)
Storage Engine
local-disk
Storage Format
Raw Data
Storage Handle
96/1e/d925db9c48a6642bbed7bd8df319
Default Alt Text
BaseDump.php (6 KB)
Attached To
Mode
rMWPROD MediaWiki Production
Attached
Detach File
Event Timeline
Log In to Comment