Page Menu
Home
WickedGov Phorge
Search
Configure Global Search
Log In
Files
F2751739
cleanup.php
No One
Temporary
Actions
Download File
Edit File
Delete File
View Transforms
Subscribe
Flag For Later
Award Token
Size
4 KB
Referenced Files
None
Subscribers
None
cleanup.php
View Options
<?php
/**
* An aggressive spam cleanup script.
* Searches the database for matching pages, and reverts them to
* the last non-spammed revision.
* If all revisions contain spam, blanks the page
*/
use
MediaWiki\Content\ContentHandler
;
use
MediaWiki\Content\TextContent
;
use
MediaWiki\Extension\SpamBlacklist\BaseBlacklist
;
use
MediaWiki\Maintenance\Maintenance
;
use
MediaWiki\MediaWikiServices
;
use
MediaWiki\Page\WikiPageFactory
;
use
MediaWiki\Revision\RevisionLookup
;
use
MediaWiki\Revision\RevisionRecord
;
use
MediaWiki\Revision\SlotRecord
;
use
MediaWiki\Title\Title
;
use
MediaWiki\Title\TitleFormatter
;
use
MediaWiki\User\User
;
// @codeCoverageIgnoreStart
$IP
=
getenv
(
'MW_INSTALL_PATH'
);
if
(
$IP
===
false
)
{
$IP
=
__DIR__
.
'/../../..'
;
}
require_once
"$IP/maintenance/Maintenance.php"
;
// @codeCoverageIgnoreEnd
class
Cleanup
extends
Maintenance
{
/** @var RevisionLookup */
private
$revisionLookup
;
/** @var TitleFormatter */
private
$titleFormatter
;
/** @var WikiPageFactory */
private
$wikiPageFactory
;
public
function
__construct
()
{
parent
::
__construct
();
$this
->
revisionLookup
=
MediaWikiServices
::
getInstance
()->
getRevisionLookup
();
$this
->
titleFormatter
=
MediaWikiServices
::
getInstance
()->
getTitleFormatter
();
$this
->
wikiPageFactory
=
MediaWikiServices
::
getInstance
()->
getWikiPageFactory
();
$this
->
requireExtension
(
'SpamBlacklist'
);
$this
->
addOption
(
'dry-run'
,
'Only do a dry run'
);
}
public
function
execute
()
{
$user
=
User
::
newSystemUser
(
'Spam cleanup script'
,
[
'steal'
=>
true
]
);
$sb
=
BaseBlacklist
::
getSpamBlacklist
();
$regexes
=
$sb
->
getBlacklists
();
if
(
!
$regexes
)
{
$this
->
fatalError
(
"Invalid regex, can't clean up spam"
);
}
$dryRun
=
$this
->
hasOption
(
'dry-run'
);
$dbr
=
$this
->
getReplicaDB
();
$maxID
=
(
int
)
$dbr
->
newSelectQueryBuilder
()
->
select
(
'MAX(page_id)'
)
->
from
(
'page'
)
->
caller
(
__METHOD__
)
->
fetchField
();
$reportingInterval
=
100
;
$this
->
output
(
"Regexes are "
.
implode
(
', '
,
array_map
(
'strlen'
,
$regexes
)
)
.
" bytes
\n
"
);
$this
->
output
(
"Searching for spam in $maxID pages...
\n
"
);
if
(
$dryRun
)
{
$this
->
output
(
"Dry run only
\n
"
);
}
for
(
$id
=
1
;
$id
<=
$maxID
;
$id
++
)
{
if
(
$id
%
$reportingInterval
==
0
)
{
printf
(
"%-8d %-5.2f%%
\r
"
,
$id
,
$id
/
$maxID
*
100
);
}
$revision
=
$this
->
revisionLookup
->
getRevisionByPageId
(
$id
);
if
(
$revision
)
{
$content
=
$revision
->
getContent
(
SlotRecord
::
MAIN
);
$text
=
(
$content
instanceof
TextContent
)
?
$content
->
getText
()
:
null
;
if
(
$text
)
{
foreach
(
$regexes
as
$regex
)
{
if
(
preg_match
(
$regex
,
$text
,
$matches
)
)
{
$titleText
=
$this
->
titleFormatter
->
getPrefixedText
(
$revision
->
getPageAsLinkTarget
()
);
if
(
$dryRun
)
{
$this
->
output
(
"Found spam in [[$titleText]]
\n
"
);
}
else
{
$this
->
output
(
"Cleaning up links to {$matches[0]} in [[$titleText]]
\n
"
);
$match
=
str_replace
(
'http://'
,
''
,
$matches
[
0
]
);
$this
->
cleanupArticle
(
$revision
,
$regexes
,
$match
,
$user
);
}
}
}
}
}
}
// Just for satisfaction
printf
(
"%-8d %-5.2f%%
\n
"
,
$id
-
1
,
(
$id
-
1
)
/
$maxID
*
100
);
}
/**
* Find the latest revision of the article that does not contain spam and revert to it
* @param RevisionRecord $rev
* @param array $regexes
* @param string $match
* @param User $user
*/
private
function
cleanupArticle
(
RevisionRecord
$rev
,
$regexes
,
$match
,
User
$user
)
{
$title
=
Title
::
newFromLinkTarget
(
$rev
->
getPageAsLinkTarget
()
);
while
(
$rev
)
{
$matches
=
false
;
$content
=
$rev
->
getContent
(
SlotRecord
::
MAIN
);
foreach
(
$regexes
as
$regex
)
{
$matches
=
$matches
||
preg_match
(
$regex
,
(
$content
instanceof
TextContent
)
?
$content
->
getText
()
:
null
);
}
if
(
!
$matches
)
{
// Didn't find any spam
break
;
}
$rev
=
$this
->
revisionLookup
->
getPreviousRevision
(
$rev
);
}
if
(
!
$rev
)
{
// Didn't find a non-spammy revision, blank the page
$this
->
output
(
"All revisions are spam, blanking...
\n
"
);
$content
=
ContentHandler
::
makeContent
(
''
,
$title
);
$comment
=
"All revisions matched the spam blacklist ($match), blanking"
;
}
else
{
// Revert to this revision
$content
=
$rev
->
getContent
(
SlotRecord
::
MAIN
)
?:
ContentHandler
::
makeContent
(
''
,
$title
);
$comment
=
"Cleaning up links to $match"
;
}
$wikiPage
=
$this
->
wikiPageFactory
->
newFromTitle
(
$title
);
$wikiPage
->
doUserEditContent
(
$content
,
$user
,
$comment
);
}
}
// @codeCoverageIgnoreStart
$maintClass
=
Cleanup
::
class
;
require_once
RUN_MAINTENANCE_IF_MAIN
;
// @codeCoverageIgnoreEnd
File Metadata
Details
Attached
Mime Type
text/x-php
Expires
Fri, Jul 3, 19:06 (1 d, 20 h)
Storage Engine
local-disk
Storage Format
Raw Data
Storage Handle
5a/9e/aca6350ad96d067cee8ba60157b8
Default Alt Text
cleanup.php (4 KB)
Attached To
Mode
rMWPROD MediaWiki Production
Attached
Detach File
Event Timeline
Log In to Comment