Skip to content

Commit 7b6e962

Browse files
author
mikolajmeller
committed
feat(PLATFORM-10779): restore partial support for linkstoexternal and introduce replacement via linkstoexternaldomain and linkstoexternalpath
1 parent cb95b6d commit 7b6e962

File tree

4 files changed

+213
-26
lines changed

4 files changed

+213
-26
lines changed
+44
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
<?php
2+
3+
namespace MediaWiki\Extension\DynamicPageList3;
4+
5+
use MediaWiki\Extension\DynamicPageList3\Tests\DPLExternalDomainPatternParserTest;
6+
7+
trait ExternalDomainPatternParser {
	/**
	 * Convert a user-supplied external link pattern into a LIKE pattern whose
	 * domain labels are reversed and terminated by a trailing `.`
	 * (eg. `http://www.fandom.com/page` becomes `http://com.fandom.www.`).
	 *
	 * Anything after the first `/` that follows the host is discarded. When no
	 * protocol is given, `%` is used in its place so that any protocol matches.
	 *
	 * We provide:
	 * * full support for "standalone" wildcard usage (eg. `%.fandom.com`)
	 * * partial support for wildcard usage when it is not separated by `.`
	 *   (eg. `%fandom.com` would match `starwars.fandom-suffix.com`)
	 * * protocols followed by `://` are supported, like `http://` or `https://`
	 *   (`mailto:` on the other hand is not supported)
	 *
	 * @see DPLExternalDomainPatternParserTest for example cases
	 *
	 * @param string $pattern Pattern as typed by the user; may include a protocol, a path and `%` wildcards
	 * @return string Pattern in the form `<protocol or %>://<reversed labels joined by .>.`
	 */
	private function parseDomainPattern( string $pattern ): string {
		$protocol = '';
		// Protocol is specified. Strip it
		if ( str_contains( $pattern, '://' ) ) {
			[ $protocol, $pattern ] = explode( '://', $pattern );
		}

		// Previous step stripped the protocol, so the first `/` starts the path
		[ $domainPattern ] = explode( '/', $pattern, 2 );

		// Map over the reversed labels instead of iterating by reference, so no
		// dangling reference to the last element is left behind.
		$labels = array_map(
			static function ( string $label ): string {
				// A standalone wildcard already stands for a whole label
				if ( $label === '%' ) {
					return $label;
				}
				// Reversing the label order moves the text an embedded wildcard has to
				// cover to the other side of the label, so mirror the wildcard there.
				if ( str_starts_with( $label, '%' ) ) {
					return $label . '%';
				}
				if ( str_ends_with( $label, '%' ) ) {
					return '%' . $label;
				}
				return $label;
			},
			array_reverse( explode( '.', $domainPattern ) )
		);

		$rawPattern = implode( '.', $labels );
		// Fall back to a wildcard protocol when none was supplied
		$prefix = $protocol ?: '%';

		return "{$prefix}://{$rawPattern}.";
	}
}

includes/ParametersData.php

+30-4
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,9 @@ class ParametersData {
133133
'categoryregexp',
134134
'firstrevisionsince',
135135
'lastrevisionbefore',
136+
'linkstoexternal',
137+
'linkstoexternaldomain',
138+
'linkstoexternalpath',
136139
'maxrevisions',
137140
'minrevisions',
138141
'notcategorymatch',
@@ -149,7 +152,6 @@ class ParametersData {
149152
],
150153
// Should never be used; likely broken or will cause exceptions
151154
5 => [
152-
'linkstoexternal',
153155
],
154156
];
155157

@@ -577,13 +579,37 @@ class ParametersData {
577579
'page_name_must_exist' => true,
578580
'set_criteria_found' => true
579581
],
582+
/**
583+
* Alias for `linkstoexternaldomain`.
584+
* To mimic old behaviour use `linkstoexternaldomain` together with `linkstoexternalpath`
585+
*/
586+
'linkstoexternal' => [
587+
'default' => null,
588+
'open_ref_conflict' => true,
589+
'page_name_list' => true,
590+
'page_name_must_exist' => false,
591+
'set_criteria_found' => true
592+
],
580593
/**
581594
* This parameter restricts the output to articles which contain an external
582-
* reference that conatins a certain pattern.
595+
* domain reference that contains a certain pattern.
583596
*
584-
* Examples: linkstoexternal= www.xyz.com|www.xyz2.com
597+
* Examples: linkstoexternaldomain=www.xyz.com|www.xyz2.%
585598
*/
586-
'linkstoexternal' => [
599+
'linkstoexternaldomain' => [
600+
'default' => null,
601+
'open_ref_conflict' => true,
602+
'page_name_list' => true,
603+
'page_name_must_exist' => false,
604+
'set_criteria_found' => true
605+
],
606+
/**
607+
* This parameter restricts the output to articles which contain an external
608+
* path reference that contains a certain pattern.
609+
*
610+
* Examples: linkstoexternalpath=/xyz/%|%/abc/%
611+
*/
612+
'linkstoexternalpath' => [
587613
'default' => null,
588614
'open_ref_conflict' => true,
589615
'page_name_list' => true,

includes/Query.php

+75-22
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,10 @@
1616
use Wikimedia\ObjectCache\WANObjectCache;
1717
use Wikimedia\Rdbms\Database;
1818
use Wikimedia\Rdbms\IDatabase;
19+
use Wikimedia\Rdbms\Platform\ISQLPlatform;
1920

2021
class Query {
22+
use ExternalDomainPatternParser;
2123
/**
2224
* Parameters Object
2325
*
@@ -1588,40 +1590,91 @@ private function _notlinksto( $option ) {
15881590
* @param mixed $option
15891591
*/
15901592
private function _linkstoexternal( $option ) {
1593+
// 'linkstoexternal' is handled as an alias: the option is passed unchanged
// to the 'linkstoexternaldomain' handler.
$this->_linkstoexternaldomain( $option );
1594+
}
1595+
1596+
/**
1597+
* Set SQL for 'linkstoexternaldomain' parameter.
1598+
*
1599+
* @param mixed $option
1600+
*/
1601+
private function _linkstoexternaldomain( $option ) {
15911602
// With distinct=strict, group the result rows by page title.
if ( $this->parameters->getParameter( 'distinct' ) == 'strict' ) {
15921603
$this->addGroupBy( 'page_title' );
15931604
}
15941605

1595-
if ( count( $option ) > 0 ) {
1596-
$this->addTable( 'externallinks', 'el' );
1597-
$this->addSelect( [ 'el_to' => 'el.el_to' ] );
1606+
if ( count( $option ) == 0 ) {
1607+
// Nothing to do
1608+
return;
1609+
}
1610+
// Register the externallinks table under the alias `el` and select its
// el_to_domain_index column.
$this->addTable( 'externallinks', 'el' );
1611+
$this->addSelect( [ 'el_to_domain_index' => 'el.el_to_domain_index' ] );
15981612

1599-
foreach ( $option as $index => $linkGroup ) {
1600-
if ( $index == 0 ) {
1601-
$where = $this->tableNames['page'] . '.page_id=el.el_from AND ';
1602-
$ors = [];
1613+
// Each entry of $option is a list of domains; every entry adds one WHERE
// condition whose LIKE alternatives are OR-ed together.
foreach ( $option as $index => $domains ) {
1614+
// Rewrite every user-supplied domain with parseDomainPattern() before
// it is used as a LIKE pattern.
$domainPatterns = array_map(
1615+
fn ( string $domain ) => $this->parseDomainPattern( $domain ),
1616+
$domains
1617+
);
1618+
if ( $index == 0 ) {
1619+
// First group: tie the page to the aliased `el` table directly.
$ors = array_map(
1620+
fn ( $pattern ) => "el.el_to_domain_index LIKE {$this->dbr->addQuotes( $pattern )}",
1621+
$domainPatterns
1622+
);
16031623

1604-
foreach ( $linkGroup as $link ) {
1605-
$ors[] = 'el.el_to LIKE ' . $this->dbr->addQuotes( $link );
1606-
}
1624+
$where = "{$this->tableNames['page']}.page_id=el.el_from AND ({$this->dbr->makeList( $ors, ISQLPlatform::LIST_OR )})";
1625+
} else {
1626+
// Every further group becomes an EXISTS subquery against the
// externallinks table.
$ors = array_map(
1627+
fn ( $pattern ) => "{$this->tableNames['externallinks']}.el_to_domain_index LIKE {$this->dbr->addQuotes( $pattern )}",
1628+
$domainPatterns
1629+
);
16071630

1608-
$where .= '(' . implode( ' OR ', $ors ) . ')';
1609-
} else {
1610-
$where = 'EXISTS(SELECT el_from FROM ' . $this->tableNames['externallinks'] .
1611-
' WHERE (' . $this->tableNames['externallinks'] . '.el_from=page_id AND ';
1631+
$where = "EXISTS(SELECT el_from FROM {$this->tableNames['externallinks']} " .
1632+
" WHERE ({$this->tableNames['externallinks']}.el_from=page_id " .
1633+
" AND ({$this->dbr->makeList( $ors, ISQLPlatform::LIST_OR )})))";
1634+
}
16121635

1613-
$ors = [];
1636+
$this->addWhere( $where );
1637+
}
1638+
}
16141639

1615-
foreach ( $linkGroup as $link ) {
1616-
$ors[] = $this->tableNames['externallinks'] . '.el_to LIKE ' . $this->dbr->addQuotes( $link );
1617-
}
1640+
/**
1641+
* Set SQL for 'linkstoexternalpath' parameter.
1642+
*
1643+
* @param mixed $option
1644+
*/
1645+
private function _linkstoexternalpath( $option ) {
1646+
// With distinct=strict, group the result rows by page title.
if ( $this->parameters->getParameter( 'distinct' ) == 'strict' ) {
1647+
$this->addGroupBy( 'page_title' );
1648+
}
16181649

1619-
$where .= '(' . implode( ' OR ', $ors ) . ')';
1620-
$where .= '))';
1621-
}
1650+
if ( count( $option ) == 0 ) {
1651+
// Nothing to do
1652+
return;
1653+
}
16221654

1623-
$this->addWhere( $where );
1655+
// Register the externallinks table under the alias `el` and select its
// el_to_path column.
$this->addTable( 'externallinks', 'el' );
1656+
$this->addSelect( [ 'el_to_path' => 'el.el_to_path' ] );
1657+
1658+
// Each entry of $option is a list of paths; every entry adds one WHERE
// condition whose LIKE alternatives are OR-ed together.
foreach ( $option as $index => $paths ) {
1659+
if ( $index == 0 ) {
1660+
// First group: tie the page to the aliased `el` table directly.
// Paths are only quoted, not rewritten, before being used as LIKE patterns.
$ors = array_map(
1661+
fn ( $path ) => "el.el_to_path LIKE {$this->dbr->addQuotes( $path )}",
1662+
$paths
1663+
);
1664+
1665+
$where = "{$this->tableNames['page']}.page_id=el.el_from AND ({$this->dbr->makeList( $ors, ISQLPlatform::LIST_OR )})";
1666+
} else {
1667+
// Every further group becomes an EXISTS subquery against the
// externallinks table.
$ors = array_map(
1668+
fn ( $path ) => "{$this->tableNames['externallinks']}.el_to_path LIKE {$this->dbr->addQuotes( $path )}",
1669+
$paths
1670+
);
1671+
1672+
$where = "EXISTS(SELECT el_from FROM {$this->tableNames['externallinks']} " .
1673+
" WHERE ({$this->tableNames['externallinks']}.el_from=page_id " .
1674+
" AND ({$this->dbr->makeList( $ors, ISQLPlatform::LIST_OR )})))";
16241675
}
1676+
1677+
$this->addWhere( $where );
16251678
}
16261679
}
16271680

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
<?php
2+
3+
namespace MediaWiki\Extension\DynamicPageList3\Tests;
4+
use MediaWiki\Extension\DynamicPageList3\ExternalDomainPatternParser;
5+
use PHPUnit\Framework\Attributes\DataProvider;
6+
use PHPUnit\Framework\TestCase;
7+
8+
/**
9+
* @group DynamicPageList3
10+
*/
11+
class DPLExternalDomainPatternParserTest extends TestCase {
12+
use ExternalDomainPatternParser;
13+
14+
#[DataProvider( 'getDomainPattern' )]
15+
public function testParseDomainPattern( string $domain, string $expected ): void {
16+
$actual = $this->parseDomainPattern( $domain );
17+
$this->assertSame( $expected, $actual );
18+
}
19+
20+
public static function getDomainPattern(): array {
21+
return [
22+
// Full domain with extra path and without any wildcards
23+
[ 'http://www.fandom.com/test123/test?test=%', 'http://com.fandom.www.' ],
24+
// Protocol is preserved if specified (only protocols separated by `://` are supported)
25+
[ 'irc://starwars.%/test123/test', 'irc://%.starwars.' ],
26+
[ 'https://starwars.%/test123/test', 'https://%.starwars.' ],
27+
// Domain with `%` at the end
28+
[ 'http://starwars.%/test123/test?test=%', 'http://%.starwars.' ],
29+
// Domain with `%` at the begging. We have to guess the protocol
30+
[ '%.fandom.com/test123/test?test=%', '%://com.fandom.%.' ],
31+
// Domain with wildcard at the begging without separation
32+
[ '%fandom.com/test123/test?test=%', '%://com.%fandom%.' ],
33+
// Domain with wildcard in the middle, separated by `.`
34+
[ 'www.%.com/test123/test?test=%', '%://com.%.www.' ],
35+
// Domain with wildcard at the begging separated by `.` from one side
36+
[ 'www.%fandom.com', '%://com.%fandom%.www.' ],
37+
// Domain with wildcard at the begging separated by `.` from the other side
38+
[ 'www.fandom%.com', '%://com.%fandom%.www.' ],
39+
// Duplicated wildcard doesn't matter
40+
[ 'www.%%fandom.com', '%://com.%%fandom%.www.' ],
41+
];
42+
}
43+
44+
/**
45+
* This test documents cases that are not correctly supported
46+
*/
47+
#[DataProvider( 'getUnsupportedDomainPattern' )]
48+
public function testUnsupportedDomainPatterns( string $domain, string $expected ): void {
49+
$actual = $this->parseDomainPattern( $domain );
50+
$this->assertSame( $expected, $actual );
51+
}
52+
53+
public static function getUnsupportedDomainPattern(): array {
54+
return [
55+
// We are not supporting `_` as a `.`
56+
[ 'http://www.fandom_com', 'http://fandom_com.www.' ],
57+
// Domain with wildcard in the middle not followed by `.` is not processed
58+
[ 'ww%fandom.com', '%://com.ww%fandom.' ],
59+
[ '%www%fandom.com', '%://com.%www%fandom%.' ],
60+
// When wildcard should cover `/` we would generate garbage
61+
[ '%fandom.%?test=%', '%://%?test=%%.%fandom%.' ],
62+
];
63+
}
64+
}

0 commit comments

Comments
 (0)