Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(PLATFORM-10779): restore partial support for linkstoexternal and introduce replacement via linkstoexternaldomain and linkstoexternalpath #14

Merged
merged 1 commit into from
Mar 19, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
44 changes: 44 additions & 0 deletions includes/ExternalDomainPatternParser.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
<?php

namespace MediaWiki\Extension\DynamicPageList3;

use MediaWiki\Extension\DynamicPageList3\Tests\DPLExternalDomainPatternParserTest;

trait ExternalDomainPatternParser {
/**
* We provide:
* * full support for "standalone" wildcard usage (eg. `%.fandom.com`)
* * partial support for wildcard usage when it is not separated by `.` (eg. `%fandom.com would match starwars.fandom-suffix.com)
* * protocols followed by the `://` are supported, like `http://` or `https://` (`mailto:` on the other hand is not supported)
*
* @See DPLExternalDomainPatternParserTest for example cases
*/
private function parseDomainPattern( string $pattern ): string {
$protocol = false;
// Protocol is specified. Strip it
if ( str_contains( $pattern, '://' ) ) {
[$protocol, $pattern] = explode( '://', $pattern );
}

// Previous step will strip protocol if it was specified
[$domainPattern, ] = explode( '/', $pattern, 2 );
$parts = explode( '.', $domainPattern );
$reversed = array_reverse( $parts );
foreach ( $reversed as &$part ) {
if ( $part === '%' ) {
continue;
}
if ( str_starts_with( $part, '%' ) ) {
$part .= '%';
} else if ( str_ends_with( $part, '%' ) ) {
$part = '%' . $part;
}
}

$rawPattern = implode( '.', $reversed );
if ( $protocol ) {
return "$protocol://$rawPattern.";
}
return "%://$rawPattern.";
}
}
34 changes: 30 additions & 4 deletions includes/ParametersData.php
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,9 @@ class ParametersData {
'categoryregexp',
'firstrevisionsince',
'lastrevisionbefore',
'linkstoexternal',
'linkstoexternaldomain',
'linkstoexternalpath',
'maxrevisions',
'minrevisions',
'notcategorymatch',
Expand All @@ -149,7 +152,6 @@ class ParametersData {
],
// Should never be used; likely broken or will cause exceptions
5 => [
'linkstoexternal',
],
];

Expand Down Expand Up @@ -577,13 +579,37 @@ class ParametersData {
'page_name_must_exist' => true,
'set_criteria_found' => true
],
/**
* Alias for `linkstoexternaldomain`.
* To mimic old behaviour use `linkstoexternaldomain` together with `linkstoexternalpath`
*/
'linkstoexternal' => [
'default' => null,
'open_ref_conflict' => true,
'page_name_list' => true,
'page_name_must_exist' => false,
'set_criteria_found' => true
],
/**
* This parameter restricts the output to articles which contain an external
* reference that conatins a certain pattern.
* domain reference that contains a certain pattern.
*
* Examples: linkstoexternal= www.xyz.com|www.xyz2.com
* Examples: linkstoexternaldomain=www.xyz.com|www.xyz2.%
*/
'linkstoexternal' => [
'linkstoexternaldomain' => [
'default' => null,
'open_ref_conflict' => true,
'page_name_list' => true,
'page_name_must_exist' => false,
'set_criteria_found' => true
],
/**
* This parameter restricts the output to articles which contain an external
* path reference that contains a certain pattern.
*
* Examples: linkstoexternalpath=/xyz/%|%/abc/%
*/
'linkstoexternalpath' => [
'default' => null,
'open_ref_conflict' => true,
'page_name_list' => true,
Expand Down
97 changes: 75 additions & 22 deletions includes/Query.php
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,10 @@
use Wikimedia\ObjectCache\WANObjectCache;
use Wikimedia\Rdbms\Database;
use Wikimedia\Rdbms\IDatabase;
use Wikimedia\Rdbms\Platform\ISQLPlatform;

class Query {
use ExternalDomainPatternParser;
/**
* Parameters Object
*
Expand Down Expand Up @@ -1588,40 +1590,91 @@ private function _notlinksto( $option ) {
* @param mixed $option
*/
private function _linkstoexternal( $option ) {
$this->_linkstoexternaldomain( $option );
}

/**
* Set SQL for 'linkstoexternaldomain' parameter.
*
* @param mixed $option
*/
private function _linkstoexternaldomain( $option ) {
if ( $this->parameters->getParameter( 'distinct' ) == 'strict' ) {
$this->addGroupBy( 'page_title' );
}

if ( count( $option ) > 0 ) {
$this->addTable( 'externallinks', 'el' );
$this->addSelect( [ 'el_to' => 'el.el_to' ] );
if ( count( $option ) == 0 ) {
// Nothing to do
return;
}
$this->addTable( 'externallinks', 'el' );
$this->addSelect( [ 'el_to_domain_index' => 'el.el_to_domain_index' ] );

foreach ( $option as $index => $linkGroup ) {
if ( $index == 0 ) {
$where = $this->tableNames['page'] . '.page_id=el.el_from AND ';
$ors = [];
foreach ( $option as $index => $domains ) {
$domainPatterns = array_map(
fn ( string $domain ) => $this->parseDomainPattern( $domain ),
$domains
);
if ( $index == 0 ) {
$ors = array_map(
fn ( $pattern ) => "el.el_to_domain_index LIKE {$this->dbr->addQuotes( $pattern )}",
$domainPatterns
);

foreach ( $linkGroup as $link ) {
$ors[] = 'el.el_to LIKE ' . $this->dbr->addQuotes( $link );
}
$where = "{$this->tableNames['page']}.page_id=el.el_from AND ({$this->dbr->makeList( $ors, ISQLPlatform::LIST_OR )})";
} else {
$ors = array_map(
fn ( $pattern ) => "{$this->tableNames['externallinks']}.el_to_domain_index LIKE {$this->dbr->addQuotes( $pattern )}",
$domainPatterns
);

$where .= '(' . implode( ' OR ', $ors ) . ')';
} else {
$where = 'EXISTS(SELECT el_from FROM ' . $this->tableNames['externallinks'] .
' WHERE (' . $this->tableNames['externallinks'] . '.el_from=page_id AND ';
$where = "EXISTS(SELECT el_from FROM {$this->tableNames['externallinks']} " .
" WHERE ({$this->tableNames['externallinks']}.el_from=page_id " .
" AND ({$this->dbr->makeList( $ors, ISQLPlatform::LIST_OR )})))";
}

$ors = [];
$this->addWhere( $where );
}
}

foreach ( $linkGroup as $link ) {
$ors[] = $this->tableNames['externallinks'] . '.el_to LIKE ' . $this->dbr->addQuotes( $link );
}
/**
* Set SQL for 'linkstoexternalpath' parameter.
*
* @param mixed $option
*/
private function _linkstoexternalpath( $option ) {
if ( $this->parameters->getParameter( 'distinct' ) == 'strict' ) {
$this->addGroupBy( 'page_title' );
}

$where .= '(' . implode( ' OR ', $ors ) . ')';
$where .= '))';
}
if ( count( $option ) == 0 ) {
// Nothing to do
return;
}

$this->addWhere( $where );
$this->addTable( 'externallinks', 'el' );
$this->addSelect( [ 'el_to_path' => 'el.el_to_path' ] );

foreach ( $option as $index => $paths ) {
if ( $index == 0 ) {
$ors = array_map(
fn ( $path ) => "el.el_to_path LIKE {$this->dbr->addQuotes( $path )}",
$paths
);

$where = "{$this->tableNames['page']}.page_id=el.el_from AND ({$this->dbr->makeList( $ors, ISQLPlatform::LIST_OR )})";
} else {
$ors = array_map(
fn ( $path ) => "{$this->tableNames['externallinks']}.el_to_path LIKE {$this->dbr->addQuotes( $path )}",
$paths
);

$where = "EXISTS(SELECT el_from FROM {$this->tableNames['externallinks']} " .
" WHERE ({$this->tableNames['externallinks']}.el_from=page_id " .
" AND ({$this->dbr->makeList( $ors, ISQLPlatform::LIST_OR )})))";
}

$this->addWhere( $where );
}
}

Expand Down
64 changes: 64 additions & 0 deletions tests/phpunit/DPLExternalDomainPatternParserTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
<?php

namespace MediaWiki\Extension\DynamicPageList3\Tests;
use MediaWiki\Extension\DynamicPageList3\ExternalDomainPatternParser;
use PHPUnit\Framework\Attributes\DataProvider;
use PHPUnit\Framework\TestCase;

/**
* @group DynamicPageList3
*/
class DPLExternalDomainPatternParserTest extends TestCase {
use ExternalDomainPatternParser;

#[DataProvider( 'getDomainPattern' )]
public function testParseDomainPattern( string $domain, string $expected ): void {
$actual = $this->parseDomainPattern( $domain );
$this->assertSame( $expected, $actual );
}

public static function getDomainPattern(): array {
return [
// Full domain with extra path and without any wildcards
[ 'http://www.fandom.com/test123/test?test=%', 'http://com.fandom.www.' ],
// Protocol is preserved if specified (only protocols separated by `://` are supported)
[ 'irc://starwars.%/test123/test', 'irc://%.starwars.' ],
[ 'https://starwars.%/test123/test', 'https://%.starwars.' ],
// Domain with `%` at the end
[ 'http://starwars.%/test123/test?test=%', 'http://%.starwars.' ],
// Domain with `%` at the begging. We have to guess the protocol
[ '%.fandom.com/test123/test?test=%', '%://com.fandom.%.' ],
// Domain with wildcard at the begging without separation
[ '%fandom.com/test123/test?test=%', '%://com.%fandom%.' ],
// Domain with wildcard in the middle, separated by `.`
[ 'www.%.com/test123/test?test=%', '%://com.%.www.' ],
// Domain with wildcard at the begging separated by `.` from one side
[ 'www.%fandom.com', '%://com.%fandom%.www.' ],
// Domain with wildcard at the begging separated by `.` from the other side
[ 'www.fandom%.com', '%://com.%fandom%.www.' ],
// Duplicated wildcard doesn't matter
[ 'www.%%fandom.com', '%://com.%%fandom%.www.' ],
];
}

/**
* This test documents cases that are not correctly supported
*/
#[DataProvider( 'getUnsupportedDomainPattern' )]
public function testUnsupportedDomainPatterns( string $domain, string $expected ): void {
$actual = $this->parseDomainPattern( $domain );
$this->assertSame( $expected, $actual );
}

public static function getUnsupportedDomainPattern(): array {
return [
// We are not supporting `_` as a `.`
[ 'http://www.fandom_com', 'http://fandom_com.www.' ],
// Domain with wildcard in the middle not followed by `.` is not processed
[ 'ww%fandom.com', '%://com.ww%fandom.' ],
[ '%www%fandom.com', '%://com.%www%fandom%.' ],
// When wildcard should cover `/` we would generate garbage
[ '%fandom.%?test=%', '%://%?test=%%.%fandom%.' ],
];
}
}