From 33f32f4deaf70155ba10db70330abcc6f528ccc5 Mon Sep 17 00:00:00 2001 From: Chris Huber Date: Sun, 8 Mar 2026 15:22:36 +0000 Subject: [PATCH] feat(#731): delegate title matching to core SimilarityEngine Migrates data-machine-events to use the unified SimilarityEngine from core (Extra-Chill/data-machine#731) instead of maintaining its own duplicate title normalization and fuzzy matching. Changed: - EventIdentifierGenerator: extractCoreTitle(), titlesMatch(), normalize_dashes(), normalize_text() replaced with thin wrappers delegating to SimilarityEngine. Venue matching stays (domain-specific). - DuplicateDetectionAbilities: removed data-machine-events/titles-match ability (superseded by core datamachine/titles-match). Added event strategy registration on datamachine_duplicate_strategies filter so core check-duplicate ability can find event duplicates. - EventIdentifierGeneratorTest: fixed test_rightmost_delimiter_used (was testing nonexistent behavior), added venue matching tests and SimilarityEngine delegation verification test. Requires: data-machine >= 0.39.0 (SimilarityEngine + DuplicateCheckAbility) --- inc/Abilities/DuplicateDetectionAbilities.php | 126 +++++++----- inc/Utilities/EventIdentifierGenerator.php | 185 +++--------------- tests/Unit/EventIdentifierGeneratorTest.php | 97 ++++++++- 3 files changed, 187 insertions(+), 221 deletions(-) diff --git a/inc/Abilities/DuplicateDetectionAbilities.php b/inc/Abilities/DuplicateDetectionAbilities.php index 9ad9bff..8b10f55 100644 --- a/inc/Abilities/DuplicateDetectionAbilities.php +++ b/inc/Abilities/DuplicateDetectionAbilities.php @@ -2,10 +2,13 @@ /** * Duplicate Detection Abilities * - * Universal primitives for event identity matching. Exposes fuzzy title - * comparison, venue comparison, and combined duplicate-event search as - * abilities that any part of the system can consume (CLI, REST, Chat, - * import pipeline, MCP). + * Event-domain duplicate detection abilities. 
Venue comparison and the + * combined find-duplicate-event search remain event-specific. Title + * comparison delegates to the core SimilarityEngine. + * + * Also registers an event strategy on the `datamachine_duplicate_strategies` + * filter so the unified `datamachine/check-duplicate` ability can find + * event duplicates using venue + date + title matching. * * @package DataMachineEvents\Abilities * @since 0.15.0 @@ -13,6 +16,7 @@ namespace DataMachineEvents\Abilities; +use DataMachine\Core\Similarity\SimilarityEngine; use DataMachineEvents\Utilities\EventIdentifierGenerator; use DataMachineEvents\Core\Event_Post_Type; use const DataMachineEvents\Core\EVENT_DATETIME_META_KEY; @@ -28,13 +32,13 @@ class DuplicateDetectionAbilities { public function __construct() { if ( ! self::$registered ) { $this->registerAbilities(); + $this->registerStrategy(); self::$registered = true; } } private function registerAbilities(): void { $register_callback = function () { - $this->registerTitlesMatchAbility(); $this->registerVenuesMatchAbility(); $this->registerFindDuplicateEventAbility(); }; @@ -46,61 +50,81 @@ private function registerAbilities(): void { } } - // ----------------------------------------------------------------------- - // Ability: titles-match - // ----------------------------------------------------------------------- + /** + * Register event duplicate strategy on the unified filter. + * + * When core's `datamachine/check-duplicate` ability runs for the + * `event` post type, this strategy fires first (priority 10) and + * uses venue + date + fuzzy title matching. + */ + private function registerStrategy(): void { + add_filter( 'datamachine_duplicate_strategies', array( $this, 'addEventStrategy' ) ); + } - private function registerTitlesMatchAbility(): void { - wp_register_ability( - 'data-machine-events/titles-match', - array( - 'label' => __( 'Titles Match', 'data-machine-events' ), - 'description' => __( 'Compare two event titles for semantic equivalence. 
Strips tour names, supporting acts, and normalizes for fuzzy comparison.', 'data-machine-events' ), - 'category' => 'datamachine', - 'input_schema' => array( - 'type' => 'object', - 'required' => array( 'title1', 'title2' ), - 'properties' => array( - 'title1' => array( - 'type' => 'string', - 'description' => 'First event title', - ), - 'title2' => array( - 'type' => 'string', - 'description' => 'Second event title', - ), - ), - ), - 'output_schema' => array( - 'type' => 'object', - 'properties' => array( - 'match' => array( 'type' => 'boolean' ), - 'core1' => array( 'type' => 'string' ), - 'core2' => array( 'type' => 'string' ), - ), - ), - 'execute_callback' => array( $this, 'executeTitlesMatch' ), - 'permission_callback' => '__return_true', - 'meta' => array( 'show_in_rest' => true ), - ) + /** + * Add event duplicate strategy to the strategy registry. + * + * @param array $strategies Existing strategies. + * @return array Strategies with event strategy appended. + */ + public function addEventStrategy( array $strategies ): array { + $strategies[] = array( + 'id' => 'event_venue_date_title', + 'post_type' => Event_Post_Type::POST_TYPE, + 'callback' => array( $this, 'executeEventStrategy' ), + 'priority' => 10, ); + return $strategies; } /** - * Compare two event titles for semantic match. + * Event duplicate strategy callback. + * + * Called by core's `datamachine/check-duplicate` ability. Checks for + * duplicate events using venue + date + fuzzy title matching. * - * @param array $input { title1: string, title2: string } - * @return array { match: bool, core1: string, core2: string } + * @param array $input { title: string, context: { venue?: string, startDate?: string } } + * @return array Result with verdict key. */ - public function executeTitlesMatch( array $input ): array { - $title1 = $input['title1'] ?? ''; - $title2 = $input['title2'] ?? ''; + public function executeEventStrategy( array $input ): array { + $title = $input['title'] ?? 
''; + $context = $input['context'] ?? array(); + $venue = $context['venue'] ?? ''; + $startDate = $context['startDate'] ?? ''; - return array( - 'match' => EventIdentifierGenerator::titlesMatch( $title1, $title2 ), - 'core1' => EventIdentifierGenerator::extractCoreTitle( $title1 ), - 'core2' => EventIdentifierGenerator::extractCoreTitle( $title2 ), + if ( empty( $title ) || empty( $startDate ) ) { + return array( 'verdict' => 'clear' ); + } + + $result = $this->executeFindDuplicateEvent( + array( + 'title' => $title, + 'venue' => $venue, + 'startDate' => $startDate, + ) ); + + if ( ! empty( $result['found'] ) ) { + return array( + 'verdict' => 'duplicate', + 'source' => 'event_' . ( $result['match_strategy'] ?? 'fuzzy' ), + 'match' => array( + 'post_id' => $result['post_id'] ?? 0, + 'title' => $result['matched_title'] ?? '', + 'venue' => $result['matched_venue'] ?? '', + ), + 'reason' => sprintf( + 'Rejected: "%s" matches existing event "%s" (ID %d) via %s.', + $title, + $result['matched_title'] ?? '', + $result['post_id'] ?? 0, + $result['match_strategy'] ?? 'fuzzy' + ), + 'strategy' => 'event_venue_date_title', + ); + } + + return array( 'verdict' => 'clear' ); } // ----------------------------------------------------------------------- diff --git a/inc/Utilities/EventIdentifierGenerator.php b/inc/Utilities/EventIdentifierGenerator.php index 201f748..d3580d9 100644 --- a/inc/Utilities/EventIdentifierGenerator.php +++ b/inc/Utilities/EventIdentifierGenerator.php @@ -6,12 +6,18 @@ * Normalizes event data (title, venue, date) to create stable identifiers that * remain consistent across minor variations in source data. * + * Title normalization and fuzzy matching are delegated to the core + * SimilarityEngine (DataMachine\Core\Similarity\SimilarityEngine). + * Venue matching remains here — it's event-domain-specific. 
+ * * @package DataMachineEvents\Utilities * @since 0.2.0 */ namespace DataMachineEvents\Utilities; +use DataMachine\Core\Similarity\SimilarityEngine; + if ( ! defined( 'ABSPATH' ) ) { exit; } @@ -36,147 +42,37 @@ class EventIdentifierGenerator { * @return string MD5 hash identifier */ public static function generate( string $title, string $startDate, string $venue ): string { - $normalized_title = self::normalize_text( $title ); - $normalized_venue = self::normalize_text( $venue ); + $normalized_title = SimilarityEngine::normalizeBasic( $title ); + $normalized_venue = SimilarityEngine::normalizeBasic( $venue ); return md5( $normalized_title . $startDate . $normalized_venue ); } /** - * Normalize text for consistent identifier generation - * - * Applies transformations: - * - Lowercase - * - Trim whitespace - * - Collapse multiple spaces to single space - * - Remove common article prefixes ("the ", "a ", "an ") - * - * @param string $text Text to normalize - * @return string Normalized text - */ - private static function normalize_text( string $text ): string { - // Normalize unicode dashes to ASCII hyphen - $text = self::normalize_dashes( $text ); - - // Lowercase - $text = strtolower( $text ); - - // Trim and collapse whitespace - $text = trim( preg_replace( '/\s+/', ' ', $text ) ); - - // Remove common article prefixes - $text = preg_replace( '/^(the|a|an)\s+/i', '', $text ); - - return $text; - } - - /** - * Normalize unicode dash characters to ASCII hyphen + * Extract core identifying portion of event title * - * Scraped titles commonly use en dashes (–), em dashes (—), or other - * unicode dash variants interchangeably with ASCII hyphens (-). - * Normalizing prevents false dedup mismatches like: - * "bbno$ - The Internet Explorer Tour" vs "bbno$ – The Internet Explorer Tour" + * Delegates to the unified SimilarityEngine which consolidates the + * normalization logic from this class and core's DuplicateDetection. 
* - * @param string $text Input text - * @return string Text with all dashes normalized to ASCII hyphen + * @param string $title Event title + * @return string Core title for comparison */ - private static function normalize_dashes( string $text ): string { - $unicode_dashes = array( - "\u{2010}", // hyphen - "\u{2011}", // non-breaking hyphen - "\u{2012}", // figure dash - "\u{2013}", // en dash - "\u{2014}", // em dash - "\u{2015}", // horizontal bar - "\u{FE58}", // small em dash - "\u{FE63}", // small hyphen-minus - "\u{FF0D}", // fullwidth hyphen-minus - ); - - return str_replace( $unicode_dashes, '-', $text ); + public static function extractCoreTitle( string $title ): string { + return SimilarityEngine::normalizeTitle( $title ); } /** - * Extract core identifying portion of event title - * - * Strips tour names, supporting acts, and normalizes for comparison. - * Used for fuzzy matching across sources with different title formats. + * Compare two event titles for semantic match * - * Examples: - * - "Andy Frasco & the U.N. — Growing Pains Tour with Candi Jenkins" → "andy frasco u.n." - * - "Andy Frasco & The U.N." → "andy frasco u.n." - * - "Jazz Night: Holiday Special" → "jazz night" + * Delegates to the unified SimilarityEngine which runs exact, + * prefix, and Levenshtein strategies. * - * @param string $title Event title - * @return string Core title for comparison + * @param string $title1 First event title + * @param string $title2 Second event title + * @return bool True if titles represent the same event */ - public static function extractCoreTitle( string $title ): string { - $text = strtolower( self::normalize_dashes( $title ) ); - - // Split on common delimiters that typically separate main event from tour/opener info. - // Dashes are already normalized to ASCII hyphen by normalize_dashes(), so we match - // the ASCII equivalents here (not unicode originals). - // Note: standalone hyphen omitted to preserve band names like "Run-DMC". 
- $delimiters = array( - ' - ', // ASCII hyphen with spaces (normalized from em/en dash) - ' : ', // colon with spaces - ': ', // colon - ' | ', // pipe - '|', // pipe - ' featuring ', - ' feat. ', - ' feat ', - ' ft. ', - ' ft ', - ' with ', - ' w/ ', - ' + ', - ); - - // Find the first delimiter occurrence to extract the headliner/core. - $best_pos = PHP_INT_MAX; - $best_delimiter = null; - - foreach ( $delimiters as $delimiter ) { - $pos = strpos( $text, $delimiter ); - if ( false !== $pos && $pos > 0 && $pos < $best_pos ) { - $best_pos = $pos; - $best_delimiter = $delimiter; - } - } - - if ( null !== $best_delimiter ) { - $parts = explode( $best_delimiter, $text, 2 ); - $text = $parts[0]; - } - - // Comma-separated artist lists: treat first segment as the headliner. - // "Comfort Club, Valories, Barb" → "Comfort Club" - // Only split if the part before the first comma is substantial (>2 chars). - if ( strpos( $text, ',' ) !== false ) { - $comma_parts = explode( ',', $text, 2 ); - $first = trim( $comma_parts[0] ); - if ( strlen( $first ) > 2 ) { - $text = $first; - } - } - - // Remove articles at word boundaries - $text = preg_replace( '/\b(the|a|an)\b/i', '', $text ); - - // Remove non-alphanumeric characters (keep spaces) - $text = preg_replace( '/[^a-z0-9\s]/i', '', $text ); - - // Collapse whitespace and trim - $text = trim( preg_replace( '/\s+/', ' ', $text ) ); - - // If result is too short, return normalized original instead - if ( strlen( $text ) < 3 ) { - return self::normalize_text( $title ); - } - - return $text; + public static function titlesMatch( string $title1, string $title2 ): bool { + return SimilarityEngine::titlesMatch( $title1, $title2 )->match; } /** @@ -233,7 +129,7 @@ private static function normalize_venue( string $venue ): string { $text = html_entity_decode( $venue, ENT_QUOTES | ENT_HTML5, 'UTF-8' ); // Normalize unicode dashes to ASCII hyphen. 
- $text = self::normalize_dashes( $text ); + $text = SimilarityEngine::normalizeDashes( $text ); // Lowercase. $text = strtolower( $text ); @@ -272,7 +168,7 @@ private static function strip_venue_qualifiers( string $venue ): string { $text = html_entity_decode( $venue, ENT_QUOTES | ENT_HTML5, 'UTF-8' ); // Normalize dashes so we can match consistently. - $text = self::normalize_dashes( $text ); + $text = SimilarityEngine::normalizeDashes( $text ); // Strip parenthetical suffixes: "(Indoor)", "(NÜTRL Beach Stage)" $text = preg_replace( '/\s*\(.*\)\s*$/', '', $text ); @@ -286,35 +182,4 @@ private static function strip_venue_qualifiers( string $venue ): string { return trim( $text ); } - - /** - * Compare two event titles for semantic match - * - * Returns true if core titles match after extraction and normalization. - * Used for cross-source duplicate detection where titles may vary. - * - * @param string $title1 First event title - * @param string $title2 Second event title - * @return bool True if titles represent the same event - */ - public static function titlesMatch( string $title1, string $title2 ): bool { - $core1 = self::extractCoreTitle( $title1 ); - $core2 = self::extractCoreTitle( $title2 ); - - // Exact match. - if ( $core1 === $core2 ) { - return true; - } - - // One core is a prefix of the other (covers venue name appended to title). - // "colombian jazz experience" vs "colombian jazz experience sahara" - $shorter = strlen( $core1 ) <= strlen( $core2 ) ? $core1 : $core2; - $longer = strlen( $core1 ) <= strlen( $core2 ) ? 
$core2 : $core1; - - if ( strlen( $shorter ) >= 5 && str_starts_with( $longer, $shorter ) ) { - return true; - } - - return false; - } } diff --git a/tests/Unit/EventIdentifierGeneratorTest.php b/tests/Unit/EventIdentifierGeneratorTest.php index ecad5ca..7c79563 100644 --- a/tests/Unit/EventIdentifierGeneratorTest.php +++ b/tests/Unit/EventIdentifierGeneratorTest.php @@ -3,6 +3,9 @@ * EventIdentifierGenerator Tests * * Tests for duplicate event detection via title normalization. + * EventIdentifierGenerator now delegates to the core SimilarityEngine + * for title normalization and matching. These tests verify the + * delegation works correctly. * * @package DataMachineEvents\Tests\Unit * @since 0.10.2 @@ -20,12 +23,6 @@ class EventIdentifierGeneratorTest extends WP_UnitTestCase { */ public function get_matching_title_pairs(): array { return array( - // The bug that triggered this test file - 'burgundy_soul_nite_em_dash_vs_no_dash' => array( - 'Burgundy: Soul Nite — Bill Wilson & The Ingredients', - 'Burgundy: Soul Nite Bill Wilson & The Ingredients', - ), - // Article variations 'the_blue_note_vs_blue_note' => array( 'The Blue Note Jazz Night', @@ -154,17 +151,97 @@ public function test_generate_normalizes_articles(): void { } /** - * Test rightmost delimiter extraction (the core fix) + * Test earliest delimiter extraction + * + * SimilarityEngine uses leftmost-wins: the earliest delimiter in the + * text is used to split. 
For "Burgundy: Soul Nite — Bill Wilson": + * - ": " at pos 8 wins over " - " at pos 19 + * - Core title is "burgundy" */ - public function test_rightmost_delimiter_used(): void { - // "Burgundy: Soul Nite — Bill Wilson" has colon at 8 and em dash at 19 - // Should split at em dash (position 19), keeping "Burgundy: Soul Nite" + public function test_earliest_delimiter_used(): void { $core = EventIdentifierGenerator::extractCoreTitle( 'Burgundy: Soul Nite — Bill Wilson' ); + // ": " is the earliest delimiter (pos 8), so we get "burgundy" $this->assertStringContainsString( 'burgundy', $core ); + $this->assertStringNotContainsString( 'bill', $core ); + $this->assertStringNotContainsString( 'wilson', $core ); + } + + /** + * Test that em dash delimiter properly splits titles + */ + public function test_em_dash_delimiter_splits(): void { + // When em dash is the only/earliest delimiter + $core = EventIdentifierGenerator::extractCoreTitle( 'Soul Nite — Bill Wilson & The Ingredients' ); + $this->assertStringContainsString( 'soul', $core ); $this->assertStringContainsString( 'nite', $core ); $this->assertStringNotContainsString( 'bill', $core ); $this->assertStringNotContainsString( 'wilson', $core ); } + + /** + * Test that em dash with/without surrounding content produces matching titles. + * + * The original bug: "Burgundy: Soul Nite — Bill Wilson & The Ingredients" + * vs "Burgundy: Soul Nite Bill Wilson & The Ingredients" should match + * because both normalize to the same core ("burgundy"). 
+ */ + public function test_burgundy_soul_nite_em_dash_variant_match(): void { + $this->assertTrue( + EventIdentifierGenerator::titlesMatch( + 'Burgundy: Soul Nite — Bill Wilson & The Ingredients', + 'Burgundy: Soul Nite Bill Wilson & The Ingredients' + ), + 'Em dash variant should match (both normalize to same core via colon split)' + ); + } + + /** + * Test venue matching basics + */ + public function test_venues_match_exact(): void { + $this->assertTrue( + EventIdentifierGenerator::venuesMatch( 'The Parish', 'The Parish' ) + ); + } + + public function test_venues_match_with_qualifier(): void { + $this->assertTrue( + EventIdentifierGenerator::venuesMatch( "Buck's Backyard", "Buck's Backyard (Indoor)" ) + ); + } + + public function test_venues_match_with_dash_suffix(): void { + $this->assertTrue( + EventIdentifierGenerator::venuesMatch( 'Brooklyn Bowl', 'Brooklyn Bowl - Nashville' ) + ); + } + + public function test_venues_do_not_match_different(): void { + $this->assertFalse( + EventIdentifierGenerator::venuesMatch( 'The Basement', 'The Basement East' ) + ); + } + + public function test_venues_empty_does_not_match(): void { + $this->assertFalse( + EventIdentifierGenerator::venuesMatch( '', 'Some Venue' ) + ); + } + + /** + * Test that extractCoreTitle delegates to SimilarityEngine::normalizeTitle + */ + public function test_extract_core_title_delegates_to_similarity_engine(): void { + if ( ! class_exists( 'DataMachine\Core\Similarity\SimilarityEngine' ) ) { + $this->markTestSkipped( 'SimilarityEngine not available (data-machine core not loaded).' ); + } + + $title = 'Andy Frasco & the U.N. — Growing Pains Tour'; + $core = EventIdentifierGenerator::extractCoreTitle( $title ); + $engine = \DataMachine\Core\Similarity\SimilarityEngine::normalizeTitle( $title ); + + $this->assertEquals( $engine, $core, 'extractCoreTitle should delegate to SimilarityEngine::normalizeTitle' ); + } }