Skip to content

Commit ea7f58c

Browse files
authored
refactor: deduplicate shared utilities across web scraper extractors (#100)
1 parent cddd648 commit ea7f58c

9 files changed

Lines changed: 191 additions & 171 deletions

File tree

inc/Steps/EventImport/Handlers/WebScraper/Extractors/BaseExtractor.php

Lines changed: 149 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -238,4 +238,153 @@ protected function formatPriceRange( ?float $min, ?float $max = null ): string {
238238
protected function formatStructuredPrice( ?float $min = null, ?float $max = null, string $currency = 'USD', ?bool $is_free = null ): string {
239239
return PriceFormatter::formatStructured( $min, $max, $currency, $is_free );
240240
}
241+
242+
/**
 * Infer a full Y-m-d date from a month name and a day number.
 *
 * Venue calendars often print "January 15" without a year. The current
 * year is tried first; when that date is already in the past, the
 * following year is used instead.
 *
 * The month and day are validated with checkdate() rather than handed to
 * the free-form date parser, so impossible dates ("February 30") and
 * empty input yield an empty string instead of silently rolling over
 * into a different day. "February 29" resolves only to a year (this one
 * or the next) in which that day actually exists.
 *
 * @since 0.15.1
 * @param string $month Month name (e.g., "January", "Jan", "Sept.").
 * @param string $day   Day number (e.g., "15", "15th").
 * @return string Date in Y-m-d format, or empty string on failure.
 */
protected function inferDateFromMonthDay( string $month, string $day ): string {
	// Tolerate surrounding whitespace and trailing punctuation ("Jan.", "15,").
	$month_token = trim( $month, " \t\n\r\0\x0B.," );
	$day_token   = trim( $day, " \t\n\r\0\x0B.," );

	if ( '' === $month_token || ! preg_match( '/^(\d{1,2})(?:st|nd|rd|th)?$/i', $day_token, $matches ) ) {
		return '';
	}

	// "!" resets every field first, so only the month name is interpreted
	// and the placeholder day (1) can never overflow.
	$month_probe = \DateTimeImmutable::createFromFormat( '!F', $month_token );
	if ( false === $month_probe ) {
		return '';
	}

	$month_num = (int) $month_probe->format( 'n' );
	$day_num   = (int) $matches[1];

	// Take the year from the same clock used for the comparison so both
	// always agree, regardless of the configured default timezone.
	$today     = new \DateTimeImmutable( 'today' );
	$this_year = (int) $today->format( 'Y' );

	foreach ( array( $this_year, $this_year + 1 ) as $candidate_year ) {
		if ( ! checkdate( $month_num, $day_num, $candidate_year ) ) {
			continue;
		}

		$candidate = $today->setDate( $candidate_year, $month_num, $day_num );
		if ( $candidate >= $today ) {
			return $candidate->format( 'Y-m-d' );
		}
	}

	return '';
}
271+
272+
/**
 * Load an HTML string into a DOMDocument + DOMXPath pair.
 *
 * Replaces the DOM bootstrap boilerplate that was repeated across
 * extractors. The markup is prefixed with an XML encoding declaration so
 * libxml reads it as UTF-8.
 *
 * Parser warnings are collected internally and discarded, because scraped
 * markup is routinely malformed. The libxml error-handling mode that was
 * active before the call is restored afterwards, so this helper does not
 * change process-wide state for other code that parses XML or HTML.
 *
 * @since 0.15.1
 * @param string $html Raw HTML content.
 * @return array{dom: \DOMDocument, xpath: \DOMXPath}
 */
protected function loadDom( string $html ): array {
	$dom = new \DOMDocument();

	// libxml_use_internal_errors() returns the previous setting; keep it
	// so it can be put back once parsing is done.
	$previous_mode = libxml_use_internal_errors( true );

	try {
		$dom->loadHTML( '<?xml encoding="UTF-8">' . $html, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD );
	} finally {
		// Drop whatever the parser buffered, then restore the caller's mode.
		libxml_clear_errors();
		libxml_use_internal_errors( $previous_mode );
	}

	return array(
		'dom'   => $dom,
		'xpath' => new \DOMXPath( $dom ),
	);
}
293+
294+
/**
 * Backfill missing venue fields on an event from page-level venue data.
 *
 * For each known venue key, the page-level value is copied only when the
 * event's own value is empty and the page-level value is not. Values the
 * event already carries are never overwritten.
 *
 * @since 0.15.1
 * @param array $event      Event data array.
 * @param array $page_venue Page-level venue data from PageVenueExtractor.
 * @return array Event with merged venue data.
 */
protected function mergePageVenueData( array $event, array $page_venue ): array {
	$venue_keys = array(
		'venue',
		'venueAddress',
		'venueCity',
		'venueState',
		'venueZip',
		'venueCountry',
		'venueTimezone',
	);

	foreach ( $venue_keys as $key ) {
		// The event already has a usable value: leave it alone.
		if ( ! empty( $event[ $key ] ) ) {
			continue;
		}

		// Nothing useful on the page either.
		if ( empty( $page_venue[ $key ] ) ) {
			continue;
		}

		$event[ $key ] = $page_venue[ $key ];
	}

	return $event;
}
316+
317+
/**
 * Fetch a URL through HttpClient and return the response body.
 *
 * Wraps the HttpClient::get() call and success check that many
 * extractors repeat. A 15 second timeout is applied unless the caller
 * supplies its own 'timeout' in $args.
 *
 * A failed request is reported through the 'datamachine_log' action at
 * debug level, but only when a non-empty $context is given. Without a
 * context the failure is silent.
 *
 * @since 0.15.1
 * @param string $url     URL to fetch.
 * @param array  $args    Optional wp_remote_get args.
 * @param string $context Short description for logging (e.g., "Firebase events").
 * @return string|null Response body, or null on failure.
 */
protected function fetchUrl( string $url, array $args = array(), string $context = '' ): ?string {
	// Without the HTTP client there is nothing to do.
	if ( ! class_exists( '\\DataMachine\\Core\\HttpClient' ) ) {
		return null;
	}

	$request_args = array_merge( array( 'timeout' => 15 ), $args );
	$response     = \DataMachine\Core\HttpClient::get( $url, $request_args );

	// Success path first.
	if ( ! empty( $response['success'] ) ) {
		return $response['body'] ?? null;
	}

	// Failure: log only when the caller identified the request.
	if ( '' === $context ) {
		return null;
	}

	$log_details = array(
		'url'         => $url,
		'status_code' => $response['status_code'] ?? 0,
	);

	do_action(
		'datamachine_log',
		'debug',
		"BaseExtractor::fetchUrl failed for {$context}",
		$log_details
	);

	return null;
}
355+
356+
/**
 * Resolve a possibly relative URL against a base URL.
 *
 * Handles the reference forms that show up in scraped markup:
 *
 * - Absolute URLs of any scheme (http, https, mailto, tel, data, ...)
 *   are returned unchanged.
 * - Protocol-relative URLs ("//cdn.example.com/x") take the base scheme.
 * - Root-relative URLs ("/x") take the base scheme, host and port.
 * - Query-only ("?page=2") and fragment-only ("#top") references keep
 *   the base document's path instead of losing its last segment.
 * - Anything else is resolved against the base path's directory.
 *
 * Dot segments ("./", "../") are left in place, not collapsed.
 * When the base URL has no scheme, "https" is assumed.
 *
 * @since 0.15.1
 * @param string $url      Possibly relative URL.
 * @param string $base_url Base URL for resolution.
 * @return string Absolute URL, or empty string when $url is empty.
 */
protected function resolveUrl( string $url, string $base_url ): string {
	// href/src attributes frequently carry stray whitespace.
	$url = trim( $url );

	if ( '' === $url ) {
		return '';
	}

	// Anything that starts with "scheme:" is already absolute. This also
	// keeps mailto:/tel:/data: links from being glued onto the base path.
	if ( preg_match( '#^[a-z][a-z0-9+.\-]*:#i', $url ) ) {
		return $url;
	}

	// wp_parse_url() returns false for seriously malformed input.
	$parts = wp_parse_url( $base_url );
	if ( ! is_array( $parts ) ) {
		$parts = array();
	}

	$scheme = ! empty( $parts['scheme'] ) ? $parts['scheme'] : 'https';

	if ( str_starts_with( $url, '//' ) ) {
		return $scheme . ':' . $url;
	}

	// Keep a non-default port; dropping it would point at a different server.
	$authority = $parts['host'] ?? '';
	if ( isset( $parts['port'] ) ) {
		$authority .= ':' . $parts['port'];
	}
	$origin = $scheme . '://' . $authority;

	if ( str_starts_with( $url, '/' ) ) {
		return $origin . $url;
	}

	$path = $parts['path'] ?? '/';
	if ( '' === $path ) {
		$path = '/';
	}

	// A query-only reference replaces the base query but keeps its path.
	if ( '?' === $url[0] ) {
		return $origin . $path . $url;
	}

	// A fragment-only reference keeps both the base path and query.
	if ( '#' === $url[0] ) {
		$query = isset( $parts['query'] ) ? '?' . $parts['query'] : '';
		return $origin . $path . $query . $url;
	}

	// Plain relative path: resolve against the directory of the base path.
	$last_slash = strrpos( $path, '/' );
	$dir        = false === $last_slash ? '/' : substr( $path, 0, $last_slash + 1 );

	return $origin . $dir . $url;
}
241390
}

inc/Steps/EventImport/Handlers/WebScraper/Extractors/CraftpeakExtractor.php

Lines changed: 2 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -238,30 +238,9 @@ private function normalizeEvent( array $raw, array $page_venue ): array {
238238
}
239239

240240
/**
241-
* Infer full date from month and day, adding year.
242-
*
243-
* If the date has already passed this year, assumes next year.
244-
*
245-
* @param string $month Month name (e.g., "January")
246-
* @param string $day Day number
247-
* @return string Date in Y-m-d format
241+
* @deprecated Use BaseExtractor::inferDateFromMonthDay() instead.
248242
*/
249243
private function inferDate( string $month, string $day ): string {
250-
$year = (int) date( 'Y' );
251-
$date_str = "$month $day $year";
252-
253-
try {
254-
$dt = new \DateTime( $date_str );
255-
$today = new \DateTime( 'today' );
256-
257-
// If date is in the past, assume next year
258-
if ( $dt < $today ) {
259-
$dt->modify( '+1 year' );
260-
}
261-
262-
return $dt->format( 'Y-m-d' );
263-
} catch ( \Exception $e ) {
264-
return '';
265-
}
244+
return $this->inferDateFromMonthDay( $month, $day );
266245
}
267246
}

inc/Steps/EventImport/Handlers/WebScraper/Extractors/FreshtixExtractor.php

Lines changed: 4 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -148,19 +148,11 @@ private function parseEventDateTime( array &$event, array $raw ): void {
148148
}
149149
}
150150

151+
/**
152+
* @deprecated Use BaseExtractor::parseTimeString() instead.
153+
*/
151154
private function normalizeTime( string $time ): string {
152-
$time = strtolower( trim( $time ) );
153-
154-
if ( strpos( $time, ':' ) === false ) {
155-
$time = preg_replace( '/(\d+)\s*(am|pm)/i', '$1:00 $2', $time );
156-
}
157-
158-
$timestamp = strtotime( $time );
159-
if ( false !== $timestamp ) {
160-
return date( 'H:i', $timestamp );
161-
}
162-
163-
return '';
155+
return $this->parseTimeString( $time );
164156
}
165157

166158
private function parseVenue( array &$event, array $raw, array $venue_data ): void {

inc/Steps/EventImport/Handlers/WebScraper/Extractors/MicrodataExtractor.php

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,8 @@ public function canExtract( string $html ): bool {
2424
}
2525

2626
public function extract( string $html, string $source_url ): array {
27-
$dom = new \DOMDocument();
28-
libxml_use_internal_errors( true );
29-
$dom->loadHTML( $html, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD );
30-
libxml_clear_errors();
31-
32-
$xpath = new \DOMXPath( $dom );
27+
$loaded = $this->loadDom( $html );
28+
$xpath = $loaded['xpath'];
3329

3430
$event_elements = $xpath->query( "//*[@itemtype='https://schema.org/Event' or @itemtype='http://schema.org/Event']" );
3531

inc/Steps/EventImport/Handlers/WebScraper/Extractors/MusicItemExtractor.php

Lines changed: 3 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,8 @@ public function canExtract( string $html ): bool {
2424
}
2525

2626
public function extract( string $html, string $source_url ): array {
27-
$dom = new \DOMDocument();
28-
libxml_use_internal_errors( true );
29-
$dom->loadHTML( '<?xml encoding="UTF-8">' . $html, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD );
30-
libxml_clear_errors();
31-
32-
$xpath = new \DOMXPath( $dom );
27+
$loaded = $this->loadDom( $html );
28+
$xpath = $loaded['xpath'];
3329
$event_nodes = $xpath->query( "//*[contains(concat(' ', normalize-space(@class), ' '), ' music__item ')]" );
3430

3531
if ( 0 === $event_nodes->length ) {
@@ -55,24 +51,7 @@ public function getMethod(): string {
5551
return 'music_item';
5652
}
5753

58-
/**
59-
* Merge page-level venue data into event for missing fields.
60-
*/
61-
private function mergePageVenueData( array $event, array $page_venue ): array {
62-
$address_fields = array( 'venueAddress', 'venueCity', 'venueState', 'venueZip', 'venueCountry' );
63-
64-
foreach ( $address_fields as $field ) {
65-
if ( empty( $event[ $field ] ) && ! empty( $page_venue[ $field ] ) ) {
66-
$event[ $field ] = $page_venue[ $field ];
67-
}
68-
}
69-
70-
if ( empty( $event['venue'] ) && ! empty( $page_venue['venue'] ) ) {
71-
$event['venue'] = $page_venue['venue'];
72-
}
73-
74-
return $event;
75-
}
54+
// mergePageVenueData() is inherited from BaseExtractor.
7655

7756
/**
7857
* Normalize music item event node to standardized format.

inc/Steps/EventImport/Handlers/WebScraper/Extractors/RedRocksExtractor.php

Lines changed: 6 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -32,12 +32,8 @@ public function canExtract( string $html ): bool {
3232
}
3333

3434
public function extract( string $html, string $source_url ): array {
35-
$dom = new \DOMDocument();
36-
libxml_use_internal_errors( true );
37-
$dom->loadHTML( '<?xml encoding="UTF-8">' . $html, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD );
38-
libxml_clear_errors();
39-
40-
$xpath = new \DOMXPath( $dom );
35+
$loaded = $this->loadDom( $html );
36+
$xpath = $loaded['xpath'];
4137
$event_nodes = $xpath->query( "//*[contains(@class, 'card-event')]" );
4238

4339
if ( 0 === $event_nodes->length ) {
@@ -163,19 +159,11 @@ private function parseEventDateTime( array &$event, \DOMXPath $xpath, \DOMElemen
163159
}
164160
}
165161

162+
/**
163+
* @deprecated Use BaseExtractor::parseTimeString() instead.
164+
*/
166165
private function normalizeTime( string $time ): string {
167-
$time = strtolower( trim( $time ) );
168-
169-
if ( strpos( $time, ':' ) === false ) {
170-
$time = preg_replace( '/(\d+)\s*(am|pm)/i', '$1:00 $2', $time );
171-
}
172-
173-
$timestamp = strtotime( $time );
174-
if ( false !== $timestamp ) {
175-
return date( 'H:i', $timestamp );
176-
}
177-
178-
return '';
166+
return $this->parseTimeString( $time );
179167
}
180168

181169
private function parseImage( array &$event, \DOMXPath $xpath, \DOMElement $node ): void {

inc/Steps/EventImport/Handlers/WebScraper/Extractors/RhpEventsExtractor.php

Lines changed: 5 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -24,12 +24,8 @@ public function canExtract( string $html ): bool {
2424
}
2525

2626
public function extract( string $html, string $source_url ): array {
27-
$dom = new \DOMDocument();
28-
libxml_use_internal_errors( true );
29-
$dom->loadHTML( '<?xml encoding="UTF-8">' . $html, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD );
30-
libxml_clear_errors();
31-
32-
$xpath = new \DOMXPath( $dom );
27+
$loaded = $this->loadDom( $html );
28+
$xpath = $loaded['xpath'];
3329
$event_nodes = $xpath->query( "//*[contains(@class, 'rhpSingleEvent')]" );
3430

3531
if ( 0 === $event_nodes->length ) {
@@ -52,28 +48,7 @@ public function extract( string $html, string $source_url ): array {
5248
return $events;
5349
}
5450

55-
/**
56-
* Merge page-level venue data into event for missing fields.
57-
*
58-
* @param array $event Event data
59-
* @param array $page_venue Page-level venue data from PageVenueExtractor
60-
* @return array Event with merged venue data
61-
*/
62-
private function mergePageVenueData( array $event, array $page_venue ): array {
63-
$address_fields = array( 'venueAddress', 'venueCity', 'venueState', 'venueZip', 'venueCountry' );
64-
65-
foreach ( $address_fields as $field ) {
66-
if ( empty( $event[ $field ] ) && ! empty( $page_venue[ $field ] ) ) {
67-
$event[ $field ] = $page_venue[ $field ];
68-
}
69-
}
70-
71-
if ( empty( $event['venue'] ) && ! empty( $page_venue['venue'] ) ) {
72-
$event['venue'] = $page_venue['venue'];
73-
}
74-
75-
return $event;
76-
}
51+
// mergePageVenueData() is inherited from BaseExtractor.
7752

7853
public function getMethod(): string {
7954
return 'rhp_events';
@@ -208,22 +183,10 @@ private function parseTime( array &$event, \DOMXPath $xpath, \DOMElement $node )
208183
}
209184

210185
/**
211-
* Normalize time string to H:i format.
186+
* @deprecated Use BaseExtractor::parseTimeString() instead.
212187
*/
213188
private function normalizeTime( string $time ): string {
214-
$time = strtolower( trim( $time ) );
215-
216-
// Add :00 if no minutes
217-
if ( ! strpos( $time, ':' ) ) {
218-
$time = preg_replace( '/(\d+)\s*(am|pm)?/i', '$1:00 $2', $time );
219-
}
220-
221-
$timestamp = strtotime( $time );
222-
if ( false !== $timestamp ) {
223-
return date( 'H:i', $timestamp );
224-
}
225-
226-
return '';
189+
return $this->parseTimeString( $time );
227190
}
228191

229192
/**

0 commit comments

Comments
 (0)