Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
122 commits
Select commit Hold shift + click to select a range
6ef878b
Add handler for broken content paths
holtvogt Aug 26, 2025
9347158
Add deterministic suggestion rules
holtvogt Sep 1, 2025
eb1d1bb
Implement Suggestion class for handling content path recommendations
holtvogt Sep 1, 2025
d0053c3
Implement LanguageTree and Locale classes for language and locale man…
holtvogt Sep 1, 2025
6db8ef8
Implement ContentPath class that encapsulates content path management
holtvogt Sep 1, 2025
e3e330b
Add AemAuthorClient class for managing AEM Author interactions
holtvogt Sep 1, 2025
20fc82e
Add LevenshteinDistance class for calculating string edit distance
holtvogt Sep 1, 2025
4cce0bc
Add PathUtils class for content path manipulation
holtvogt Sep 1, 2025
6f5a788
Add PathIndex and PathNode classes for content path indexing
holtvogt Sep 1, 2025
321bda9
Add AnalysisStrategy class for analyzing broken content paths
holtvogt Sep 1, 2025
52fd82a
Add collector classes for managing broken content paths
holtvogt Sep 1, 2025
7569f35
Add unit tests for LanguageTree class
holtvogt Sep 1, 2025
9f19c8f
Add unit tests for LevenshteinDistance class
holtvogt Sep 1, 2025
b3169c6
Add unit tests for Locale class
holtvogt Sep 1, 2025
ed67287
Add unit tests for PathIndex class
holtvogt Sep 1, 2025
e30ad1f
Add unit tests for PathUtils class
holtvogt Sep 1, 2025
84e11ed
Add SQL scripts for database and table creation, and daily query
holtvogt Sep 1, 2025
94e9298
Refactor to step-based audit
holtvogt Sep 1, 2025
3077204
Remove limit in daily query
holtvogt Sep 1, 2025
480fa25
Add empty line between license header
holtvogt Sep 1, 2025
8af98a1
Update default reason in Suggestion.publish
holtvogt Sep 1, 2025
f3de2d3
Include suggestedPath parameter in publish
holtvogt Sep 1, 2025
38f8fa2
Add double slash handling in LocaleFallbackRule and PathUtils
holtvogt Sep 1, 2025
4a2f3ee
Handle double slashes in similar path rule
holtvogt Sep 1, 2025
34abcd4
Add pagination support in AemAuthorClient
holtvogt Sep 1, 2025
34d3a53
Implement error handling in handler
holtvogt Sep 1, 2025
099b19d
Only change reason in post processing of suggestions
holtvogt Sep 3, 2025
87afbff
Refine content availability check in AemAuthorClient
holtvogt Sep 3, 2025
793ebea
Remove unnecessary debug log in SimilarPathRule class
holtvogt Sep 3, 2025
a9d7867
Refactor SQL table creation by removing hour partitioning
holtvogt Sep 15, 2025
95d3c21
Add a newline in handler.js
holtvogt Sep 15, 2025
e2fabd2
Add SQL scripts for 404 cdn analysis
holtvogt Sep 15, 2025
6439368
Use rawTable as dynamic table name
holtvogt Sep 15, 2025
88d7423
Update SQL unload path to use dynamic output variable
holtvogt Sep 15, 2025
56b5376
Update SQL script to use dynamic raw table and location variables
holtvogt Sep 15, 2025
4580172
Refactor AthenaCollector to use dynamic bucket and organization varia…
holtvogt Sep 15, 2025
2f318a9
Add CDN 404 analysis handler with Athena integration
holtvogt Sep 15, 2025
9205340
Add to-do note for raw table update
holtvogt Sep 18, 2025
72f8f40
Add unit tests for CDN 404 analysis handler
holtvogt Sep 18, 2025
bd1920b
Add IMS organization validation
holtvogt Sep 18, 2025
c9e5d72
Use URL constructor for host extraction
holtvogt Sep 18, 2025
4b487dc
Refactor broken content path collectors to remove BaseCollector and C…
holtvogt Sep 18, 2025
98671d5
Merge branch 'main' into feature/broken-content-path-audit
holtvogt Sep 18, 2025
8b7355f
Simplify URL selection syntax
holtvogt Sep 18, 2025
dc01079
Add URL resolver to audit
holtvogt Sep 18, 2025
c3626d2
Use sanitized hostname from domain extraction
holtvogt Sep 18, 2025
9a87a43
Get bucket name and IMS org from env context
holtvogt Sep 18, 2025
5889f34
Enhance error handling for missing rawBucket and imsOrg in context
holtvogt Sep 18, 2025
400ed15
Add CDN 404 analysis handler
holtvogt Sep 18, 2025
5ee3660
Refactor ESLint directive
holtvogt Sep 18, 2025
1ef3fe1
Remove debug logging from static context
holtvogt Sep 18, 2025
dde9b91
Add rule tests
holtvogt Sep 18, 2025
8f1952f
Add Athena collector test
holtvogt Sep 18, 2025
78e94fc
Add tests for double slashes and locale removal
holtvogt Sep 18, 2025
0f2beb3
Add tests for Broken Content Path Handler
holtvogt Sep 18, 2025
4bb21d2
Remove part that cannot be reached
holtvogt Sep 19, 2025
0ad0539
Ensure path is a string before validation
holtvogt Sep 19, 2025
5f00cef
Add unit tests for AemAuthorClient functionality
holtvogt Sep 19, 2025
181a1fd
Add unit tests for AnalysisStrategy in broken content path analysis
holtvogt Sep 19, 2025
2ec787d
Add unit tests for ContentPath and ContentStatus in broken content pa…
holtvogt Sep 19, 2025
1eb7495
Include scenario with missing siblings and root locales
holtvogt Sep 19, 2025
4216698
Refactor LocaleFallbackRule test to use mocked dependencies for Local…
holtvogt Sep 19, 2025
4322fca
Add tests for handling prefixes in PathIndex that are not end nodes
holtvogt Sep 19, 2025
d7d77bc
Add unit tests for Suggestion and SuggestionType
holtvogt Sep 19, 2025
ffb349d
Handle empty content return
holtvogt Sep 29, 2025
e3dcfbf
Change parseContentStatus caller
holtvogt Sep 29, 2025
0de364e
Fix wrong availability condition
holtvogt Sep 23, 2025
d2b0d71
Fix missing parent path querying
holtvogt Sep 23, 2025
deba030
Remove unused PathIndex import
holtvogt Sep 29, 2025
1c318e4
Update CDN audit name for content fragments
holtvogt Sep 29, 2025
0d61f43
Include context in constructor
holtvogt Sep 29, 2025
d9d46e8
Update audit name for content fragment 404s
holtvogt Sep 29, 2025
ef22601
Remove content fetching logic
holtvogt Sep 29, 2025
1eb86c7
Remove context parameter from getChildrenFromPath call
holtvogt Sep 29, 2025
ac27109
Add to-do reflection
holtvogt Sep 29, 2025
63e8c79
Use fixed date and correct output references
holtvogt Sep 29, 2025
4992a88
Merge branch 'main' into feature/broken-content-path-audit
holtvogt Sep 29, 2025
8ca0daf
Rename broken content fragment link audit
holtvogt Sep 29, 2025
b91305b
Merge branch 'main' into feature/broken-content-path-audit
holtvogt Sep 29, 2025
0c8e7e4
Fix double slash handling in paths with protocols
holtvogt Sep 29, 2025
bcc0bd1
Fix step-based audit result persisting
holtvogt Oct 13, 2025
9e55270
Merge branch 'main' into feature/broken-content-path-audit
holtvogt Oct 13, 2025
dd42a5e
Refactor broken content fragment link analysis and update test cases
holtvogt Oct 13, 2025
b9f8b35
Update package-lock.json
holtvogt Oct 13, 2025
15373e9
Add retrieval of IMS org from method instead of env
holtvogt Oct 13, 2025
69fa7e2
Remove tenant specifier
holtvogt Oct 13, 2025
37e112e
Update Sites API paths
holtvogt Oct 21, 2025
b6c1297
Refactor AEM client name
holtvogt Oct 21, 2025
bfd539b
Merge branch 'main' into feature/broken-content-path-audit
holtvogt Oct 21, 2025
7f68b2a
Reorder methods in AthenaCollector
holtvogt Oct 22, 2025
e81b8ba
Add site ID to audit results
holtvogt Oct 22, 2025
81dca02
Add mock site ID to tests
holtvogt Oct 22, 2025
b57a696
Change IMS org retrieval
holtvogt Oct 22, 2025
5988340
Introduce asset utilities to identify assets in path
holtvogt Oct 24, 2025
434565d
Add user agent information
holtvogt Oct 24, 2025
b42f21a
Add test for handling mixed formats in broken paths
holtvogt Oct 24, 2025
1a29d25
Rename count to request_count in SQL unload query
holtvogt Oct 24, 2025
0c040df
Refactor customer domain extraction by importing utility function
holtvogt Oct 24, 2025
2632d20
Add request_count column to external table and update daily query to …
holtvogt Oct 24, 2025
844d933
Use customer domain utility function
holtvogt Oct 24, 2025
1966871
Align on database and table naming
holtvogt Oct 24, 2025
e861dce
Include request count in return
holtvogt Oct 24, 2025
e27fe29
Start with URL resolver
holtvogt Oct 24, 2025
b226c7f
Refactor step-based audit to traditional audit
holtvogt Oct 24, 2025
2a300a1
Add peer back to package-lock
holtvogt Oct 27, 2025
ef1ecb3
Merge branch 'main' into feature/broken-content-path-audit
holtvogt Oct 27, 2025
358f9d5
Add suggestion enrichment using Mystique
holtvogt Oct 27, 2025
66852f3
Decouple cache from AEMClient using Strategy pattern
holtvogt Oct 27, 2025
1656e1d
Move GraphQL suffix handling to AthenaCollector
holtvogt Oct 28, 2025
5440424
Enhance syncSuggestions to update rank using getRank function
holtvogt Oct 28, 2025
c6c5a5f
Divide post processing into two steps
holtvogt Oct 28, 2025
52ac94b
Merge branch 'main' into feature/broken-content-path-audit
holtvogt Oct 28, 2025
2a20152
Rename broken content fragment links audit
holtvogt Oct 28, 2025
96f3192
Rename audit to 'content-fragment-404'
holtvogt Oct 30, 2025
971f934
Update Athena methods for clarity
holtvogt Oct 30, 2025
792db36
Clean up tests for log messages
holtvogt Oct 30, 2025
00eecb3
Clean up magic numbers
holtvogt Oct 30, 2025
08857d3
Refactor test constants for CDN 404 analysis
holtvogt Oct 30, 2025
fa34b93
Refactor rule priority constants
holtvogt Oct 30, 2025
401c8ed
Refactor name for content fragment 404 data
holtvogt Oct 30, 2025
7b8b4c5
Update AemClient to retrieve author URL from delivery configuration
holtvogt Nov 3, 2025
c281a4e
Add tests for AemClient error handling with missing or invalid author…
holtvogt Nov 3, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 107 additions & 0 deletions src/cdn-content-fragment-404/handler.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
/*
* Copyright 2025 Adobe. All rights reserved.
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. You may obtain a copy
* of the License at http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
* OF ANY KIND, either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/

import { getStaticContent } from '@adobe/spacecat-shared-utils';
import { AWSAthenaClient } from '@adobe/spacecat-shared-athena-client';
import { AuditBuilder } from '../common/audit-builder.js';
import { wwwUrlResolver } from '../common/base-audit.js';
import { getImsOrgId } from '../utils/data-access.js';
import { extractCustomerDomain } from '../utils/cdn-utils.js';

const ONE_HOUR_MS = 60 * 60 * 1000;

/**
 * Computes the UTC date parts of the hour that precedes the current time.
 *
 * @returns {{year: string, month: string, day: string, hour: string}}
 *   Four-digit year plus zero-padded, two-digit month, day and hour (all UTC).
 */
function getHourParts() {
  const twoDigits = (value) => String(value).padStart(2, '0');
  const target = new Date(Date.now() - ONE_HOUR_MS);

  return {
    year: String(target.getUTCFullYear()),
    // getUTCMonth() is zero-based
    month: twoDigits(target.getUTCMonth() + 1),
    day: twoDigits(target.getUTCDate()),
    hour: twoDigits(target.getUTCHours()),
  };
}

/**
 * Loads a SQL template from this audit's `sql` folder via `getStaticContent`.
 *
 * @param {string} filename - Template name without the `.sql` extension.
 * @param {object} variables - Values handed to `getStaticContent` together with the
 *   template path (presumably substituted into the template's placeholders).
 * @returns {Promise<string>} Whatever `getStaticContent` resolves to for the template.
 */
async function loadSql(filename, variables) {
  const templatePath = `./src/cdn-content-fragment-404/sql/${filename}.sql`;
  return getStaticContent(variables, templatePath);
}

/**
 * Runs the three Athena statements of the CDN content fragment 404 audit for the
 * previous UTC hour: create the database, create the raw logs table, and unload the
 * aggregated 404 data.
 *
 * @param {object} context - Audit context. Reads `site`, `rawBucket`, `dataAccess` and
 *   `log`; the whole context is also passed to `AWSAthenaClient.fromContext`.
 * @returns {Promise<{auditResult: {database: string, rawTable: string, completedAt: string},
 *   fullAuditRef: string}>} Names of the Athena objects used, the completion timestamp
 *   (ISO 8601) and the S3 location the aggregated data was unloaded to.
 * @throws {Error} If `rawBucket` is missing or no IMS organization ID can be retrieved.
 */
export async function cdnContentFragment404Runner(context) {
  const {
    site, rawBucket, dataAccess, log,
  } = context;
  const sanitizedHostname = extractCustomerDomain(site);
  const {
    year, month, day, hour,
  } = getHourParts();

  if (!rawBucket) {
    throw new Error('Raw bucket is required');
  }

  const imsOrg = await getImsOrgId(site, dataAccess, log);
  if (!imsOrg) {
    throw new Error('Unable to retrieve IMS organization ID');
  }

  const database = `cdn_logs_${sanitizedHostname}`;
  const rawTable = `raw_logs_status_${sanitizedHostname}`;
  const athenaClient = AWSAthenaClient.fromContext(
    context,
    `s3://${rawBucket}/temp/athena-results/`,
  );

  // Renders one SQL template and executes it against the audit database
  const runTemplate = async (template, variables, description) => {
    const sql = await loadSql(template, variables);
    await athenaClient.execute(sql, database, `[Athena Query] ${description}`);
  };

  // Create database
  await runTemplate('create-database', { database }, `Create database ${database}`);

  // Each tenant has its own folder mapped via IMS org within the raw bucket
  const tenantRoot = `${rawBucket}/${imsOrg}`;
  // Subfolder aem-cs-fastly is used for raw logs currently
  const rawLocation = `s3://${tenantRoot}/raw/aem-cs-fastly`;

  // Create table
  await runTemplate(
    'create-raw-table',
    { database, rawTable, rawLocation },
    `Create raw logs table ${database}.${rawTable} from ${rawLocation}`,
  );

  // Unload the aggregated 404 data into an hour-specific folder
  const output = `s3://${tenantRoot}/aggregated-404/${year}/${month}/${day}/${hour}/`;
  await runTemplate(
    'unload-404-content',
    {
      database,
      rawTable,
      year,
      month,
      day,
      hour,
      output,
    },
    `Unload 404 content data to ${output}`,
  );

  const completedAt = new Date().toISOString();
  return {
    auditResult: { database, rawTable, completedAt },
    fullAuditRef: output,
  };
}

// Audit definition: resolves the site URL with wwwUrlResolver and runs
// cdnContentFragment404Runner.
const cdnContentFragment404Audit = new AuditBuilder()
  .withUrlResolver(wwwUrlResolver)
  .withRunner(cdnContentFragment404Runner)
  .build();

export default cdnContentFragment404Audit;
1 change: 1 addition & 0 deletions src/cdn-content-fragment-404/sql/create-database.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
-- Creates the Athena database for the audit when it does not exist yet.
-- The database name is a template placeholder filled in before execution.
CREATE DATABASE IF NOT EXISTS {{database}};
29 changes: 29 additions & 0 deletions src/cdn-content-fragment-404/sql/create-raw-table.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
-- External table over the raw JSON CDN logs, partitioned by year/month/day/hour.
-- Partition projection is enabled, with partitions laid out as
-- <raw location>/<year>/<month>/<day>/<hour>/ (month, day and hour use two digits).
-- NOTE(review): the projected year range ends at 2030 and must be extended before then.
CREATE EXTERNAL TABLE IF NOT EXISTS {{database}}.{{rawTable}} (
  url string,
  request_user_agent string,
  response_status int
)
PARTITIONED BY (
  year string,
  month string,
  day string,
  hour string
)
ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'
LOCATION '{{rawLocation}}'
TBLPROPERTIES (
  'projection.enabled' = 'true',
  'storage.location.template' = '{{rawLocation}}/${year}/${month}/${day}/${hour}/',
  'projection.year.type' = 'integer',
  'projection.year.range' = '2024,2030',
  'projection.month.type' = 'integer',
  'projection.month.range' = '1,12',
  'projection.month.digits' = '2',
  'projection.day.type' = 'integer',
  'projection.day.range' = '1,31',
  'projection.day.digits' = '2',
  'projection.hour.type' = 'integer',
  'projection.hour.range' = '0,23',
  'projection.hour.digits' = '2',
  'has_encrypted_data' = 'false'
);
19 changes: 19 additions & 0 deletions src/cdn-content-fragment-404/sql/unload-404-content.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
-- Aggregates 404 responses for one hour partition and writes them as Parquet.
-- One row per (url, user agent, tenant) with the number of matching requests.
UNLOAD (
  SELECT
    url,
    request_user_agent,
    -- Tenant is the first path segment below /content/dam/, or 'unknown' if absent
    COALESCE(REGEXP_EXTRACT(url, '/content/dam/([^/]+)', 1), 'unknown') AS tenant,
    count(*) AS request_count
  FROM {{database}}.{{rawTable}}
  WHERE year = '{{year}}'
    AND month = '{{month}}'
    AND day = '{{day}}'
    AND hour = '{{hour}}'

    AND response_status = 404
    -- Only include content fragment requests
    AND url LIKE '/content/dam/%'

  GROUP BY url, request_user_agent, COALESCE(REGEXP_EXTRACT(url, '/content/dam/([^/]+)', 1), 'unknown')
) TO '{{output}}'
WITH (format = 'PARQUET');
101 changes: 101 additions & 0 deletions src/content-fragment-404/analysis/analysis-strategy.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
/*
* Copyright 2025 Adobe. All rights reserved.
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. You may obtain a copy
* of the License at http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
* OF ANY KIND, either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/

import { PublishRule } from '../rules/publish-rule.js';
import { LocaleFallbackRule } from '../rules/locale-fallback-rule.js';
import { SimilarPathRule } from '../rules/similar-path-rule.js';
import { Suggestion, SuggestionType } from '../domain/suggestion/suggestion.js';

/**
 * Applies the suggestion rules to broken content fragment paths and post-processes
 * the resulting suggestions.
 */
export class AnalysisStrategy {
  /**
   * @param {object} context - Audit context; `context.log` is used for logging.
   * @param {object} aemClient - Client passed to every rule.
   * @param {object} pathIndex - Index passed to SimilarPathRule and queried through
   *   `find()` while post-processing suggestions.
   */
  constructor(context, aemClient, pathIndex) {
    this.context = context;
    this.aemClient = aemClient;
    this.pathIndex = pathIndex;
    // Rules are tried in ascending getPriority() order
    this.rules = [
      new PublishRule(context, this.aemClient),
      new LocaleFallbackRule(context, this.aemClient),
      new SimilarPathRule(context, this.aemClient, pathIndex),
    ].sort((a, b) => a.getPriority() - b.getPriority());
  }

  /**
   * Analyzes every broken path one after another and post-processes the results.
   *
   * @param {Iterable<string>} contentFragment404s - Broken content fragment paths.
   * @returns {Promise<Array>} Post-processed suggestions.
   */
  async analyze(contentFragment404s) {
    const suggestions = [];

    for (const path of contentFragment404s) {
      // eslint-disable-next-line no-await-in-loop
      const suggestion = await this.analyzePath(path);
      if (suggestion) {
        suggestions.push(suggestion);
      }
    }

    // Post-process suggestions to check content status
    return this.processSuggestions(suggestions);
  }

  /**
   * Returns the suggestion of the first rule that produces one. A rule that throws is
   * logged and skipped so the remaining rules still get a chance.
   *
   * @param {string} brokenPath - Broken content fragment path.
   * @returns {Promise<object>} The first rule's suggestion, or `Suggestion.notFound(brokenPath)`
   *   when no rule yields one.
   */
  async analyzePath(brokenPath) {
    const { log } = this.context;
    log.info(`Analyzing broken path: ${brokenPath}`);

    for (const rule of this.rules) {
      try {
        // eslint-disable-next-line no-await-in-loop
        const suggestion = await rule.apply(brokenPath);

        if (suggestion) {
          log.info(`Rule ${rule.constructor.name} applied to ${brokenPath}`);
          return suggestion;
        }
      } catch (error) {
        log.error(`Error applying rule ${rule.constructor.name} to ${brokenPath}: ${error.message}`);
        // Continue to next rule
      }
    }

    log.warn(`No rules applied to ${brokenPath}`);
    return Suggestion.notFound(brokenPath);
  }

  /**
   * Checks the content status of LOCALE and SIMILAR suggestions. When the suggested
   * content is not published, the suggestion's `reason` is replaced by a hint to publish
   * it. All other suggestions pass through untouched.
   *
   * A suggested path that cannot be found in the path index is kept unchanged (with a
   * warning) instead of failing the whole post-processing step.
   *
   * @param {Array} suggestions - Suggestions produced by the rules; entries may be mutated.
   * @returns {Promise<Array>} The same suggestions, in the same order.
   */
  async processSuggestions(suggestions) {
    const { log } = this.context;
    log.info(`Post-processing ${suggestions.length} suggestions`);

    const processedSuggestions = [];

    for (const suggestion of suggestions) {
      const needsStatusCheck = suggestion.type === SuggestionType.LOCALE
        || suggestion.type === SuggestionType.SIMILAR;

      if (needsStatusCheck) {
        const { suggestedPath } = suggestion;
        log.debug(`Checking content status for suggestion: ${suggestedPath} with type: ${suggestion.type}`);

        // The path is expected to be indexed because it was suggested, but a missing
        // entry must not abort the processing of the remaining suggestions
        const contentPath = this.pathIndex.find(suggestedPath);

        if (!contentPath) {
          log.warn(`Suggested path ${suggestedPath} not found in path index, keeping suggestion unchanged`);
        } else if (contentPath.isPublished()) {
          log.debug(`Kept original suggestion type for ${suggestedPath} with status: ${contentPath.status}`);
        } else {
          suggestion.reason = `Content is in ${contentPath.status} state. Suggest publishing.`;
        }
      }

      processedSuggestions.push(suggestion);
    }

    return processedSuggestions;
  }
}
43 changes: 43 additions & 0 deletions src/content-fragment-404/cache/cache-strategy.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
/*
* Copyright 2025 Adobe. All rights reserved.
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. You may obtain a copy
* of the License at http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
* OF ANY KIND, either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/

/**
 * Raises the error used by every abstract CacheStrategy method.
 * @param {string} message - Error message naming the unimplemented method
 * @returns {never}
 */
function failAbstract(message) {
  throw new Error(message);
}

/**
 * Base class for cache strategies. Every method throws until a subclass overrides it.
 */
export class CacheStrategy {
  /**
   * Look up the direct children of a parent path.
   * @param {string} parentPath - The parent path
   * @returns {Array<ContentPath>} Array of child ContentPath objects
   */
  // eslint-disable-next-line no-unused-vars, class-methods-use-this
  findChildren(parentPath) {
    return failAbstract('findChildren() must be implemented by subclass');
  }

  /**
   * Store content items in the cache.
   * @param {Array} items - Array of content items
   * @param {Function} statusParser - Function to parse content status
   * @returns {void}
   */
  // eslint-disable-next-line no-unused-vars, class-methods-use-this
  cacheItems(items, statusParser) {
    failAbstract('cacheItems() must be implemented by subclass');
  }

  /**
   * Tell whether this cache strategy is available.
   * @returns {boolean}
   */
  // eslint-disable-next-line class-methods-use-this
  isAvailable() {
    return failAbstract('isAvailable() must be implemented by subclass');
  }
}
33 changes: 33 additions & 0 deletions src/content-fragment-404/cache/noop-cache.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
/*
* Copyright 2025 Adobe. All rights reserved.
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. You may obtain a copy
* of the License at http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
* OF ANY KIND, either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/

import { CacheStrategy } from './cache-strategy.js';

/**
 * Cache strategy that never stores anything: lookups come back empty, writes are
 * ignored, and the cache reports itself as unavailable.
 */
export class NoOpCache extends CacheStrategy {
  /**
   * @returns {Array} Always an empty array
   */
  // eslint-disable-next-line class-methods-use-this
  findChildren() {
    return [];
  }

  /**
   * Accepts the items and discards them.
   * @param {Array} items - Ignored
   * @param {Function} statusParser - Ignored
   * @returns {void}
   */
  // eslint-disable-next-line no-unused-vars, class-methods-use-this
  cacheItems(items, statusParser) {
    // Intentionally empty: nothing is cached
  }

  /**
   * @returns {boolean} Always false
   */
  // eslint-disable-next-line class-methods-use-this
  isAvailable() {
    return false;
  }
}
50 changes: 50 additions & 0 deletions src/content-fragment-404/cache/path-index-cache.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
/*
* Copyright 2025 Adobe. All rights reserved.
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. You may obtain a copy
* of the License at http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
* OF ANY KIND, either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/

import { ContentPath } from '../domain/content/content-path.js';
import { Locale } from '../domain/language/locale.js';
import { CacheStrategy } from './cache-strategy.js';

/**
 * Cache strategy that stores content paths in a PathIndex and answers child lookups
 * from it.
 */
export class PathIndexCache extends CacheStrategy {
  /**
   * @param {object} pathIndex - Index used for both storing and looking up paths
   */
  constructor(pathIndex) {
    super();
    this.pathIndex = pathIndex;
  }

  /**
   * Delegates to the path index.
   * @param {string} parentPath - The parent path
   * @returns {*} Whatever `pathIndex.findChildren(parentPath)` returns
   */
  findChildren(parentPath) {
    const children = this.pathIndex.findChildren(parentPath);
    return children;
  }

  /**
   * Converts each item into a ContentPath and inserts it into the path index.
   * Does nothing when `items` is falsy or has a length of zero.
   * @param {Array} items - Content items; `path` and `status` are read from each
   * @param {Function} statusParser - Turns an item's `status` into a content status
   * @returns {void}
   */
  cacheItems(items, statusParser) {
    const hasItems = Boolean(items) && items.length !== 0;
    if (!hasItems) {
      return;
    }

    for (const item of items) {
      const { path } = item;
      const status = statusParser(item.status);
      const locale = Locale.fromPath(path);
      this.pathIndex.insertContentPath(new ContentPath(path, status, locale));
    }
  }

  /**
   * @returns {boolean} Always true
   */
  // eslint-disable-next-line class-methods-use-this
  isAvailable() {
    return true;
  }
}
Loading