Skip to content
20 changes: 17 additions & 3 deletions src/metatags/handler.js
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ import {
} from './constants.js';
import { syncSuggestions } from '../utils/data-access.js';
import { createOpportunityData } from './opportunity-data-mapper.js';
import { validateDetectedIssues } from './ssr-meta-validator.js';

const auditType = Audit.AUDIT_TYPES.META_TAGS;
const { AUDIT_STEP_DESTINATIONS } = Audit;
Expand Down Expand Up @@ -242,20 +243,28 @@ export async function runAuditAndGenerateSuggestions(context) {
extractedTags,
} = await metatagsAutoDetect(site, scrapeResultPaths, context);

// Validate detected issues using SSR fallback to eliminate false positives
log.debug('Validating detected issues via SSR to remove false positives...');
const validatedDetectedTags = await validateDetectedIssues(
detectedTags,
site.getBaseURL(),
log,
);

// Calculate projected traffic lost
const {
projectedTrafficLost,
projectedTrafficValue,
} = await calculateProjectedTraffic(
context,
site,
detectedTags,
validatedDetectedTags,
log,
);

// Generate AI suggestions for detected tags if auto-suggest enabled for site
const allTags = {
detectedTags: seoChecks.getDetectedTags(),
detectedTags: validatedDetectedTags,
healthyTags: seoChecks.getFewHealthyTags(),
extractedTags,
};
Expand Down Expand Up @@ -315,7 +324,12 @@ export async function submitForScraping(context) {
return {
urls: finalUrls.map((url) => ({ url })),
siteId: site.getId(),
type: 'meta-tags',
type: 'default',
allowCache: false,
maxScrapeAge: 0,
options: {
waitTimeoutForMetaTags: 5000,
},
};
}

Expand Down
130 changes: 130 additions & 0 deletions src/metatags/ssr-meta-validator.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
/*
* Copyright 2024 Adobe. All rights reserved.
* This file is licensed to you under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. You may obtain a copy
* of the License at http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software distributed under
* the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS
* OF ANY KIND, either express or implied. See the License for the specific language
* governing permissions and limitations under the License.
*/

import { context as fetchContext } from '@adobe/fetch';
import * as cheerio from 'cheerio';
import { hasText } from '@adobe/spacecat-shared-utils';

const { fetch } = fetchContext();

/**
* Validates missing meta tags using SSR content fetched via HTTP.
* This is useful as a fallback when Puppeteer fails to capture tags that load with delays.
*
* @param {string} url - The URL to validate
* @param {Object} log - Logger instance
* @returns {Promise<Object>} Object containing title, description, and h1 tags found via SSR
*/
export async function validateMetaTagsViaSSR(url, log) {
try {
log.debug(`Validating meta tags via SSR for: ${url}`);
const response = await fetch(url, {
method: 'GET',
headers: {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Spacecat/1.0',
},
timeout: 10000,
});

if (!response.ok) {
log.warn(`SSR validation failed with status ${response.status} for ${url}`);
return null;
}

const html = await response.text();
const $ = cheerio.load(html);

const title = $('title').first().text()?.trim() || null;
const description = $('meta[name="description"]').attr('content')?.trim() || null;

const h1Tags = [];
$('h1').each((_, element) => {
const text = $(element).text()?.trim();
if (hasText(text)) {
h1Tags.push(text);
}
});

const result = {
title: hasText(title) ? title : null,
description: hasText(description) ? description : null,
h1: h1Tags.length > 0 ? h1Tags : null,
};
log.debug(`SSR validation result for ${url}: title=${!!result.title}, description=${!!result.description}, h1Count=${h1Tags.length}`);
return result;
} catch (error) {
log.warn(`Error during SSR validation for ${url}: ${error.message}`);
return null;
}
}

/**
* Validates if detected issues are false positives by checking SSR content.
* Updates the detectedTags object to remove false positives.
*
* @param {Object} detectedTags - Object containing detected tag issues by endpoint
* @param {string} baseUrl - Base URL of the site
* @param {Object} log - Logger instance
* @returns {Promise<Object>} Updated detectedTags with false positives removed
*/
export async function validateDetectedIssues(detectedTags, baseUrl, log) {
const endpoints = Object.keys(detectedTags);

if (endpoints.length === 0) {
return detectedTags;
}
log.debug(`Validating ${endpoints.length} endpoints with detected issues via SSR`);
const updatedDetectedTags = { ...detectedTags };
let falsePositivesRemoved = 0;

// Process endpoints sequentially to avoid overwhelming the server
for (const endpoint of endpoints) {
const tags = updatedDetectedTags[endpoint];
const fullUrl = `${baseUrl}${endpoint}`;

// Check if any of the issues are related to missing tags
const hasMissingIssues = ['title', 'description', 'h1'].some(
(tagName) => tags[tagName]?.issue?.includes('Missing'),
);

if (hasMissingIssues) {
// Validate via SSR
// eslint-disable-next-line no-await-in-loop
const ssrResult = await validateMetaTagsViaSSR(fullUrl, log);

if (ssrResult) {
// Check each tag type and remove false positives
const tagNames = ['title', 'description', 'h1'];
for (const tagName of tagNames) {
if (tags[tagName]?.issue?.includes('Missing') && ssrResult[tagName]) {
log.info(`False positive detected for ${tagName} on ${endpoint} - tag exists in SSR`);
delete updatedDetectedTags[endpoint][tagName];
falsePositivesRemoved += 1;
}
}

// If all issues were false positives, remove the endpoint entirely
if (Object.keys(updatedDetectedTags[endpoint]).length === 0) {
delete updatedDetectedTags[endpoint];
}

// Add a small delay to avoid rate limiting
// eslint-disable-next-line no-await-in-loop
await new Promise((resolve) => {
setTimeout(resolve, 100);
});
}
}
}
log.info(`SSR validation complete. Removed ${falsePositivesRemoved} false positives from ${endpoints.length} endpoints`);
return updatedDetectedTags;
}
34 changes: 32 additions & 2 deletions test/audits/metatags.test.js → test/metatags/metatags.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -309,7 +309,12 @@ describe('Meta Tags', () => {
{ url: 'http://example.com/page2' },
],
siteId: 'site-id',
type: 'meta-tags',
type: 'default',
allowCache: false,
maxScrapeAge: 0,
options: {
waitTimeoutForMetaTags: 5000,
},
});
});

Expand All @@ -336,7 +341,12 @@ describe('Meta Tags', () => {
{ url: 'http://example.com/page2' },
],
siteId: 'site-id',
type: 'meta-tags',
type: 'default',
allowCache: false,
maxScrapeAge: 0,
options: {
waitTimeoutForMetaTags: 5000,
},
});
});
});
Expand Down Expand Up @@ -1126,6 +1136,8 @@ describe('Meta Tags', () => {
.resolves('mockedDomainKey');
const mockCalculateCPCValue = sinon.stub()
.resolves(5000);
const mockValidateDetectedIssues = sinon.stub()
.callsFake(async (detectedTags) => detectedTags);
const auditStub = await esmock('../../src/metatags/handler.js', {
'../../src/support/utils.js': {
getRUMDomainkey: mockGetRUMDomainkey,
Expand All @@ -1148,6 +1160,9 @@ describe('Meta Tags', () => {
},
},
}),
'../../src/metatags/ssr-meta-validator.js': {
validateDetectedIssues: mockValidateDetectedIssues,
},
});
const result = await auditStub.runAuditAndGenerateSuggestions(context);

Expand All @@ -1164,6 +1179,8 @@ describe('Meta Tags', () => {
.resolves('mockedDomainKey');
const mockCalculateCPCValue = sinon.stub()
.resolves(2);
const mockValidateDetectedIssues = sinon.stub()
.callsFake(async (detectedTags) => detectedTags);
const auditStub = await esmock('../../src/metatags/handler.js', {
'../../src/support/utils.js': {
getRUMDomainkey: mockGetRUMDomainkey,
Expand All @@ -1173,6 +1190,9 @@ describe('Meta Tags', () => {
'../../src/common/index.js': { wwwUrlResolver: (siteObj) => siteObj.getBaseURL() },
'../../src/metatags/metatags-auto-suggest.js': sinon.stub()
.resolves({}),
'../../src/metatags/ssr-meta-validator.js': {
validateDetectedIssues: mockValidateDetectedIssues,
},
});

// Override all S3 responses to have null tags
Expand Down Expand Up @@ -1213,6 +1233,8 @@ describe('Meta Tags', () => {
.resolves('mockedDomainKey');
const mockCalculateCPCValue = sinon.stub()
.resolves(2);
const mockValidateDetectedIssues = sinon.stub()
.callsFake(async (detectedTags) => detectedTags);
const auditStub = await esmock('../../src/metatags/handler.js', {
'../../src/support/utils.js': {
getRUMDomainkey:
Expand All @@ -1223,6 +1245,9 @@ describe('Meta Tags', () => {
'../../src/common/index.js': { wwwUrlResolver: (siteObj) => siteObj.getBaseURL() },
'../../src/metatags/metatags-auto-suggest.js': sinon.stub()
.resolves({}),
'../../src/metatags/ssr-meta-validator.js': {
validateDetectedIssues: mockValidateDetectedIssues,
},
});
// Override RUM API response to simulate error
RUMAPIClientStub.createFrom()
Expand All @@ -1245,6 +1270,8 @@ describe('Meta Tags', () => {
it('should submit top pages for scraping when getIncludedURLs returns null', async () => {
const mockGetRUMDomainkey = sinon.stub().resolves('mockedDomainKey');
const mockCalculateCPCValue = sinon.stub().resolves(2);
const mockValidateDetectedIssues = sinon.stub()
.callsFake(async (detectedTags) => detectedTags);
const getConfigStub = sinon.stub().returns({
getIncludedURLs: sinon.stub().returns(null),
});
Expand All @@ -1258,6 +1285,9 @@ describe('Meta Tags', () => {
'@adobe/spacecat-shared-rum-api-client': RUMAPIClientStub,
'../../src/common/index.js': { wwwUrlResolver: (siteObj) => siteObj.getBaseURL() },
'../../src/metatags/metatags-auto-suggest.js': sinon.stub().resolves({}),
'../../src/metatags/ssr-meta-validator.js': {
validateDetectedIssues: mockValidateDetectedIssues,
},
});
const result = await auditStub.runAuditAndGenerateSuggestions(context);
expect(result).to.deep.equal({ status: 'complete' });
Expand Down
Loading