diff --git a/src/metatags/handler.js b/src/metatags/handler.js index 9a05d6884..35c1d7ffa 100644 --- a/src/metatags/handler.js +++ b/src/metatags/handler.js @@ -28,6 +28,7 @@ import { } from './constants.js'; import { syncSuggestions } from '../utils/data-access.js'; import { createOpportunityData } from './opportunity-data-mapper.js'; +import { validateDetectedIssues } from './ssr-meta-validator.js'; const auditType = Audit.AUDIT_TYPES.META_TAGS; const { AUDIT_STEP_DESTINATIONS } = Audit; @@ -242,6 +243,14 @@ export async function runAuditAndGenerateSuggestions(context) { extractedTags, } = await metatagsAutoDetect(site, scrapeResultPaths, context); + // Validate detected issues using SSR fallback to eliminate false positives + log.debug('Validating detected issues via SSR to remove false positives...'); + const validatedDetectedTags = await validateDetectedIssues( + detectedTags, + site.getBaseURL(), + log, + ); + // Calculate projected traffic lost const { projectedTrafficLost, @@ -249,13 +258,13 @@ export async function runAuditAndGenerateSuggestions(context) { } = await calculateProjectedTraffic( context, site, - detectedTags, + validatedDetectedTags, log, ); // Generate AI suggestions for detected tags if auto-suggest enabled for site const allTags = { - detectedTags: seoChecks.getDetectedTags(), + detectedTags: validatedDetectedTags, healthyTags: seoChecks.getFewHealthyTags(), extractedTags, }; @@ -315,7 +324,12 @@ export async function submitForScraping(context) { return { urls: finalUrls.map((url) => ({ url })), siteId: site.getId(), - type: 'meta-tags', + type: 'default', + allowCache: false, + maxScrapeAge: 0, + options: { + waitTimeoutForMetaTags: 5000, + }, }; } diff --git a/src/metatags/ssr-meta-validator.js b/src/metatags/ssr-meta-validator.js new file mode 100644 index 000000000..11f18bb07 --- /dev/null +++ b/src/metatags/ssr-meta-validator.js @@ -0,0 +1,130 @@ +/* + * Copyright 2024 Adobe. All rights reserved. + * This file is licensed to you under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. You may obtain a copy + * of the License at http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS + * OF ANY KIND, either express or implied. See the License for the specific language + * governing permissions and limitations under the License. + */ + +import { context as fetchContext } from '@adobe/fetch'; +import * as cheerio from 'cheerio'; +import { hasText } from '@adobe/spacecat-shared-utils'; + +const { fetch } = fetchContext(); + +/** + * Validates missing meta tags using SSR content fetched via HTTP. + * This is useful as a fallback when Puppeteer fails to capture tags that load with delays. + * + * @param {string} url - The URL to validate + * @param {Object} log - Logger instance + * @returns {Promise} Object containing title, description, and h1 tags found via SSR + */ +export async function validateMetaTagsViaSSR(url, log) { + try { + log.debug(`Validating meta tags via SSR for: ${url}`); + const response = await fetch(url, { + method: 'GET', + headers: { + 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Spacecat/1.0', + }, + timeout: 10000, + }); + + if (!response.ok) { + log.warn(`SSR validation failed with status ${response.status} for ${url}`); + return null; + } + + const html = await response.text(); + const $ = cheerio.load(html); + + const title = $('title').first().text()?.trim() || null; + const description = $('meta[name="description"]').attr('content')?.trim() || null; + + const h1Tags = []; + $('h1').each((_, element) => { + const text = $(element).text()?.trim(); + if (hasText(text)) { + h1Tags.push(text); + } + }); + + const result = { + title: hasText(title) ? title : null, + description: hasText(description) ? description : null, + h1: h1Tags.length > 0 ? h1Tags : null, + }; + log.debug(`SSR validation result for ${url}: title=${!!result.title}, description=${!!result.description}, h1Count=${h1Tags.length}`); + return result; + } catch (error) { + log.warn(`Error during SSR validation for ${url}: ${error.message}`); + return null; + } +} + +/** + * Validates if detected issues are false positives by checking SSR content. + * Updates the detectedTags object to remove false positives. + * + * @param {Object} detectedTags - Object containing detected tag issues by endpoint + * @param {string} baseUrl - Base URL of the site + * @param {Object} log - Logger instance + * @returns {Promise} Updated detectedTags with false positives removed + */ +export async function validateDetectedIssues(detectedTags, baseUrl, log) { + const endpoints = Object.keys(detectedTags); + + if (endpoints.length === 0) { + return detectedTags; + } + log.debug(`Validating ${endpoints.length} endpoints with detected issues via SSR`); + const updatedDetectedTags = { ...detectedTags }; + let falsePositivesRemoved = 0; + + // Process endpoints sequentially to avoid overwhelming the server + for (const endpoint of endpoints) { + const tags = updatedDetectedTags[endpoint]; + const fullUrl = `${baseUrl}${endpoint}`; + + // Check if any of the issues are related to missing tags + const hasMissingIssues = ['title', 'description', 'h1'].some( + (tagName) => tags[tagName]?.issue?.includes('Missing'), + ); + + if (hasMissingIssues) { + // Validate via SSR + // eslint-disable-next-line no-await-in-loop + const ssrResult = await validateMetaTagsViaSSR(fullUrl, log); + + if (ssrResult) { + // Check each tag type and remove false positives + const tagNames = ['title', 'description', 'h1']; + for (const tagName of tagNames) { + if (tags[tagName]?.issue?.includes('Missing') && ssrResult[tagName]) { + log.info(`False positive detected for ${tagName} on ${endpoint} - tag exists in SSR`); + delete updatedDetectedTags[endpoint][tagName]; + falsePositivesRemoved += 1; + } + } + + // If all issues were false positives, remove the endpoint entirely + if (Object.keys(updatedDetectedTags[endpoint]).length === 0) { + delete updatedDetectedTags[endpoint]; + } + + // Add a small delay to avoid rate limiting + // eslint-disable-next-line no-await-in-loop + await new Promise((resolve) => { + setTimeout(resolve, 100); + }); + } + } + } + log.info(`SSR validation complete. Removed ${falsePositivesRemoved} false positives from ${endpoints.length} endpoints`); + return updatedDetectedTags; +} diff --git a/test/audits/metatags.test.js b/test/metatags/metatags.test.js similarity index 97% rename from test/audits/metatags.test.js rename to test/metatags/metatags.test.js index cff167d53..6cac2b2f5 100644 --- a/test/audits/metatags.test.js +++ b/test/metatags/metatags.test.js @@ -309,7 +309,12 @@ describe('Meta Tags', () => { { url: 'http://example.com/page2' }, ], siteId: 'site-id', - type: 'meta-tags', + type: 'default', + allowCache: false, + maxScrapeAge: 0, + options: { + waitTimeoutForMetaTags: 5000, + }, }); }); @@ -336,7 +341,12 @@ describe('Meta Tags', () => { { url: 'http://example.com/page2' }, ], siteId: 'site-id', - type: 'meta-tags', + type: 'default', + allowCache: false, + maxScrapeAge: 0, + options: { + waitTimeoutForMetaTags: 5000, + }, }); }); }); @@ -1126,6 +1136,8 @@ describe('Meta Tags', () => { .resolves('mockedDomainKey'); const mockCalculateCPCValue = sinon.stub() .resolves(5000); + const mockValidateDetectedIssues = sinon.stub() + .callsFake(async (detectedTags) => detectedTags); const auditStub = await esmock('../../src/metatags/handler.js', { '../../src/support/utils.js': { getRUMDomainkey: mockGetRUMDomainkey, @@ -1148,6 +1160,9 @@ describe('Meta Tags', () => { }, }, }), + '../../src/metatags/ssr-meta-validator.js': { + validateDetectedIssues: mockValidateDetectedIssues, + }, }); const result = await auditStub.runAuditAndGenerateSuggestions(context); @@ -1164,6 +1179,8 @@ describe('Meta Tags', () => { .resolves('mockedDomainKey'); const mockCalculateCPCValue = sinon.stub() .resolves(2); + const mockValidateDetectedIssues = sinon.stub() + .callsFake(async (detectedTags) => detectedTags); const auditStub = await esmock('../../src/metatags/handler.js', { '../../src/support/utils.js': { getRUMDomainkey: mockGetRUMDomainkey, @@ -1173,6 +1190,9 @@ describe('Meta Tags', () => { '../../src/common/index.js': { wwwUrlResolver: (siteObj) => siteObj.getBaseURL() }, '../../src/metatags/metatags-auto-suggest.js': sinon.stub() .resolves({}), + '../../src/metatags/ssr-meta-validator.js': { + validateDetectedIssues: mockValidateDetectedIssues, + }, }); // Override all S3 responses to have null tags @@ -1213,6 +1233,8 @@ describe('Meta Tags', () => { .resolves('mockedDomainKey'); const mockCalculateCPCValue = sinon.stub() .resolves(2); + const mockValidateDetectedIssues = sinon.stub() + .callsFake(async (detectedTags) => detectedTags); const auditStub = await esmock('../../src/metatags/handler.js', { '../../src/support/utils.js': { getRUMDomainkey: @@ -1223,6 +1245,9 @@ describe('Meta Tags', () => { '../../src/common/index.js': { wwwUrlResolver: (siteObj) => siteObj.getBaseURL() }, '../../src/metatags/metatags-auto-suggest.js': sinon.stub() .resolves({}), + '../../src/metatags/ssr-meta-validator.js': { + validateDetectedIssues: mockValidateDetectedIssues, + }, }); // Override RUM API response to simulate error RUMAPIClientStub.createFrom() @@ -1245,6 +1270,8 @@ describe('Meta Tags', () => { it('should submit top pages for scraping when getIncludedURLs returns null', async () => { const mockGetRUMDomainkey = sinon.stub().resolves('mockedDomainKey'); const mockCalculateCPCValue = sinon.stub().resolves(2); + const mockValidateDetectedIssues = sinon.stub() + .callsFake(async (detectedTags) => detectedTags); const getConfigStub = sinon.stub().returns({ getIncludedURLs: sinon.stub().returns(null), }); @@ -1258,6 +1285,9 @@ describe('Meta Tags', () => { '@adobe/spacecat-shared-rum-api-client': RUMAPIClientStub, '../../src/common/index.js': { wwwUrlResolver: (siteObj) => siteObj.getBaseURL() }, '../../src/metatags/metatags-auto-suggest.js': sinon.stub().resolves({}), + '../../src/metatags/ssr-meta-validator.js': { + validateDetectedIssues: mockValidateDetectedIssues, + }, }); const result = await auditStub.runAuditAndGenerateSuggestions(context); expect(result).to.deep.equal({ status: 'complete' }); diff --git a/test/metatags/ssr-validator.test.js b/test/metatags/ssr-validator.test.js new file mode 100644 index 000000000..1bb5d6883 --- /dev/null +++ b/test/metatags/ssr-validator.test.js @@ -0,0 +1,358 @@ +/* + * Copyright 2024 Adobe. All rights reserved. + * This file is licensed to you under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. You may obtain a copy + * of the License at http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS + * OF ANY KIND, either express or implied. See the License for the specific language + * governing permissions and limitations under the License. + */ + +/* eslint-env mocha */ + +import { expect } from 'chai'; +import sinon from 'sinon'; +import esmock from 'esmock'; + +describe('SSR Validator', () => { + let validateMetaTagsViaSSR; + let validateDetectedIssues; + let fetchStub; + let log; + + beforeEach(async () => { + fetchStub = sinon.stub(); + log = { + info: sinon.stub(), + debug: sinon.stub(), + warn: sinon.stub(), + error: sinon.stub(), + }; + + const ssrValidator = await esmock('../../src/metatags/ssr-meta-validator.js', { + '@adobe/fetch': { + context: () => ({ fetch: fetchStub }), + }, + }); + + validateMetaTagsViaSSR = ssrValidator.validateMetaTagsViaSSR; + validateDetectedIssues = ssrValidator.validateDetectedIssues; + }); + + afterEach(() => { + sinon.restore(); + }); + + describe('validateMetaTagsViaSSR', () => { + it('should successfully extract meta tags from HTML', async () => { + const html = ` + + + + Test Page Title + + + +

Main Heading

+

Secondary Heading

+ + + `; + + fetchStub.resolves({ + ok: true, + status: 200, + text: async () => html, + }); + + const result = await validateMetaTagsViaSSR('https://example.com/page', log); + + expect(result).to.deep.equal({ + title: 'Test Page Title', + description: 'Test page description', + h1: ['Main Heading', 'Secondary Heading'], + }); + expect(log.debug.calledWith('Validating meta tags via SSR for: https://example.com/page')).to.be.true; + }); + + it('should return null for missing title', async () => { + const html = ` + + + + + + +

Main Heading

+ + + `; + + fetchStub.resolves({ + ok: true, + status: 200, + text: async () => html, + }); + + const result = await validateMetaTagsViaSSR('https://example.com/page', log); + + expect(result.title).to.be.null; + expect(result.description).to.equal('Test page description'); + expect(result.h1).to.deep.equal(['Main Heading']); + }); + + it('should return null for missing description', async () => { + const html = ` + + + + Test Page Title + + +

Main Heading

+ + + `; + + fetchStub.resolves({ + ok: true, + status: 200, + text: async () => html, + }); + + const result = await validateMetaTagsViaSSR('https://example.com/page', log); + + expect(result.title).to.equal('Test Page Title'); + expect(result.description).to.be.null; + expect(result.h1).to.deep.equal(['Main Heading']); + }); + + it('should return null for missing h1 tags', async () => { + const html = ` + + + + Test Page Title + + + +

Not an H1

+ + + `; + + fetchStub.resolves({ + ok: true, + status: 200, + text: async () => html, + }); + + const result = await validateMetaTagsViaSSR('https://example.com/page', log); + + expect(result.title).to.equal('Test Page Title'); + expect(result.description).to.equal('Test page description'); + expect(result.h1).to.be.null; + }); + + it('should return null when fetch fails', async () => { + fetchStub.resolves({ + ok: false, + status: 404, + }); + + const result = await validateMetaTagsViaSSR('https://example.com/notfound', log); + + expect(result).to.be.null; + expect(log.warn.calledWith('SSR validation failed with status 404 for https://example.com/notfound')).to.be.true; + }); + + it('should return null when fetch throws an error', async () => { + fetchStub.rejects(new Error('Network error')); + + const result = await validateMetaTagsViaSSR('https://example.com/error', log); + + expect(result).to.be.null; + expect(log.warn.calledWith(sinon.match('Error during SSR validation'))).to.be.true; + }); + + it('should handle empty/whitespace-only tags', async () => { + const html = ` + + + + + + + +

+ + + `; + + fetchStub.resolves({ + ok: true, + status: 200, + text: async () => html, + }); + + const result = await validateMetaTagsViaSSR('https://example.com/empty', log); + + expect(result.title).to.be.null; + expect(result.description).to.be.null; + expect(result.h1).to.be.null; + }); + }); + + describe('validateDetectedIssues', () => { + beforeEach(() => { + // Mock the internal validateMetaTagsViaSSR calls + sinon.stub(Date, 'now').returns(1000); + }); + + it('should remove false positives for missing tags', async () => { + const detectedTags = { + '/page1': { + title: { issue: 'Missing title', tagContent: '' }, + description: { issue: 'Missing description', tagContent: '' }, + }, + '/page2': { + h1: { issue: 'Missing h1', tagContent: '' }, + }, + }; + + const html1 = ` + + + Actual Title + + + + `; + + const html2 = ` + + +

Actual H1

+ + + `; + + fetchStub.onFirstCall().resolves({ + ok: true, + status: 200, + text: async () => html1, + }); + + fetchStub.onSecondCall().resolves({ + ok: true, + status: 200, + text: async () => html2, + }); + + const result = await validateDetectedIssues(detectedTags, 'https://example.com', log); + + expect(result).to.deep.equal({}); + expect(log.info.calledWith(sinon.match('False positive detected'))).to.be.true; + }); + + it('should keep legitimate issues', async () => { + const detectedTags = { + '/page1': { + title: { issue: 'Missing title', tagContent: '' }, + description: { issue: 'Too short description', tagContent: 'Short' }, + }, + }; + + const html1 = ` + + + + + + `; + + fetchStub.resolves({ + ok: true, + status: 200, + text: async () => html1, + }); + + const result = await validateDetectedIssues(detectedTags, 'https://example.com', log); + + // Title is still missing (not in SSR), description issue is not about missing + expect(result['/page1'].title).to.exist; + expect(result['/page1'].description).to.exist; + }); + + it('should skip endpoints without missing issues', async () => { + const detectedTags = { + '/page1': { + title: { issue: 'Too long title', tagContent: 'Very long title' }, + description: { issue: 'Duplicate description', tagContent: 'Duplicate' }, + }, + }; + + const result = await validateDetectedIssues(detectedTags, 'https://example.com', log); + + expect(result).to.deep.equal(detectedTags); + expect(fetchStub.called).to.be.false; + }); + + it('should handle validation errors gracefully', async () => { + const detectedTags = { + '/page1': { + title: { issue: 'Missing title', tagContent: '' }, + }, + }; + + fetchStub.rejects(new Error('Network error')); + + const result = await validateDetectedIssues(detectedTags, 'https://example.com', log); + + // Should keep the issue if validation fails + expect(result['/page1'].title).to.exist; + expect(log.warn.called).to.be.true; + }); + + it('should return unchanged when no detected tags', async () => { + const detectedTags = {}; + + const result = await validateDetectedIssues(detectedTags, 'https://example.com', log); + + expect(result).to.deep.equal({}); + expect(fetchStub.called).to.be.false; + }); + + it('should partially remove false positives', async () => { + const detectedTags = { + '/page1': { + title: { issue: 'Missing title', tagContent: '' }, + description: { issue: 'Missing description', tagContent: '' }, + }, + }; + + // SSR only has title, not description + const html = ` + + + Actual Title + + + `; + + fetchStub.resolves({ + ok: true, + status: 200, + text: async () => html, + }); + + const result = await validateDetectedIssues(detectedTags, 'https://example.com', log); + + // Title should be removed (false positive), description should remain + expect(result['/page1'].title).to.be.undefined; + expect(result['/page1'].description).to.exist; + }); + }); +});