From 01f02ba9c12aae550f206bda70862268885fee2c Mon Sep 17 00:00:00 2001 From: Divyansh Pratap Date: Thu, 9 Oct 2025 03:32:29 +0530 Subject: [PATCH 01/12] fix: add wait for metatags --- src/metatags/handler.js | 3 +++ test/audits/metatags.test.js | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/src/metatags/handler.js b/src/metatags/handler.js index 8b1b9b29f..f63a1090f 100644 --- a/src/metatags/handler.js +++ b/src/metatags/handler.js @@ -320,6 +320,9 @@ export async function submitForScraping(context) { urls: finalUrls.map((url) => ({ url })), siteId: site.getId(), type: 'meta-tags', + options: { + waitTimeoutForMetaTags: 5000, + }, }; } diff --git a/test/audits/metatags.test.js b/test/audits/metatags.test.js index 1d3805b35..b764b3a9e 100644 --- a/test/audits/metatags.test.js +++ b/test/audits/metatags.test.js @@ -310,6 +310,9 @@ describe('Meta Tags', () => { ], siteId: 'site-id', type: 'meta-tags', + options: { + waitTimeoutForMetaTags: 5000, + }, }); }); @@ -337,6 +340,9 @@ describe('Meta Tags', () => { ], siteId: 'site-id', type: 'meta-tags', + options: { + waitTimeoutForMetaTags: 5000, + }, }); }); }); From eba2fc4b50bc384ac021d4379fa2941bf625b7e2 Mon Sep 17 00:00:00 2001 From: Divyansh Pratap Date: Sun, 12 Oct 2025 19:15:39 +0530 Subject: [PATCH 02/12] fix: allow cache false --- src/metatags/handler.js | 1 + test/audits/metatags.test.js | 2 ++ 2 files changed, 3 insertions(+) diff --git a/src/metatags/handler.js b/src/metatags/handler.js index f63a1090f..623b872ee 100644 --- a/src/metatags/handler.js +++ b/src/metatags/handler.js @@ -320,6 +320,7 @@ export async function submitForScraping(context) { urls: finalUrls.map((url) => ({ url })), siteId: site.getId(), type: 'meta-tags', + allowCache: false, options: { waitTimeoutForMetaTags: 5000, }, diff --git a/test/audits/metatags.test.js b/test/audits/metatags.test.js index b764b3a9e..8eb53efbf 100644 --- a/test/audits/metatags.test.js +++ b/test/audits/metatags.test.js @@ -310,6 +310,7 @@ describe('Meta Tags', () => { ], siteId: 'site-id', type: 'meta-tags', + allowCache: false, options: { waitTimeoutForMetaTags: 5000, }, @@ -340,6 +341,7 @@ describe('Meta Tags', () => { ], siteId: 'site-id', type: 'meta-tags', + allowCache: false, options: { waitTimeoutForMetaTags: 5000, }, From dfb24f1163f60d2eada9eced6acee70d11aa7eea Mon Sep 17 00:00:00 2001 From: Divyansh Pratap Date: Mon, 13 Oct 2025 01:32:47 +0530 Subject: [PATCH 03/12] fix: add debug log --- src/metatags/handler.js | 1 + 1 file changed, 1 insertion(+) diff --git a/src/metatags/handler.js b/src/metatags/handler.js index 623b872ee..791cb5963 100644 --- a/src/metatags/handler.js +++ b/src/metatags/handler.js @@ -315,6 +315,7 @@ export async function submitForScraping(context) { if (finalUrls.length === 0) { throw new Error('No URLs found for site neither top pages nor included URLs'); } + log.info(`Submitting for scraping ${finalUrls.length} with allowCache: false`); return { urls: finalUrls.map((url) => ({ url })), From cd231d62202dd00bfecb70d46505134d441838e0 Mon Sep 17 00:00:00 2001 From: Divyansh Pratap Date: Mon, 13 Oct 2025 03:14:59 +0530 Subject: [PATCH 04/12] fix: forced scraping --- src/metatags/handler.js | 1 + test/audits/metatags.test.js | 2 ++ 2 files changed, 3 insertions(+) diff --git a/src/metatags/handler.js b/src/metatags/handler.js index 791cb5963..cae620583 100644 --- a/src/metatags/handler.js +++ b/src/metatags/handler.js @@ -322,6 +322,7 @@ export async function submitForScraping(context) { siteId: site.getId(), type: 'meta-tags', allowCache: false, + 
maxScrapeAge: 0, options: { waitTimeoutForMetaTags: 5000, }, diff --git a/test/audits/metatags.test.js b/test/audits/metatags.test.js index 8eb53efbf..785e67066 100644 --- a/test/audits/metatags.test.js +++ b/test/audits/metatags.test.js @@ -311,6 +311,7 @@ describe('Meta Tags', () => { siteId: 'site-id', type: 'meta-tags', allowCache: false, + maxScrapeAge: 0, options: { waitTimeoutForMetaTags: 5000, }, @@ -342,6 +343,7 @@ describe('Meta Tags', () => { siteId: 'site-id', type: 'meta-tags', allowCache: false, + maxScrapeAge: 0, options: { waitTimeoutForMetaTags: 5000, }, From b2377ef97afe41cf20db3992b8b266611305244f Mon Sep 17 00:00:00 2001 From: Divyansh Pratap Date: Mon, 13 Oct 2025 03:36:04 +0530 Subject: [PATCH 05/12] fix: forced scraping --- src/metatags/handler.js | 2 +- test/audits/metatags.test.js | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/metatags/handler.js b/src/metatags/handler.js index cae620583..8b31c5235 100644 --- a/src/metatags/handler.js +++ b/src/metatags/handler.js @@ -322,7 +322,7 @@ export async function submitForScraping(context) { siteId: site.getId(), type: 'meta-tags', allowCache: false, - maxScrapeAge: 0, + maxScrapeAge: 0.001, options: { waitTimeoutForMetaTags: 5000, }, diff --git a/test/audits/metatags.test.js b/test/audits/metatags.test.js index 785e67066..443ecfa1b 100644 --- a/test/audits/metatags.test.js +++ b/test/audits/metatags.test.js @@ -311,7 +311,7 @@ describe('Meta Tags', () => { siteId: 'site-id', type: 'meta-tags', allowCache: false, - maxScrapeAge: 0, + maxScrapeAge: 0.001, options: { waitTimeoutForMetaTags: 5000, }, @@ -343,7 +343,7 @@ describe('Meta Tags', () => { siteId: 'site-id', type: 'meta-tags', allowCache: false, - maxScrapeAge: 0, + maxScrapeAge: 0.001, options: { waitTimeoutForMetaTags: 5000, }, From 20a852303f2bf13caf3fb049b15ce74706b33c7e Mon Sep 17 00:00:00 2001 From: Divyansh Pratap Date: Mon, 13 Oct 2025 18:33:54 +0530 Subject: [PATCH 06/12] fix: add validation --- package-lock.json | 6 +- package.json | 2 +- src/metatags/handler.js | 15 +- src/metatags/ssr-meta-validator.js | 132 ++++++++ test/{audits => metatags}/metatags.test.js | 24 +- test/metatags/ssr-validator.test.js | 358 +++++++++++++++++++++ 6 files changed, 528 insertions(+), 9 deletions(-) create mode 100644 src/metatags/ssr-meta-validator.js rename test/{audits => metatags}/metatags.test.js (98%) create mode 100644 test/metatags/ssr-validator.test.js diff --git a/package-lock.json b/package-lock.json index 8925486a6..22a72fe0c 100644 --- a/package-lock.json +++ b/package-lock.json @@ -18,7 +18,7 @@ "@adobe/spacecat-helix-content-sdk": "1.4.24", "@adobe/spacecat-shared-ahrefs-client": "1.9.10", "@adobe/spacecat-shared-athena-client": "1.3.5", - "@adobe/spacecat-shared-data-access": "2.71.0", + "@adobe/spacecat-shared-data-access": "https://gitpkg.now.sh/adobe/spacecat-shared/packages/spacecat-shared-data-access?meta-scrape", "@adobe/spacecat-shared-google-client": "1.4.49", "@adobe/spacecat-shared-gpt-client": "1.6.5", "@adobe/spacecat-shared-http-utils": "1.17.6", @@ -8439,8 +8439,8 @@ }, "node_modules/@adobe/spacecat-shared-data-access": { "version": "2.71.0", - "resolved": "https://registry.npmjs.org/@adobe/spacecat-shared-data-access/-/spacecat-shared-data-access-2.71.0.tgz", - "integrity": "sha512-ukYib5540epX0RiXAqHD1WHW4sQa+1uRiOgpZBbzm0busuIUiOq4cZv7ssQ2muYjGkVclX/k7y70rd6dYM9cMA==", + "resolved": "https://gitpkg.now.sh/adobe/spacecat-shared/packages/spacecat-shared-data-access?meta-scrape", + "integrity": 
"sha512-yyUQo1fULsRYk4fK5cVz/XPoJ109mKNSNt42SLkAdIZMMB3q4TOjtXHDPyesIuD5BOrU2Irk6EwrGSHYmsbI7A==", "license": "Apache-2.0", "dependencies": { "@adobe/spacecat-shared-utils": "1.49.0", diff --git a/package.json b/package.json index bc13e05f7..356797cd8 100755 --- a/package.json +++ b/package.json @@ -80,7 +80,7 @@ "@adobe/spacecat-helix-content-sdk": "1.4.24", "@adobe/spacecat-shared-ahrefs-client": "1.9.10", "@adobe/spacecat-shared-athena-client": "1.3.5", - "@adobe/spacecat-shared-data-access": "2.71.0", + "@adobe/spacecat-shared-data-access": "https://gitpkg.now.sh/adobe/spacecat-shared/packages/spacecat-shared-data-access?meta-scrape", "@adobe/spacecat-shared-google-client": "1.4.49", "@adobe/spacecat-shared-gpt-client": "1.6.5", "@adobe/spacecat-shared-http-utils": "1.17.6", diff --git a/src/metatags/handler.js b/src/metatags/handler.js index 8b31c5235..8c60d297e 100644 --- a/src/metatags/handler.js +++ b/src/metatags/handler.js @@ -28,6 +28,7 @@ import { } from './constants.js'; import { syncSuggestions } from '../utils/data-access.js'; import { createOpportunityData } from './opportunity-data-mapper.js'; +import { validateDetectedIssues } from './ssr-meta-validator.js'; const auditType = Audit.AUDIT_TYPES.META_TAGS; const { AUDIT_STEP_DESTINATIONS } = Audit; @@ -246,6 +247,14 @@ export async function runAuditAndGenerateSuggestions(context) { extractedTags, } = await metatagsAutoDetect(site, scrapeResultPaths, context); + // Validate detected issues using SSR fallback to eliminate false positives + log.debug('Validating detected issues via SSR to remove false positives...'); + const validatedDetectedTags = await validateDetectedIssues( + detectedTags, + site.getBaseURL(), + log, + ); + // Calculate projected traffic lost const { projectedTrafficLost, @@ -253,13 +262,13 @@ export async function runAuditAndGenerateSuggestions(context) { } = await calculateProjectedTraffic( context, site, - detectedTags, + validatedDetectedTags, log, ); // Generate AI suggestions for detected tags if auto-suggest enabled for site const allTags = { - detectedTags: seoChecks.getDetectedTags(), + detectedTags: validatedDetectedTags, healthyTags: seoChecks.getFewHealthyTags(), extractedTags, }; @@ -322,7 +331,7 @@ export async function submitForScraping(context) { siteId: site.getId(), type: 'meta-tags', allowCache: false, - maxScrapeAge: 0.001, + maxScrapeAge: 0, options: { waitTimeoutForMetaTags: 5000, }, diff --git a/src/metatags/ssr-meta-validator.js b/src/metatags/ssr-meta-validator.js new file mode 100644 index 000000000..608ce5c22 --- /dev/null +++ b/src/metatags/ssr-meta-validator.js @@ -0,0 +1,132 @@ +/* + * Copyright 2024 Adobe. All rights reserved. + * This file is licensed to you under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. You may obtain a copy + * of the License at http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS + * OF ANY KIND, either express or implied. See the License for the specific language + * governing permissions and limitations under the License. + */ + +import { context as fetchContext } from '@adobe/fetch'; +import * as cheerio from 'cheerio'; +import { hasText } from '@adobe/spacecat-shared-utils'; + +const { fetch } = fetchContext(); + +/** + * Validates missing meta tags using SSR content fetched via HTTP. 
+ * This is useful as a fallback when Puppeteer fails to capture tags that load with delays. + * + * @param {string} url - The URL to validate + * @param {Object} log - Logger instance + * @returns {Promise} Object containing title, description, and h1 tags found via SSR + */ +export async function validateMetaTagsViaSSR(url, log) { + try { + log.debug(`Validating meta tags via SSR for: ${url}`); + const response = await fetch(url, { + method: 'GET', + headers: { + 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Spacecat/1.0', + }, + timeout: 10000, + }); + + if (!response.ok) { + log.warn(`SSR validation failed with status ${response.status} for ${url}`); + return null; + } + + const html = await response.text(); + const $ = cheerio.load(html); + + const title = $('title').first().text()?.trim() || null; + const description = $('meta[name="description"]').attr('content')?.trim() || null; + + const h1Tags = []; + $('h1').each((_, element) => { + const text = $(element).text()?.trim(); + if (hasText(text)) { + h1Tags.push(text); + } + }); + + const result = { + title: hasText(title) ? title : null, + description: hasText(description) ? description : null, + h1: h1Tags.length > 0 ? h1Tags : null, + }; + log.debug(`SSR validation result for ${url}: title=${!!result.title}, description=${!!result.description}, h1Count=${h1Tags.length}`); + return result; + } catch (error) { + log.warn(`Error during SSR validation for ${url}: ${error.message}`); + return null; + } +} + +/** + * Validates if detected issues are false positives by checking SSR content. + * Updates the detectedTags object to remove false positives. + * + * @param {Object} detectedTags - Object containing detected tag issues by endpoint + * @param {string} baseUrl - Base URL of the site + * @param {Object} log - Logger instance + * @returns {Promise} Updated detectedTags with false positives removed + */ +export async function validateDetectedIssues(detectedTags, baseUrl, log) { + const endpoints = Object.keys(detectedTags); + + if (endpoints.length === 0) { + return detectedTags; + } + + log.info(`Validating ${endpoints.length} endpoints with detected issues via SSR`); + + const updatedDetectedTags = { ...detectedTags }; + let falsePositivesRemoved = 0; + + // Process endpoints sequentially to avoid overwhelming the server + for (const endpoint of endpoints) { + const tags = updatedDetectedTags[endpoint]; + const fullUrl = `${baseUrl}${endpoint}`; + + // Check if any of the issues are related to missing tags + const hasMissingIssues = ['title', 'description', 'h1'].some( + (tagName) => tags[tagName]?.issue?.includes('Missing'), + ); + + if (hasMissingIssues) { + // Validate via SSR + // eslint-disable-next-line no-await-in-loop + const ssrResult = await validateMetaTagsViaSSR(fullUrl, log); + + if (ssrResult) { + // Check each tag type and remove false positives + const tagNames = ['title', 'description', 'h1']; + for (const tagName of tagNames) { + if (tags[tagName]?.issue?.includes('Missing') && ssrResult[tagName]) { + log.info(`False positive detected for ${tagName} on ${endpoint} - tag exists in SSR`); + delete updatedDetectedTags[endpoint][tagName]; + falsePositivesRemoved += 1; + } + } + + // If all issues were false positives, remove the endpoint entirely + if (Object.keys(updatedDetectedTags[endpoint]).length === 0) { + delete updatedDetectedTags[endpoint]; + } + + // Add a small delay to avoid rate limiting + // eslint-disable-next-line 
no-await-in-loop + await new Promise((resolve) => { + setTimeout(resolve, 100); + }); + } + } + } + log.info(`SSR validation complete. Removed ${falsePositivesRemoved} false positives from ${endpoints.length} endpoints`); + return updatedDetectedTags; +} diff --git a/test/audits/metatags.test.js b/test/metatags/metatags.test.js similarity index 98% rename from test/audits/metatags.test.js rename to test/metatags/metatags.test.js index 443ecfa1b..30150ce80 100644 --- a/test/audits/metatags.test.js +++ b/test/metatags/metatags.test.js @@ -311,7 +311,7 @@ describe('Meta Tags', () => { siteId: 'site-id', type: 'meta-tags', allowCache: false, - maxScrapeAge: 0.001, + maxScrapeAge: 0, options: { waitTimeoutForMetaTags: 5000, }, @@ -343,7 +343,7 @@ describe('Meta Tags', () => { siteId: 'site-id', type: 'meta-tags', allowCache: false, - maxScrapeAge: 0.001, + maxScrapeAge: 0, options: { waitTimeoutForMetaTags: 5000, }, @@ -1136,6 +1136,8 @@ describe('Meta Tags', () => { .resolves('mockedDomainKey'); const mockCalculateCPCValue = sinon.stub() .resolves(5000); + const mockValidateDetectedIssues = sinon.stub() + .callsFake(async (detectedTags) => detectedTags); const auditStub = await esmock('../../src/metatags/handler.js', { '../../src/support/utils.js': { getRUMDomainkey: mockGetRUMDomainkey, @@ -1158,6 +1160,9 @@ describe('Meta Tags', () => { }, }, }), + '../../src/metatags/ssr-meta-validator.js': { + validateDetectedIssues: mockValidateDetectedIssues, + }, }); const result = await auditStub.runAuditAndGenerateSuggestions(context); @@ -1174,6 +1179,8 @@ describe('Meta Tags', () => { .resolves('mockedDomainKey'); const mockCalculateCPCValue = sinon.stub() .resolves(2); + const mockValidateDetectedIssues = sinon.stub() + .callsFake(async (detectedTags) => detectedTags); const auditStub = await esmock('../../src/metatags/handler.js', { '../../src/support/utils.js': { getRUMDomainkey: mockGetRUMDomainkey, @@ -1183,6 +1190,9 @@ describe('Meta Tags', () => { '../../src/common/index.js': { wwwUrlResolver: (siteObj) => siteObj.getBaseURL() }, '../../src/metatags/metatags-auto-suggest.js': sinon.stub() .resolves({}), + '../../src/metatags/ssr-meta-validator.js': { + validateDetectedIssues: mockValidateDetectedIssues, + }, }); // Override all S3 responses to have null tags @@ -1223,6 +1233,8 @@ describe('Meta Tags', () => { .resolves('mockedDomainKey'); const mockCalculateCPCValue = sinon.stub() .resolves(2); + const mockValidateDetectedIssues = sinon.stub() + .callsFake(async (detectedTags) => detectedTags); const auditStub = await esmock('../../src/metatags/handler.js', { '../../src/support/utils.js': { getRUMDomainkey: @@ -1233,6 +1245,9 @@ describe('Meta Tags', () => { '../../src/common/index.js': { wwwUrlResolver: (siteObj) => siteObj.getBaseURL() }, '../../src/metatags/metatags-auto-suggest.js': sinon.stub() .resolves({}), + '../../src/metatags/ssr-meta-validator.js': { + validateDetectedIssues: mockValidateDetectedIssues, + }, }); // Override RUM API response to simulate error RUMAPIClientStub.createFrom() @@ -1255,6 +1270,8 @@ describe('Meta Tags', () => { it('should submit top pages for scraping when getIncludedURLs returns null', async () => { const mockGetRUMDomainkey = sinon.stub().resolves('mockedDomainKey'); const mockCalculateCPCValue = sinon.stub().resolves(2); + const mockValidateDetectedIssues = sinon.stub() + .callsFake(async (detectedTags) => detectedTags); const getConfigStub = sinon.stub().returns({ getIncludedURLs: sinon.stub().returns(null), }); @@ -1268,6 +1285,9 @@ describe('Meta 
Tags', () => { '@adobe/spacecat-shared-rum-api-client': RUMAPIClientStub, '../../src/common/index.js': { wwwUrlResolver: (siteObj) => siteObj.getBaseURL() }, '../../src/metatags/metatags-auto-suggest.js': sinon.stub().resolves({}), + '../../src/metatags/ssr-meta-validator.js': { + validateDetectedIssues: mockValidateDetectedIssues, + }, }); const result = await auditStub.runAuditAndGenerateSuggestions(context); expect(result).to.deep.equal({ status: 'complete' }); diff --git a/test/metatags/ssr-validator.test.js b/test/metatags/ssr-validator.test.js new file mode 100644 index 000000000..1bb5d6883 --- /dev/null +++ b/test/metatags/ssr-validator.test.js @@ -0,0 +1,358 @@ +/* + * Copyright 2024 Adobe. All rights reserved. + * This file is licensed to you under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. You may obtain a copy + * of the License at http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under + * the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR REPRESENTATIONS + * OF ANY KIND, either express or implied. See the License for the specific language + * governing permissions and limitations under the License. + */ + +/* eslint-env mocha */ + +import { expect } from 'chai'; +import sinon from 'sinon'; +import esmock from 'esmock'; + +describe('SSR Validator', () => { + let validateMetaTagsViaSSR; + let validateDetectedIssues; + let fetchStub; + let log; + + beforeEach(async () => { + fetchStub = sinon.stub(); + log = { + info: sinon.stub(), + debug: sinon.stub(), + warn: sinon.stub(), + error: sinon.stub(), + }; + + const ssrValidator = await esmock('../../src/metatags/ssr-meta-validator.js', { + '@adobe/fetch': { + context: () => ({ fetch: fetchStub }), + }, + }); + + validateMetaTagsViaSSR = ssrValidator.validateMetaTagsViaSSR; + validateDetectedIssues = ssrValidator.validateDetectedIssues; + }); + + afterEach(() => { + sinon.restore(); + }); + + describe('validateMetaTagsViaSSR', () => { + it('should successfully extract meta tags from HTML', async () => { + const html = ` + + + + Test Page Title + + + +

+            <h1>Main Heading</h1>
+            <h1>Secondary Heading</h1>
+ + + `; + + fetchStub.resolves({ + ok: true, + status: 200, + text: async () => html, + }); + + const result = await validateMetaTagsViaSSR('https://example.com/page', log); + + expect(result).to.deep.equal({ + title: 'Test Page Title', + description: 'Test page description', + h1: ['Main Heading', 'Secondary Heading'], + }); + expect(log.debug.calledWith('Validating meta tags via SSR for: https://example.com/page')).to.be.true; + }); + + it('should return null for missing title', async () => { + const html = ` + + + + + + +

+            <h1>Main Heading</h1>
+ + + `; + + fetchStub.resolves({ + ok: true, + status: 200, + text: async () => html, + }); + + const result = await validateMetaTagsViaSSR('https://example.com/page', log); + + expect(result.title).to.be.null; + expect(result.description).to.equal('Test page description'); + expect(result.h1).to.deep.equal(['Main Heading']); + }); + + it('should return null for missing description', async () => { + const html = ` + + + + Test Page Title + + +

+            <h1>Main Heading</h1>
+ + + `; + + fetchStub.resolves({ + ok: true, + status: 200, + text: async () => html, + }); + + const result = await validateMetaTagsViaSSR('https://example.com/page', log); + + expect(result.title).to.equal('Test Page Title'); + expect(result.description).to.be.null; + expect(result.h1).to.deep.equal(['Main Heading']); + }); + + it('should return null for missing h1 tags', async () => { + const html = ` + + + + Test Page Title + + + +

+            <h2>Not an H1</h2>
+ + + `; + + fetchStub.resolves({ + ok: true, + status: 200, + text: async () => html, + }); + + const result = await validateMetaTagsViaSSR('https://example.com/page', log); + + expect(result.title).to.equal('Test Page Title'); + expect(result.description).to.equal('Test page description'); + expect(result.h1).to.be.null; + }); + + it('should return null when fetch fails', async () => { + fetchStub.resolves({ + ok: false, + status: 404, + }); + + const result = await validateMetaTagsViaSSR('https://example.com/notfound', log); + + expect(result).to.be.null; + expect(log.warn.calledWith('SSR validation failed with status 404 for https://example.com/notfound')).to.be.true; + }); + + it('should return null when fetch throws an error', async () => { + fetchStub.rejects(new Error('Network error')); + + const result = await validateMetaTagsViaSSR('https://example.com/error', log); + + expect(result).to.be.null; + expect(log.warn.calledWith(sinon.match('Error during SSR validation'))).to.be.true; + }); + + it('should handle empty/whitespace-only tags', async () => { + const html = ` + + + + + + + +

+ + + `; + + fetchStub.resolves({ + ok: true, + status: 200, + text: async () => html, + }); + + const result = await validateMetaTagsViaSSR('https://example.com/empty', log); + + expect(result.title).to.be.null; + expect(result.description).to.be.null; + expect(result.h1).to.be.null; + }); + }); + + describe('validateDetectedIssues', () => { + beforeEach(() => { + // Mock the internal validateMetaTagsViaSSR calls + sinon.stub(Date, 'now').returns(1000); + }); + + it('should remove false positives for missing tags', async () => { + const detectedTags = { + '/page1': { + title: { issue: 'Missing title', tagContent: '' }, + description: { issue: 'Missing description', tagContent: '' }, + }, + '/page2': { + h1: { issue: 'Missing h1', tagContent: '' }, + }, + }; + + const html1 = ` + + + Actual Title + + + + `; + + const html2 = ` + + +

+            <h1>Actual H1</h1>
+ + + `; + + fetchStub.onFirstCall().resolves({ + ok: true, + status: 200, + text: async () => html1, + }); + + fetchStub.onSecondCall().resolves({ + ok: true, + status: 200, + text: async () => html2, + }); + + const result = await validateDetectedIssues(detectedTags, 'https://example.com', log); + + expect(result).to.deep.equal({}); + expect(log.info.calledWith(sinon.match('False positive detected'))).to.be.true; + }); + + it('should keep legitimate issues', async () => { + const detectedTags = { + '/page1': { + title: { issue: 'Missing title', tagContent: '' }, + description: { issue: 'Too short description', tagContent: 'Short' }, + }, + }; + + const html1 = ` + + + + + + `; + + fetchStub.resolves({ + ok: true, + status: 200, + text: async () => html1, + }); + + const result = await validateDetectedIssues(detectedTags, 'https://example.com', log); + + // Title is still missing (not in SSR), description issue is not about missing + expect(result['/page1'].title).to.exist; + expect(result['/page1'].description).to.exist; + }); + + it('should skip endpoints without missing issues', async () => { + const detectedTags = { + '/page1': { + title: { issue: 'Too long title', tagContent: 'Very long title' }, + description: { issue: 'Duplicate description', tagContent: 'Duplicate' }, + }, + }; + + const result = await validateDetectedIssues(detectedTags, 'https://example.com', log); + + expect(result).to.deep.equal(detectedTags); + expect(fetchStub.called).to.be.false; + }); + + it('should handle validation errors gracefully', async () => { + const detectedTags = { + '/page1': { + title: { issue: 'Missing title', tagContent: '' }, + }, + }; + + fetchStub.rejects(new Error('Network error')); + + const result = await validateDetectedIssues(detectedTags, 'https://example.com', log); + + // Should keep the issue if validation fails + expect(result['/page1'].title).to.exist; + expect(log.warn.called).to.be.true; + }); + + it('should return unchanged when no detected tags', async () => { + const detectedTags = {}; + + const result = await validateDetectedIssues(detectedTags, 'https://example.com', log); + + expect(result).to.deep.equal({}); + expect(fetchStub.called).to.be.false; + }); + + it('should partially remove false positives', async () => { + const detectedTags = { + '/page1': { + title: { issue: 'Missing title', tagContent: '' }, + description: { issue: 'Missing description', tagContent: '' }, + }, + }; + + // SSR only has title, not description + const html = ` + + + Actual Title + + + `; + + fetchStub.resolves({ + ok: true, + status: 200, + text: async () => html, + }); + + const result = await validateDetectedIssues(detectedTags, 'https://example.com', log); + + // Title should be removed (false positive), description should remain + expect(result['/page1'].title).to.be.undefined; + expect(result['/page1'].description).to.exist; + }); + }); +}); From 2d6c8b77baf0dfa50cc93af122265e8a4cf1216d Mon Sep 17 00:00:00 2001 From: Divyansh Pratap Date: Tue, 14 Oct 2025 00:06:10 +0530 Subject: [PATCH 07/12] fix: update processing type --- src/metatags/handler.js | 2 +- test/metatags/metatags.test.js | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/metatags/handler.js b/src/metatags/handler.js index 8c60d297e..08ef23155 100644 --- a/src/metatags/handler.js +++ b/src/metatags/handler.js @@ -329,7 +329,7 @@ export async function submitForScraping(context) { return { urls: finalUrls.map((url) => ({ url })), siteId: site.getId(), - type: 'meta-tags', + type: 'default', allowCache: 
false, maxScrapeAge: 0, options: { diff --git a/test/metatags/metatags.test.js b/test/metatags/metatags.test.js index 30150ce80..4f7f82af2 100644 --- a/test/metatags/metatags.test.js +++ b/test/metatags/metatags.test.js @@ -309,7 +309,7 @@ describe('Meta Tags', () => { { url: 'http://example.com/page2' }, ], siteId: 'site-id', - type: 'meta-tags', + type: 'default', allowCache: false, maxScrapeAge: 0, options: { @@ -341,7 +341,7 @@ describe('Meta Tags', () => { { url: 'http://example.com/page2' }, ], siteId: 'site-id', - type: 'meta-tags', + type: 'default', allowCache: false, maxScrapeAge: 0, options: { From a339e778b78adefb8e243ac38a1b5736d4b101c0 Mon Sep 17 00:00:00 2001 From: Divyansh Pratap Date: Tue, 14 Oct 2025 01:55:35 +0530 Subject: [PATCH 08/12] fix: refactor --- src/metatags/handler.js | 1 - src/metatags/ssr-meta-validator.js | 4 +--- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/src/metatags/handler.js b/src/metatags/handler.js index 08ef23155..e726ebfe9 100644 --- a/src/metatags/handler.js +++ b/src/metatags/handler.js @@ -324,7 +324,6 @@ export async function submitForScraping(context) { if (finalUrls.length === 0) { throw new Error('No URLs found for site neither top pages nor included URLs'); } - log.info(`Submitting for scraping ${finalUrls.length} with allowCache: false`); return { urls: finalUrls.map((url) => ({ url })), diff --git a/src/metatags/ssr-meta-validator.js b/src/metatags/ssr-meta-validator.js index 608ce5c22..11f18bb07 100644 --- a/src/metatags/ssr-meta-validator.js +++ b/src/metatags/ssr-meta-validator.js @@ -82,9 +82,7 @@ export async function validateDetectedIssues(detectedTags, baseUrl, log) { if (endpoints.length === 0) { return detectedTags; } - - log.info(`Validating ${endpoints.length} endpoints with detected issues via SSR`); - + log.debug(`Validating ${endpoints.length} endpoints with detected issues via SSR`); const updatedDetectedTags = { ...detectedTags }; let falsePositivesRemoved = 0; From 7d8c709ccf51472ede73bcd44a36def0427f3fbb Mon Sep 17 00:00:00 2001 From: Divyansh Pratap Date: Wed, 22 Oct 2025 12:11:21 +0530 Subject: [PATCH 09/12] trigger build From d64860dc25bcd1e619206a755254bf9953de731a Mon Sep 17 00:00:00 2001 From: Divyansh Pratap Date: Wed, 22 Oct 2025 12:29:13 +0530 Subject: [PATCH 10/12] fix: local deploy --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index 19f765f6b..030b6f4d5 100755 --- a/package.json +++ b/package.json @@ -20,7 +20,7 @@ "build": "hedy -v --test-bundle", "deploy": "hedy -v --deploy --aws-deploy-bucket=spacecat-prod-deploy --pkgVersion=latest", "deploy-stage": "hedy -v --deploy --aws-deploy-bucket=spacecat-stage-deploy --pkgVersion=latest", - "deploy-dev": "hedy -v --deploy --pkgVersion=ci$CI_BUILD_NUM -l latest --aws-deploy-bucket=spacecat-dev-deploy --cleanup-ci=24h", + "deploy-dev": "hedy -v --deploy --pkgVersion=ci$CI_BUILD_NUM -l dipratap --aws-deploy-bucket=spacecat-dev-deploy --cleanup-ci=24h", "deploy-secrets": "hedy --aws-update-secrets --params-file=secrets/secrets.env", "prepare": "husky", "local-build": "sam build", From eda0e1d793bf61385fc84be71e5e32eda6373ac0 Mon Sep 17 00:00:00 2001 From: Divyansh Pratap Date: Wed, 22 Oct 2025 12:36:50 +0530 Subject: [PATCH 11/12] fix: merge main --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index 030b6f4d5..daab966b7 100755 --- a/package.json +++ b/package.json @@ -20,7 +20,7 @@ "build": "hedy -v --test-bundle", "deploy": 
"hedy -v --deploy --aws-deploy-bucket=spacecat-prod-deploy --pkgVersion=latest", "deploy-stage": "hedy -v --deploy --aws-deploy-bucket=spacecat-stage-deploy --pkgVersion=latest", - "deploy-dev": "hedy -v --deploy --pkgVersion=ci$CI_BUILD_NUM -l dipratap --aws-deploy-bucket=spacecat-dev-deploy --cleanup-ci=24h", + "deploy-dev": "hedy -v --deploy --pkgVersion=latest$CI_BUILD_NUM -l dipratap --aws-deploy-bucket=spacecat-dev-deploy --cleanup-ci=24h --aws-api vldld6qz1d", "deploy-secrets": "hedy --aws-update-secrets --params-file=secrets/secrets.env", "prepare": "husky", "local-build": "sam build", From fcfe14b4f352fc85d3463feca5c3e264dcac56b9 Mon Sep 17 00:00:00 2001 From: Divyansh Pratap Date: Thu, 23 Oct 2025 10:36:41 +0530 Subject: [PATCH 12/12] fix: remove tmp logs --- package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/package.json b/package.json index daab966b7..19f765f6b 100755 --- a/package.json +++ b/package.json @@ -20,7 +20,7 @@ "build": "hedy -v --test-bundle", "deploy": "hedy -v --deploy --aws-deploy-bucket=spacecat-prod-deploy --pkgVersion=latest", "deploy-stage": "hedy -v --deploy --aws-deploy-bucket=spacecat-stage-deploy --pkgVersion=latest", - "deploy-dev": "hedy -v --deploy --pkgVersion=latest$CI_BUILD_NUM -l dipratap --aws-deploy-bucket=spacecat-dev-deploy --cleanup-ci=24h --aws-api vldld6qz1d", + "deploy-dev": "hedy -v --deploy --pkgVersion=ci$CI_BUILD_NUM -l latest --aws-deploy-bucket=spacecat-dev-deploy --cleanup-ci=24h", "deploy-secrets": "hedy --aws-update-secrets --params-file=secrets/secrets.env", "prepare": "husky", "local-build": "sam build",