From 2fee68615a36bf8fd20cf00e72fb8d225d143204 Mon Sep 17 00:00:00 2001 From: Dong Nguyen Date: Sat, 19 Oct 2024 23:26:57 +0700 Subject: [PATCH] v8.0.14 - Fix inconsistent output (#407) - Modify some stuff at LdJson extraction (#405) - Only use value from LdJson if missed from meta tags - Only accept string value from LdJson - Stop converting LdJson value to lowercase --- package.json | 2 +- src/utils/extractLdSchema.js | 17 ++++++++--------- src/utils/extractMetaData.js | 8 ++++---- src/utils/findDate.js | 4 ++-- 4 files changed, 15 insertions(+), 16 deletions(-) diff --git a/package.json b/package.json index af8403d..063157b 100644 --- a/package.json +++ b/package.json @@ -1,5 +1,5 @@ { - "version": "8.0.13", + "version": "8.0.14", "name": "@extractus/article-extractor", "description": "To extract main article from given URL", "homepage": "https://github.com/extractus/article-extractor", diff --git a/src/utils/extractLdSchema.js b/src/utils/extractLdSchema.js index 6077331..dc42394 100644 --- a/src/utils/extractLdSchema.js +++ b/src/utils/extractLdSchema.js @@ -1,3 +1,7 @@ +// utils -> extractLdSchema.js + +import { isArray, isObject, isString } from 'bellajs' + const typeSchemas = [ 'aboutpage', 'checkoutpage', @@ -53,25 +57,20 @@ const parseJson = (text) => { */ export default (document, entry) => { const ldSchemas = document.querySelectorAll('script[type="application/ld+json"]') - ldSchemas.forEach(ldSchema => { const ldJson = parseJson(ldSchema.textContent.replace(/[\n\r\t]/g, '')) const isAllowedLdJsonType = typeSchemas.includes(ldJson['@type']?.toLowerCase()) if (ldJson && isAllowedLdJsonType) { Object.entries(attributeLists).forEach(([key, attr]) => { - const isEntryAlreadyPopulated = typeof entry[key] !== 'undefined' && entry[key] !== '' - - if (isEntryAlreadyPopulated || !ldJson[attr]) { + if (!entry[key] || !ldJson[attr]) { return } const keyValue = ldJson[attr] - if (keyValue) { - entry[key] = Array.isArray(keyValue) ? keyValue[0] : keyValue - if (typeof entry[key] === 'string') { - entry[key] = entry[key].toLowerCase().trim() - } + const val = isArray(keyValue) ? keyValue[0] : isObject(keyValue) ? keyValue?.name || '' : keyValue + if (isString(val)) { + entry[key] = val.trim() } }) } diff --git a/src/utils/extractMetaData.js b/src/utils/extractMetaData.js index 88c19c4..d3f1f53 100644 --- a/src/utils/extractMetaData.js +++ b/src/utils/extractMetaData.js @@ -143,11 +143,11 @@ export default (html) => { } }) - const entries = extractLdSchema(doc, entry) + const metadata = extractLdSchema(doc, entry) - if (!entries.published) { - entries.published = findDate(doc) + if (!metadata.published) { + metadata.published = findDate(doc) || '' } - return entries + return metadata } diff --git a/src/utils/findDate.js b/src/utils/findDate.js index fd617dd..3a666e0 100644 --- a/src/utils/findDate.js +++ b/src/utils/findDate.js @@ -38,7 +38,7 @@ export default function (doc) { const match = element.textContent.match(pattern) if (match) return convertDateFormat(match[0]) } - return null + return '' } const priorityElements = doc.querySelectorAll('time, [datetime], [itemprop~=datePublished], [itemprop~=dateCreated]') @@ -53,5 +53,5 @@ export default function (doc) { if (date) return date } - return null + return '' }