From 2fe4d725c42e9443144b8487bcfb6afada96251e Mon Sep 17 00:00:00 2001
From: andremacola <andremacola@gmail.com>
Date: Mon, 4 Dec 2023 19:48:10 -0300
Subject: [PATCH 1/3] Feat: extract pagetype from og:type or ld+json

---
 index.d.ts                             |  1 +
 src/utils/extractLdSchema.js           | 71 ++++++++++++++++++++++++
 src/utils/extractMetaData.js           | 76 ++++++++++++++++----------
 src/utils/extractMetaData.test.js      | 11 +++-
 src/utils/parseFromHtml.js             |  2 +
 test-data/regular-article-json-ld.html | 65 ++++++++++++++++++++++
 6 files changed, 196 insertions(+), 30 deletions(-)
 create mode 100644 src/utils/extractLdSchema.js
 create mode 100644 test-data/regular-article-json-ld.html

diff --git a/index.d.ts b/index.d.ts
index c0043fb5..3a495780 100644
--- a/index.d.ts
+++ b/index.d.ts
@@ -82,6 +82,7 @@ export interface ArticleData {
   source?: string;
   published?: string;
   ttr?: number;
+  type?: string;
 }
 
 export function extract(input: string, parserOptions?: ParserOptions, fetchOptions?: FetchOptions): Promise<ArticleData|null>;
diff --git a/src/utils/extractLdSchema.js b/src/utils/extractLdSchema.js
new file mode 100644
index 00000000..bdd7e27f
--- /dev/null
+++ b/src/utils/extractLdSchema.js
@@ -0,0 +1,71 @@
+const typeSchemas = [ // lower-case JSON-LD @type names accepted as `type`; candidates are lower-cased before lookup
+  'aboutpage',
+  'checkoutpage',
+  'collectionpage',
+  'contactpage',
+  'faqpage',
+  'itempage',
+  'medicalwebpage',
+  'profilepage',
+  'qapage',
+  'realestatelisting',
+  'searchresultspage',
+  'webpage',
+  'website',
+  'article',
+  'advertisercontentarticle',
+  'newsarticle',
+  'analysisnewsarticle',
+  'askpublicnewsarticle',
+  'backgroundnewsarticle',
+  'opinionnewsarticle',
+  'reportagenewsarticle',
+  'reviewnewsarticle',
+  'report',
+  'satiricalarticle',
+  'scholarlyarticle',
+  'medicalscholarlyarticle',
+]
+
+const attributeLists = { // entry field name -> JSON-LD property it is filled from
+  description: 'description',
+  image: 'image',
+  author: 'author',
+  published: 'datePublished',
+  type: '@type',
+}
+
+/**
+ * Fills still-empty fields of `entry` from the first JSON-LD script in `document`.
+ * Malformed or non-object JSON-LD is ignored; values keep their original case
+ * (URLs and names are case-sensitive); only `type` is lower-cased and whitelisted.
+ *
+ * @param {Document} document - The HTML Document
+ * @param {Object} entry - The entry object to merge/populate with JSON-LD.
+ * @returns {Object} The same entry object, populated in place.
+ */
+export default (document, entry) => {
+  const ldSchema = document.querySelector('script[type="application/ld+json"]')?.textContent
+  if (!ldSchema) {
+    return entry
+  }
+
+  let ldJson
+  try {
+    ldJson = JSON.parse(ldSchema)
+  } catch {
+    return entry // a broken JSON-LD block must not abort the whole extraction
+  }
+  if (ldJson === null || typeof ldJson !== 'object') return entry
+
+  for (const [key, attr] of Object.entries(attributeLists)) {
+    const value = Array.isArray(ldJson[attr]) ? ldJson[attr][0] : ldJson[attr]
+    const isEmpty = typeof entry[key] === 'undefined' || entry[key] === ''
+    if (!isEmpty || typeof value !== 'string' || !value) continue
+    // `type` is normalised and restricted to the known list, whether given as string or array
+    const type = value.toLowerCase()
+    entry[key] = key !== 'type' ? value : (typeSchemas.includes(type) ? type : '')
+  }
+
+  return entry
+}
diff --git a/src/utils/extractMetaData.js b/src/utils/extractMetaData.js
index 47be5e09..ffc4cd74 100644
--- a/src/utils/extractMetaData.js
+++ b/src/utils/extractMetaData.js
@@ -1,10 +1,35 @@
 // utils -> extractMetaData
 
 import { DOMParser } from 'linkedom'
+import extractLdSchema from './extractLdSchema.js'
+
+/**
+ * Resolves which metadata field, if any, a <meta> element provides.
+ *
+ * @param {Element} node - element whose content/property/itemprop/name attributes are read
+ * @param {Object} attributeLists - entry key mapped to the attribute values accepted for it
+ * @returns {?{key: string, content: string}} first key whose list matches, else null
+ */
+function getMetaContentByNameOrProperty (node, attributeLists) {
+  const content = node.getAttribute('content')
+  if (!content) {
+    return null
+  }
+
+  // attribute values are compared lower-cased; a null attribute yields undefined
+  const lowered = (attr) => node.getAttribute(attr)?.toLowerCase()
+  const property = lowered('property') ?? lowered('itemprop')
+  const name = lowered('name')
+
+  const matched = Object.entries(attributeLists).find(([, accepted]) => {
+    return accepted.includes(property) || accepted.includes(name)
+  })
+  return matched ? { key: matched[0], content } : null
+}
 
 /**
  * @param html {string}
- * @returns {{image: string, author: string, amphtml: string, description: string, canonical: string, source: string, published: string, title: string, url: string, shortlink: string, favicon: string}}
+ * @returns {{image: string, author: string, amphtml: string, description: string, canonical: string, source: string, published: string, title: string, url: string, shortlink: string, favicon: string, type: string}}
  */
 export default (html) => {
   const entry = {
@@ -19,6 +44,7 @@ export default (html) => {
     source: '',
     published: '',
     favicon: '',
+    type: '',
   }
 
   const sourceAttrs = [
@@ -80,6 +106,20 @@ export default (html) => {
     'date',
     'parsely-pub-date',
   ]
+  const typeAttrs = [
+    'og:type',
+  ]
+
+  const attributeLists = {
+    source: sourceAttrs,
+    url: urlAttrs,
+    title: titleAttrs,
+    description: descriptionAttrs,
+    image: imageAttrs,
+    author: authorAttrs,
+    published: publishedTimeAttrs,
+    type: typeAttrs,
+  }
 
   const document = new DOMParser().parseFromString(html, 'text/html')
   entry.title = document.querySelector('head > title')?.innerText
@@ -96,35 +136,13 @@ export default (html) => {
   })
 
   Array.from(document.getElementsByTagName('meta')).forEach(node => {
-    const content = node.getAttribute('content')
-    if (!content) {
-      return false
-    }
-    const property = node.getAttribute('property')?.toLowerCase() ?? node.getAttribute('itemprop')?.toLowerCase()
-    const name = node.getAttribute('name')?.toLowerCase()
-
-    if (sourceAttrs.includes(property) || sourceAttrs.includes(name)) {
-      entry.source = content
-    }
-    if (urlAttrs.includes(property) || urlAttrs.includes(name)) {
-      entry.url = content
-    }
-    if (titleAttrs.includes(property) || titleAttrs.includes(name)) {
-      entry.title = content
-    }
-    if (descriptionAttrs.includes(property) || descriptionAttrs.includes(name)) {
-      entry.description = content
-    }
-    if (imageAttrs.includes(property) || imageAttrs.includes(name)) {
-      entry.image = content
-    }
-    if (authorAttrs.includes(property) || authorAttrs.includes(name)) {
-      entry.author = content
-    }
-    if (publishedTimeAttrs.includes(property) || publishedTimeAttrs.includes(name)) {
-      entry.published = content
+    const result = getMetaContentByNameOrProperty(node, attributeLists)
+    if (result) {
+      entry[result.key] = result.content
     }
   })
 
-  return entry
+  const entries = extractLdSchema(document, entry)
+
+  return entries
 }
diff --git a/src/utils/extractMetaData.test.js b/src/utils/extractMetaData.test.js
index fc47307b..2a02ede0 100644
--- a/src/utils/extractMetaData.test.js
+++ b/src/utils/extractMetaData.test.js
@@ -7,7 +7,7 @@ import { isObject, hasProperty } from 'bellajs'
 
 import extractMetaData from './extractMetaData.js'
 
-const keys = 'url shortlink amphtml canonical title description image author source published favicon'.split(' ')
+const keys = 'url shortlink amphtml canonical title description image author source published favicon type'.split(' ')
 
 test('test extractMetaData(good content)', async () => {
   const html = readFileSync('./test-data/regular-article.html', 'utf8')
@@ -17,3 +17,12 @@ test('test extractMetaData(good content)', async () => {
     expect(hasProperty(result, k)).toBe(true)
   })
 })
+
+test('test extractMetaData(json ld schema content)', async () => {
+  const html = readFileSync('./test-data/regular-article-json-ld.html', 'utf8')
+  const result = extractMetaData(html)
+  expect(isObject(result)).toBe(true)
+  keys.forEach((k) => expect(hasProperty(result, k)).toBe(true))
+  // key presence alone proves nothing; the fixture has no og:type meta, so this must come from JSON-LD
+  expect(result.type).toBe('article')
+})
diff --git a/src/utils/parseFromHtml.js b/src/utils/parseFromHtml.js
index 0dbc8e74..71e13c74 100644
--- a/src/utils/parseFromHtml.js
+++ b/src/utils/parseFromHtml.js
@@ -45,6 +45,7 @@ export default async (inputHtml, inputUrl = '', parserOptions = {}) => {
     author,
     published,
     favicon: metaFav,
+    type,
   } = meta
 
   const {
@@ -127,5 +128,6 @@ export default async (inputHtml, inputUrl = '', parserOptions = {}) => {
     source: getDomain(bestUrl),
     published,
     ttr: getTimeToRead(textContent, wordsPerMinute),
+    type,
   }
 }
diff --git a/test-data/regular-article-json-ld.html b/test-data/regular-article-json-ld.html
new file mode 100644
index 00000000..24ac042a
--- /dev/null
+++ b/test-data/regular-article-json-ld.html
@@ -0,0 +1,65 @@
+<!doctype html>
+<html>
+  <head>
+    <meta charset="utf-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1">
+    <title>Article title here - ArticleParser</title>
+    <meta name="keywords" content="alpha, beta, gamma">
+    <meta name="twitter:site" content="@ArticleParser">
+    <meta name="twitter:url" content="https://somewhere.com/path/to/article-title-here">
+    <meta name="twitter:card" content="summary_large_image">
+    <meta name="twitter:creator" content="@alice">
+    <meta property="og:title" content="Article title here">
+    <meta property="og:url" content="https://somewhere.com/path/to/article-title-here">
+
+
+<script type="application/ld+json">
+    { "@context": "https://schema.org",
+      "@type": "Article",
+      "author": "Alice",
+      "image": [
+        "https://somewhere.com/path/to/image.jpg",
+        "https://somewhere.com/path/to/image2.jpg",
+        "https://somewhere.com/path/to/image3.jpg"
+      ],
+      "datePublished": "23\/01\/2014",
+      "dateCreated": "23\/01\/2014",
+      "description": "Navigation here Few can name a rational peach that isn't a conscientious goldfish! One cannot separate snakes from plucky pomegranates? Draped neatly on a hanger, the melons could be said to resemble knowledgeable pigs."
+    }
+  </script>
+
+    <link rel="stylesheet" href="/path/to/cssfile.css">
+    <link rel="canonical" href="https://somewhere.com/another/path/to/article-title-here">
+    <link rel="amphtml" href="https://m.somewhere.com/another/path/to/article-title-here.amp">
+    <link rel="shortlink" href="https://sw.re/419283">
+    <link rel="icon" href="https://somewhere.com/favicon.ico">
+
+    <link rel="alternate" title="ArticleParser" type="application/atom+xml" href="https://somewhere.com/atom.xml">
+
+    <link rel="manifest" href="/manifest.json">
+  </head>
+  <body>
+    <header>Page header here</header>
+    <main>
+      <section>
+        <nav>Navigation here</nav>
+      </section>
+      <section>
+        <h1>Article title here</h1>
+        <article>
+          <div class="contentdetail">Few can name a <a href="https://otherwhere.com/descriptions/rational-peach">rational peach</a> that isn't a conscientious goldfish! One cannot separate snakes from plucky pomegranates? Draped neatly on a hanger, the melons could be said to resemble knowledgeable pigs. Some posit the enchanting tiger to be less than confident. The literature would have us believe that an impartial turtle is not but a hippopotamus. Unfortunately, that is wrong; on the contrary, those cows are nothing more than pandas! The chicken is a shark; A turtle can hardly be considered a kind horse without also being a pomegranate. Zebras are witty persimmons.</div>
+          <p class="contentdetail">
+            Those cheetahs are nothing more than dogs. A <a href="/dict/watermelon">watermelon</a> is an exuberant kangaroo. An octopus is the tangerine of a grapes? The cherry is a shark. Recent controversy aside, they were lost without the cheerful plum that composed their fox. As far as we can estimate, one cannot separate camels from dynamic hamsters. Those tigers are nothing more than cows! A cow is a squirrel from the right perspective. Their banana was, in this moment, a helpful bear.</p>
+          <p>The first fair dog is, in its own way, a lemon.</p>
+          <address>4746 Kelly Drive, West Virginia</address>
+          <img src="./orange.png" style="border: solid 1px #000">
+        </article>
+      </section>
+      <section class="sidebar-widget">
+        <widget>Some widget here</widget>
+        <widget>Some widget here</widget>
+      </section>
+    </main>
+    <footer>Page footer here</footer>
+  </body>
+</html>

From 0fd6c66e4c987e44fa311a0d46895b9838906c7d Mon Sep 17 00:00:00 2001
From: Dong Nguyen <ndaidong@gmail.com>
Date: Wed, 6 Dec 2023 00:07:11 +0700
Subject: [PATCH 2/3] v8.0.4

- Merge pr #374 by @andremacola (issue #373)
- Update dependencies
- Update CI config
- Fix function call in eval.js
---
 .github/workflows/ci-test.yml         | 11 ++++-------
 .github/workflows/codeql-analysis.yml |  2 +-
 README.md                             |  1 +
 eval.js                               | 11 ++++++++---
 package.json                          | 14 +++++++-------
 5 files changed, 21 insertions(+), 18 deletions(-)

diff --git a/.github/workflows/ci-test.yml b/.github/workflows/ci-test.yml
index fb3996f9..470fbe52 100644
--- a/.github/workflows/ci-test.yml
+++ b/.github/workflows/ci-test.yml
@@ -12,13 +12,13 @@ jobs:
 
     strategy:
       matrix:
-        node_version: [18.x, 20.x]
+        node_version: [18.x, 20.x, 21.x]
 
     steps:
     - uses: actions/checkout@v3
 
     - name: setup Node.js v${{ matrix.node_version }}
-      uses: actions/setup-node@v3
+      uses: actions/setup-node@v4
       with:
         node-version: ${{ matrix.node_version }}
 
@@ -31,8 +31,8 @@ jobs:
         npm run build --if-present
         npm run test
 
-    - name: Coveralls GitHub Action
-      uses: coverallsapp/github-action@1.1.3
+    - name: Report Coveralls
+      uses: coverallsapp/github-action@v2
       with:
         github-token: ${{ secrets.GITHUB_TOKEN }}
 
@@ -43,6 +43,3 @@ jobs:
         key: ${{ runner.os }}-node-${{ hashFiles('**/package-lock.json') }}
         restore-keys: |
           ${{ runner.os }}-node-
-
-
-
diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml
index 2124bd64..a77d776a 100644
--- a/.github/workflows/codeql-analysis.yml
+++ b/.github/workflows/codeql-analysis.yml
@@ -38,7 +38,7 @@ jobs:
 
     steps:
     - name: Checkout repository
-      uses: actions/checkout@v3
+      uses: actions/checkout@v4
 
     # Initializes the CodeQL tools for scanning.
     - name: Initialize CodeQL
diff --git a/README.md b/README.md
index 618759d1..302906d1 100644
--- a/README.md
+++ b/README.md
@@ -105,6 +105,7 @@ The result - `article` - can be `null` or an object with the following structure
   favicon: String,
   content: String,
   published: Date String,
+  type: String, // page type
   source: String, // original publisher
   links: Array, // list of alternative links
   ttr: Number, // time to read in second, 0 = unknown
diff --git a/eval.js b/eval.js
index 829f9b63..e790b2d0 100644
--- a/eval.js
+++ b/eval.js
@@ -6,7 +6,7 @@ import { readFileSync, writeFileSync, existsSync } from 'node:fs'
 import { slugify } from 'bellajs'
 
 import { isValid as isValidUrl } from './src/utils/linker.js'
-import { extract } from './src/main.js'
+import { extractFromHtml } from './src/main.js'
 
 if (!existsSync('evaluation')) {
   execSync('mkdir evaluation')
@@ -15,7 +15,12 @@ if (!existsSync('evaluation')) {
 const extractFromUrl = async (url) => {
   try {
     console.time('extraction')
-    const art = await extract(url)
+    const res = await fetch(url)
+    const buffer = await res.arrayBuffer()
+    const decoder = new TextDecoder('iso-8859-1')
+    const html = decoder.decode(buffer)
+
+    const art = await extractFromHtml(html)
     console.log(art)
     const slug = slugify(art.title)
     writeFileSync(`evaluation/${slug}.html`, art.content, 'utf8')
@@ -28,7 +33,7 @@ const extractFromUrl = async (url) => {
 const extractFromFile = async (fpath) => {
   try {
     const html = readFileSync(fpath, 'utf8')
-    const art = await extract(html)
+    const art = await extractFromHtml(html)
     console.log(art)
   } catch (err) {
     console.trace(err)
diff --git a/package.json b/package.json
index f363969b..5398851c 100644
--- a/package.json
+++ b/package.json
@@ -1,5 +1,5 @@
 {
-  "version": "8.0.3",
+  "version": "8.0.4",
   "name": "@extractus/article-extractor",
   "description": "To extract main article from given URL",
   "homepage": "https://github.com/extractus/article-extractor",
@@ -33,15 +33,15 @@
     "@mozilla/readability": "^0.4.4",
     "bellajs": "^11.1.2",
     "cross-fetch": "^4.0.0",
-    "linkedom": "^0.15.1",
+    "linkedom": "^0.16.4",
     "sanitize-html": "2.11.0"
   },
   "devDependencies": {
-    "@types/sanitize-html": "^2.9.0",
-    "eslint": "^8.47.0",
-    "https-proxy-agent": "^7.0.1",
-    "jest": "^29.6.2",
-    "nock": "^13.3.2"
+    "@types/sanitize-html": "^2.9.5",
+    "eslint": "^8.55.0",
+    "https-proxy-agent": "^7.0.2",
+    "jest": "^29.7.0",
+    "nock": "^13.4.0"
   },
   "keywords": [
     "article",

From 2ec2573c82c53a03f7f6d0c90707e97106dcbb2a Mon Sep 17 00:00:00 2001
From: Dong Nguyen <ndaidong@gmail.com>
Date: Wed, 6 Dec 2023 00:14:48 +0700
Subject: [PATCH 3/3] Update examples

---
 examples/browser-article-parser/package.json | 2 +-
 examples/bun-article-parser/package.json     | 4 ++--
 examples/deno-article-parser/deno.json       | 4 ++--
 examples/tsnode-article-parser/package.json  | 3 ++-
 examples/tsnode-article-parser/tsconfig.json | 4 ++--
 5 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/examples/browser-article-parser/package.json b/examples/browser-article-parser/package.json
index 026a736e..33ba729c 100644
--- a/examples/browser-article-parser/package.json
+++ b/examples/browser-article-parser/package.json
@@ -7,6 +7,6 @@
   },
   "dependencies": {
     "express": "^4.18.2",
-    "got": "^13.0.0"
+    "got": "^14.0.0"
   }
 }
diff --git a/examples/bun-article-parser/package.json b/examples/bun-article-parser/package.json
index 769fcd65..58ef1e7c 100644
--- a/examples/bun-article-parser/package.json
+++ b/examples/bun-article-parser/package.json
@@ -5,10 +5,10 @@
     "start": "bun run index.ts"
   },
   "devDependencies": {
-    "bun-types": "^0.6.13"
+    "bun-types": "^1.0.15"
   },
   "dependencies": {
     "@extractus/article-extractor": "latest",
-    "hono": "^3.2.7"
+    "hono": "^3.11.2"
   }
 }
diff --git a/examples/deno-article-parser/deno.json b/examples/deno-article-parser/deno.json
index 1a1bdebf..b8a77bea 100644
--- a/examples/deno-article-parser/deno.json
+++ b/examples/deno-article-parser/deno.json
@@ -2,8 +2,8 @@
   "name": "deno-article-parser",
   "version": "1.0.0",
   "imports": {
-    "serve": "https://deno.land/std@0.203.0/http/server.ts",
-    "hono": "https://deno.land/x/hono@v3.7.2/mod.ts",
+    "serve": "https://deno.land/std/http/server.ts",
+    "hono": "https://deno.land/x/hono@v3.11.2/mod.ts",
     "article-extractor": "https://esm.sh/@extractus/article-extractor"
   },
   "tasks": {
diff --git a/examples/tsnode-article-parser/package.json b/examples/tsnode-article-parser/package.json
index fc9a9644..7f8f5569 100644
--- a/examples/tsnode-article-parser/package.json
+++ b/examples/tsnode-article-parser/package.json
@@ -2,12 +2,13 @@
   "name": "tsnode-article-parser",
   "version": "1.0.0",
   "main": "index.ts",
+  "type": "module",
   "scripts": {
     "prestart": "npx tsc",
     "start": "node dist/index.js"
   },
   "devDependencies": {
-    "typescript": "^5.1.6"
+    "typescript": "^5.3.2"
   },
   "dependencies": {
     "@extractus/article-extractor": "latest",
diff --git a/examples/tsnode-article-parser/tsconfig.json b/examples/tsnode-article-parser/tsconfig.json
index d58a717d..5e247d6c 100644
--- a/examples/tsnode-article-parser/tsconfig.json
+++ b/examples/tsnode-article-parser/tsconfig.json
@@ -1,8 +1,8 @@
 {
   "compilerOptions": {
-    "module": "commonjs",
+    "module": "es6",
     "esModuleInterop": true,
-    "target": "es6",
+    "target": "esnext",
     "moduleResolution": "node",
     "sourceMap": true,
     "outDir": "dist"