Skip to content

Commit 932232e

Browse files
authored
fix: refactor to use gdoc api (#257)
1 parent f920dc7 commit 932232e

File tree

10 files changed

+309
-24
lines changed

10 files changed

+309
-24
lines changed

.github/workflows/build.yml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,13 @@ jobs:
3131
- name: install
3232
run: |
3333
yarn
34+
- name: create-json secret
35+
uses: jsdaniell/[email protected]
36+
with:
37+
name: "key.json"
38+
json: ${{ secrets.GDOC_KEY }}
3439
- name: Generate db
3540
id: dbgen
36-
env:
37-
G_API_KEY: ${{ secrets.G_API_KEY }}
3841
run: |
3942
yarn gen
4043
yarn validate

.github/workflows/pr.yml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,11 @@ jobs:
2929
- name: lint
3030
run: |
3131
yarn lint
32+
- name: create-json secret
33+
uses: jsdaniell/[email protected]
34+
with:
35+
name: "key.json"
36+
json: ${{ secrets.GDOC_KEY }}
3237
- name: Test generation
33-
env:
34-
G_API_KEY: ${{ secrets.G_API_KEY }}
3538
run: |
3639
node src/generate.js test

.github/workflows/pr2.yml

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
name: Database Lint 2
2+
3+
on:
4+
pull_request:
5+
6+
jobs:
7+
lint:
8+
name: Lint
9+
runs-on: ubuntu-latest
10+
steps:
11+
- name: Checkout code
12+
uses: actions/checkout@v3
13+
- id: nodeversion
14+
run: echo "NODE=$(cat .nvmrc)" >> $GITHUB_OUTPUT
15+
- name: restore Cache
16+
uses: actions/cache@master
17+
with:
18+
path: node_modules
19+
key: ${{ runner.os }}-${{ hashFiles('**/yarn.lock') }}
20+
- name: Use Node.js ${{steps.nodeversion.outputs.NODE}}
21+
uses: actions/setup-node@v3
22+
with:
23+
node-version: ${{ steps.nodeversion.outputs.NODE }}
24+
- name: install
25+
run: |
26+
yarn
27+
- name: lint
28+
run: |
29+
yarn lint
30+
- name: create-json secret
31+
uses: jsdaniell/[email protected]
32+
with:
33+
name: "key.json"
34+
json: ${{ secrets.GDOC_KEY }}
35+
- name: Test generation
36+
run: |
37+
node src/generate.js test

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,4 @@ package-lock.json
44
SAVE_IMG/
55
catalog_old.json
66
dump
7+
key.json

package.json

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,13 +20,16 @@
2020
"standard": "^17.0.0"
2121
},
2222
"dependencies": {
23+
"@googleapis/docs": "^2.0.1",
2324
"@mixmaxhq/promise-pool": "^2.0.0",
2425
"ajv": "^6.12.6",
2526
"axios": "^1.4.0",
2627
"crc": "^4.3.2",
2728
"csv-stringify": "^6.3.3",
2829
"googleapis": "118",
2930
"he": "^1.2.0",
31+
"jsonpath": "^1.1.1",
32+
"lodash": "^4.17.21",
3033
"node-html-parser": "^6.1.5",
3134
"rimraf": "^5.0.1",
3235
"sharp": "^0.32.1",

src/generate.js

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ async function moduleScrap (catalog, moduleName, isTest = false) {
2525
const destFile = path.join(DEST, formattedName)
2626
if (moduleCatalog.hasError !== true) {
2727
catalog.push(moduleCatalog)
28-
fs.writeFileSync(destFile, JSON.stringify(moduleCatalog))
28+
fs.writeFileSync(destFile, JSON.stringify(moduleCatalog, null, ' '))
2929
} else {
3030
// using the previous version of the file
3131
console.warn(`ERRORS: ${formattedName}`)
@@ -49,7 +49,7 @@ async function jsonScrap (catalog, filename, isTest = false) {
4949
const destFile = path.join(DEST, formattedName)
5050
if (moduleCatalog.hasError !== true) {
5151
catalog.push(moduleCatalog)
52-
fs.writeFileSync(destFile, JSON.stringify(moduleCatalog))
52+
fs.writeFileSync(destFile, JSON.stringify(moduleCatalog, null, ' '))
5353
} else {
5454
// using the previous version of the file
5555
console.warn(`ERRORS: ${formattedName}`)
@@ -96,13 +96,16 @@ async function generate (isTest = false, targetCat = undefined) {
9696
const pool = new PromisePool({ numConcurrent: 2 })
9797
const customScraps = fs.readdirSync(customImporterPath)
9898
const jsonScraps = fs.readdirSync(jsonImporterPath)
99+
const total = customScraps.length + jsonScraps.length
100+
let idx = 1
99101
for (const s of customScraps) {
100102
if (targetCat && !path.join(customImporterPath, s).endsWith(targetCat)) {
101103
continue
102104
}
103105
await pool.start(
104106
async (cat, filename) => {
105107
await moduleScrap(cat, path.join(customImporterPath, filename), isTest)
108+
console.log(`${idx++}/${total}`)
106109
},
107110
catalog,
108111
s
@@ -115,6 +118,7 @@ async function generate (isTest = false, targetCat = undefined) {
115118
await pool.start(
116119
async (cat, filename) => {
117120
await jsonScrap(cat, path.join(jsonImporterPath, filename), isTest)
121+
console.log(`${idx++}/${total}`)
118122
},
119123
catalog,
120124
s

src/google/index.js

Lines changed: 141 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,141 @@
1+
const docs = require('@googleapis/docs')
2+
const path = require('path')
3+
const jp = require('jsonpath')
4+
const { genId, getAttributes, attributes, getNationality, isSelfOrdered } = require('../utils')
5+
const _ = require('lodash')
6+
7+
// Build an authenticated Google client from the service-account key file.
// The key is read from `key.json`, two directories above this module, and the
// client is requested with the Google Docs `documents` scope.
// Resolves with whatever `GoogleAuth#getClient()` yields.
async function getCredentials () {
  const keyFilename = path.join(__dirname, '..', '..', 'key.json')
  const scopes = ['https://www.googleapis.com/auth/documents']
  const googleAuth = new docs.auth.GoogleAuth({ keyFilename, scopes })
  return googleAuth.getClient()
}
15+
16+
// Fetch the JSON representation of a Google Doc through the Docs v1 API.
//   id:          identifier of the document, sent as `documentId`
//   credentials: auth client handed to the API binding as `auth`
// Resolves with the response of `documents.get`.
async function downloadJsonDoc (id, credentials) {
  const request = { documentId: id }
  const { documents } = docs.docs({ version: 'v1', auth: credentials })
  return documents.get(request)
}
21+
22+
// Resolve the image URI of an inline object of the document.
//   kixId:   inline object id (key of `jsonDoc.data.inlineObjects`)
//   jsonDoc: response object whose `data` holds the document JSON
// Returns `imageProperties.contentUri` of the matching embedded object.
// Throws an Error naming the kix id when the inline object is unknown or does
// not carry image properties, instead of failing with an anonymous
// "Cannot read properties of undefined" TypeError.
function getImgUrl (kixId, jsonDoc) {
  const inlineObjects = jsonDoc.data.inlineObjects || {}
  const inlineObjectNode = inlineObjects[kixId]
  const properties = inlineObjectNode && inlineObjectNode.inlineObjectProperties
  const embeddedObject = properties && properties.embeddedObject
  const imageProperties = embeddedObject && embeddedObject.imageProperties
  if (!imageProperties) {
    throw new Error(`No image properties found for inline object "${kixId}"`)
  }
  return imageProperties.contentUri
}
27+
28+
// Split the text of a colorway cell into its display name and the optional
// parenthesised markers it carries. Recognised markers (case-insensitive):
//   (<attributes.cover>)       -> isCover
//   (<letters/spaces><4 digits>) -> releaseDate, e.g. "(Jan 2020)"
//   (count <n>)                -> totalCount (kept as the matched digit string)
//   (*)                        -> commissioned
//   (giveaway) / (give-away)   -> giveaway
// A marker that is absent leaves its field undefined (isCover is false).
function parseColorwayLabel (rawLabel) {
  let label = rawLabel

  const reCover = new RegExp(`\\(${attributes.cover}\\)`, 'gim')
  const isCover = reCover.test(label)
  if (isCover) { label = label.replace(reCover, '') }

  // These regexes carry the `g` flag, so they are created per call: a shared
  // instance would keep its `lastIndex` state from one cell to the next.
  const regDate = /\(([a-zA-Z ]*\d{4})\)/gim
  const regCount = /\(count (\d+)\)/gim
  const regComm = /\(\*\)/gim
  const regGiveaway = /\(giveaway\)|\(give-away\)/gim

  // every marker is looked up before any of them is stripped from the label
  const dateMatch = regDate.exec(label)
  const countMatch = regCount.exec(label)
  const commMatch = regComm.exec(label)
  const giveawayMatch = regGiveaway.exec(label)

  if (dateMatch) { label = label.replace(regDate, '') }
  if (countMatch) { label = label.replace(regCount, '') }
  if (commMatch) { label = label.replace(regComm, '') }
  if (giveawayMatch) { label = label.replace(regGiveaway, '') }

  return {
    name: label.trim(),
    isCover,
    releaseDate: dateMatch ? dateMatch[1] : undefined,
    totalCount: countMatch ? countMatch[1] : undefined,
    commissioned: commMatch ? true : undefined,
    giveaway: giveawayMatch ? true : undefined
  }
}

// Parse the JSON document and hydrate the catalog object.
//   catalog: object to fill; `name` is read and `sculpts` must be an array.
//            `selfOrder` and `nationality` are reset, then derived from the
//            text runs of the document.
//   jsonDoc: response object whose `data` holds the document JSON
// Tables are consumed in pairs: an even-indexed table holds a sculpt title,
// the odd-indexed table that follows holds the colorways of that sculpt.
// Parsing stops at the first title table that has no text. Sculpts left
// without any colorway are dropped. Returns the same (mutated) catalog.
function gDocParse (catalog, jsonDoc) {
  catalog.selfOrder = false
  catalog.nationality = undefined

  jp.query(jsonDoc.data.body.content, '$..textRun').forEach(x => {
    const nat = getNationality(x.content)
    if (nat) { catalog.nationality = nat }
    // The flag initialised above is `selfOrder`; the previous code wrote to
    // `isSelfOrdered`, which left `selfOrder` permanently false.
    if (isSelfOrdered(x.content)) { catalog.selfOrder = true }
  })

  const tables = jp.query(jsonDoc, '$.data.body.content..table')
  for (let i = 0; i < tables.length; i++) {
    const isColorwayTable = i % 2 === 1

    if (!isColorwayTable) {
      // sculpt title table: the first text run is the sculpt name, the whole
      // text is scanned for attributes
      const contentSculpt = jp.query(tables[i], '$..textRun..content')
      if (contentSculpt.length === 0) { break }
      const sculptName = contentSculpt[0].trim()
      const sculptAttributes = getAttributes(contentSculpt.join(' '))
      catalog.sculpts.push({
        id: genId(`${catalog.name}-${sculptName}`),
        name: sculptName,
        releaseDate: undefined,
        colorways: [],
        ...sculptAttributes
      })
      continue
    }

    // colorway table: every cell holding an inline image is one colorway of
    // the sculpt declared by the preceding title table
    const currentSculptObj = catalog.sculpts[catalog.sculpts.length - 1]
    const cells = _.flatten(jp.query(tables[i], '$..tableCells'))
    for (const cell of cells) {
      const imgId = jp.query(cell, '$..inlineObjectId')
      if (!imgId.length) { continue }

      // `$..content` also yields the structural `content` arrays; only the
      // string values (text of the runs) are part of the label
      const cellString = _.flatten(jp.query(cell, '$..content'))
        .filter(x => typeof x === 'string')
        .map(x => x.trim())
        .filter(Boolean)
        .join(' ')
      const label = parseColorwayLabel(cellString)
      const imgKixId = imgId[0]

      currentSculptObj.colorways.push({
        id: genId(`${catalog.name}-${currentSculptObj.name}-${label.name}-${imgKixId}`),
        img: getImgUrl(imgKixId, jsonDoc),
        name: label.name,
        isCover: label.isCover,
        releaseDate: label.releaseDate,
        totalCount: label.totalCount,
        commissioned: label.commissioned,
        giveaway: label.giveaway,
        note: ''
      })
    }
  }

  catalog.sculpts = catalog.sculpts.filter(x => x.colorways.length)
  return catalog
}
136+
137+
// public surface of the google helper module
module.exports = { getCredentials, downloadJsonDoc, gDocParse }

src/scraper/gdoc.js

Lines changed: 5 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
const htmlparser = require('node-html-parser')
2-
const { downloadFile, genId, gDriveParse, gDocUrl, isSelfOrdered, getNationality } = require('../utils')
1+
const { genId, gDocUrl } = require('../utils')
2+
const { gDocParse, getCredentials, downloadJsonDoc } = require('../google')
33

44
function scrapFrom (gdocID, pMeta = {}, tabsOperations = []) {
55
const meta = { ...pMeta }
@@ -15,30 +15,20 @@ function scrapFrom (gdocID, pMeta = {}, tabsOperations = []) {
1515

1616
return async function scrap () {
1717
try {
18-
const index = await downloadFile(gdocID)
19-
const rootNode = htmlparser.parse(index)
20-
const tabs = rootNode.querySelectorAll('table')
21-
tabsOperations.forEach((tabOperation) => {
22-
if (typeof tabOperation === 'function') {
23-
tabOperation(tabs)
24-
} else if (typeof tabOperation === 'string' && Array.prototype[tabOperation]) {
25-
Array.prototype[tabOperation].call(tabs)
26-
}
27-
})
18+
const creds = await getCredentials()
19+
const jsonDoc = await downloadJsonDoc(gdocID, creds)
2820
const catalog = {
2921
src: gDocUrl(gdocID),
3022
id: '',
3123
name: '',
3224
instagram: '',
3325
website: '',
3426
discord: '',
35-
nationality: getNationality(index),
36-
selfOrder: isSelfOrdered(index),
3727
sculpts: [],
3828
...meta
3929
}
4030
catalog.id = genId(meta.id || meta.name)
41-
return gDriveParse(catalog, tabs)
31+
return gDocParse(catalog, jsonDoc)
4232
} catch (e) {
4333
return {
4434
hasError: true,

src/utils.js

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -326,6 +326,7 @@ module.exports = {
326326
isSelfOrdered,
327327
getNationality,
328328
attributes,
329+
getAttributes,
329330
sortBy,
330331
launcher,
331332
resize,

0 commit comments

Comments
 (0)