Skip to content

Commit 87d0861

Browse files
committed
For migrating WordPress blog. Tabling, for re-rework.
1 parent 83a9ea1 commit 87d0861

13 files changed

+286
-97
lines changed

.vscode/launch.json

+14
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
{
2+
// Use IntelliSense to learn about possible attributes.
3+
// Hover to view descriptions of existing attributes.
4+
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
5+
"version": "0.2.0",
6+
"configurations": [
7+
{
8+
"type": "node",
9+
"request": "launch",
10+
"name": "Launch Program",
11+
"program": "${workspaceFolder}/example.js"
12+
}
13+
]
14+
}

.vscode/settings.json

+3
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
{
2+
"git.ignoreLimitWarning": true
3+
}

.vscode/tasks.json

+12
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
{
2+
// See https://go.microsoft.com/fwlink/?LinkId=733558
3+
// for the documentation about the tasks.json format
4+
"version": "2.0.0",
5+
"tasks": [
6+
{
7+
"type": "npm",
8+
"script": "start",
9+
"problemMatcher": []
10+
}
11+
]
12+
}

Makefile

+5-2
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,14 @@ test:
1717

1818
.PHONY: archive
1919
archive:
20-
yarn archive
20+
bin/archivator archive
2121

2222
.PHONY: analyze
2323
analyze:
24-
yarn analyze
24+
bin/archivator analyze
25+
26+
markdownify:
27+
yarn markdownify
2528

2629
dist:
2730
yarn dist

bin/archivator-markdownify.js

+24
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
import {
2+
catcher,
3+
iterateIntoArchivable
4+
} from '../src/module';
5+
import {
6+
read,
7+
handle,
8+
write
9+
} from '../src/markdownify';
10+
11+
const URL_LIST = 'archive/index.csv';
12+
13+
(async () => {
14+
/**
15+
* This loop is an anti-pattern: each archivable is processed
16+
* sequentially, whereas we want Promise.all(...) at each step.
17+
* TODO: Needs rework.
18+
*/
19+
for (const archivable of iterateIntoArchivable(URL_LIST)) {
20+
const contents = await read(archivable).catch(catcher);
21+
const handled = await handle(contents).catch(catcher);
22+
await write(handled).catch(catcher);
23+
}
24+
})();

package.json

+29-29
Original file line numberDiff line numberDiff line change
@@ -2,37 +2,51 @@
22
"name": "archivator",
33
"version": "1.0.0",
44
"description": "Fetch and stream body contents for URLs",
5-
"repository": {
6-
"type": "git",
7-
"url": "https://github.com/renoirb/archivator.git"
8-
},
95
"bugs": {
106
"url": "https://github.com/renoirb/archivator/issues"
117
},
12-
"bin": {
13-
"archivator-archive": "./src/archive.js",
14-
"archivator-analyze": "./src/analyze.js"
8+
"repository": {
9+
"type": "git",
10+
"url": "https://github.com/renoirb/archivator.git"
1511
},
12+
"license": "MIT",
13+
"author": "Renoir Boulanger <[email protected]> (http://renoirb.com/)",
1614
"main": "index.js",
1715
"module": "module.js",
16+
"bin": {
17+
"archivator-analyze": "./dist/analyze.js",
18+
"archivator-archive": "./dist/archive.js"
19+
},
1820
"scripts": {
21+
"analyze": "node_modules/.bin/babel-node src/analyze.js",
22+
"archive": "node_modules/.bin/babel-node src/archive.js",
23+
"dev": "node_modules/.bin/babel src/ -d dist/ -w -s",
1924
"dist": "node_modules/.bin/babel src/ --minified -d dist/ -s",
20-
"test": "node_modules/.bin/mocha --compilers js:babel-core/register",
2125
"lint": "node_modules/.bin/xo",
22-
"dev": "node_modules/.bin/babel src/ -d dist/ -w -s",
23-
"archive": "node_modules/.bin/babel-node src/archive.js",
24-
"analyze": "node_modules/.bin/babel-node src/analyze.js",
26+
"lint:fix": "node_modules/.bin/xo --fix",
2527
"markdownify": "node_modules/.bin/babel-node src/markdownify.js",
26-
"lint:fix": "node_modules/.bin/xo --fix"
28+
"test": "node_modules/.bin/mocha --compilers js:babel-core/register"
29+
},
30+
"xo": {
31+
"esnext": true,
32+
"ignores": [
33+
"dist/**",
34+
"example.js"
35+
],
36+
"plugins": [
37+
"unicorn"
38+
],
39+
"rules": {
40+
"func-names": 0
41+
},
42+
"space": 2
2743
},
28-
"author": "Renoir Boulanger <[email protected]> (http://renoirb.com/)",
29-
"license": "MIT",
3044
"dependencies": {
3145
"async-file": "^2.0.2",
3246
"babel-polyfill": "^6.23.0",
3347
"cheerio": "^0.22.0",
3448
"elasticsearch": "^12.1.3",
35-
"esm": "^3.0.81",
49+
"esm": "^3.2.25",
3650
"gen-readlines": "^0.1.3",
3751
"html-md-2": "^3.0.0",
3852
"node-fetch": "^1.6.3",
@@ -52,19 +66,5 @@
5266
"eslint-plugin-unicorn": "^1.0.0",
5367
"mocha": "^3.2.0",
5468
"xo": "^0.17.1"
55-
},
56-
"xo": {
57-
"esnext": true,
58-
"space": 2,
59-
"plugins": [
60-
"unicorn"
61-
],
62-
"ignores": [
63-
"dist/**",
64-
"example.js"
65-
],
66-
"rules": {
67-
"func-names": 0
68-
}
6969
}
7070
}

src/analyze.js

+11-11
Original file line numberDiff line numberDiff line change
@@ -7,22 +7,22 @@ import {
77
cheerioLoad
88
} from './common';
99

10-
import dictionary from './lists/stopwords.en';
11-
12-
/**
13-
* https://www.ranks.nl/stopwords
14-
* http://xpo6.com/list-of-english-stop-words/
15-
*/
16-
const stopWords = new Set(dictionary);
10+
import dictionary from './lists/stopwords.fr';
11+
import dictionaryEn from './lists/stopwords.en';
1712

1813
function normalize(input) {
1914
const dto = String(input) || '';
2015
return dto.replace(/[^\w\s]|_/g, '').toLowerCase();
2116
}
2217

2318
async function extractWords(recv, archivable) {
24-
const loaded = cheerioLoad(recv);
25-
return loaded.then(shard => {
19+
/**
20+
* https://www.ranks.nl/stopwords
21+
* http://xpo6.com/list-of-english-stop-words/
22+
*/
23+
const stopWordsSet = new Set([...dictionary, ...dictionaryEn]);
24+
25+
return cheerioLoad(recv).then(shard => {
2626
const truncate = archivable.truncate;
2727
shard(truncate).remove();
2828
const text = shard.text().split(' ');
@@ -31,7 +31,7 @@ async function extractWords(recv, archivable) {
3131
for (let i = 0; i < text.length; i++) {
3232
const word = normalize(text[i]);
3333
const withinCharRange = /^[a-zA-ZÀ-ÖØ-öø-ÿ]+$/.test(word);
34-
const isNotStopWord = stopWords.has(word) === false;
34+
const isNotStopWord = stopWordsSet.has(word) === false;
3535
const hasAtLeastTwo = word.length > 1;
3636
if (withinCharRange && isNotStopWord && hasAtLeastTwo) {
3737
if (foundOnce.has(word) === false) {
@@ -117,7 +117,7 @@ async function write(file, data = {}, boolOverwrite = true) {
117117
export default async archivable => {
118118
const slug = archivable.slug;
119119
const path = `archive/${slug}`;
120-
const cacheFile = `${path}/document.html`;
120+
const cacheFile = `${path}/cache.html`;
121121
const file = `${path}/analyze.json`;
122122
return Promise.resolve(cacheFile)
123123
.then(cacheFile => analyze(cacheFile, archivable))

src/common.js

+4-2
Original file line numberDiff line numberDiff line change
@@ -58,8 +58,10 @@ function readCachedError(errorObj) {
5858
}
5959

6060
// TODO: Make it possible to run extractLinks, markdownify, ... in parallel
61-
async function cheerioLoad(recv, configObj = {}) {
62-
return new Promise(resolve => resolve(cheerio.load(recv, configObj)));
61+
function cheerioLoad(recv, configObj = {}) {
62+
// console.log('async cheerioLoad', { recv, configObj })
63+
const loading = cheerio.load(recv, configObj);
64+
return Promise.resolve(loading);
6365
}
6466

6567
const urlNotInBlacklist = u => {

src/lists/stopwords.en.json

+14-2
Original file line numberDiff line numberDiff line change
@@ -563,5 +563,17 @@
563563
"yourself",
564564
"yourselves",
565565
"youve",
566-
"zero"
567-
]
566+
"zero",
567+
"january",
568+
"february",
569+
"march",
570+
"april",
571+
"may",
572+
"june",
573+
"july",
574+
"august",
575+
"september",
576+
"october",
577+
"november",
578+
"december"
579+
]

src/lists/stopwords.fr.json

+46
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
[
2+
"a",
3+
"avec",
4+
"ce",
5+
"dans",
6+
"de",
7+
"de",
8+
"des",
9+
"du",
10+
"en",
11+
"est",
12+
"jai",
13+
"je",
14+
"la",
15+
"le",
16+
"les",
17+
"long",
18+
"mon",
19+
"non",
20+
"off",
21+
"oh",
22+
"ok",
23+
"or",
24+
"pas",
25+
"possible",
26+
"pour",
27+
"que",
28+
"qui",
29+
"sensible",
30+
"sont",
31+
"sur",
32+
"une",
33+
"zero",
34+
"janvier",
35+
"fevrier",
36+
"mars",
37+
"avril",
38+
"mai",
39+
"juin",
40+
"juillet",
41+
"aout",
42+
"septembre",
43+
"octobre",
44+
"novembre",
45+
"decembre"
46+
]

0 commit comments

Comments
 (0)