Commit 83a9ea1

Make into ESM. Review patterns.
1 parent f238519 commit 83a9ea1

16 files changed: +505 -316 lines changed

bin/archivator

+22
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+
+Function=Function//; node -r esm "$0" "$@"; exit
+
+/**
+ * Leverage Require ESM Node.js CLI option.
+ *
+ * Source https://github.com/nodejs/modules/issues/152#issuecomment-405657530
+ */
+
+// import path from 'path';
+
+const [
+  handler,
+  fileName,
+  ...args
+] = process.argv
+
+// const srcDir = path.resolve('../src')
+// module.paths.push(srcDir)
+
+import './archivator-archive'
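
Note: the Function=Function// line is a bash/JavaScript polyglot, per the linked nodejs/modules comment. Bash runs the file first (the shebang points at bash): a throwaway variable assignment, then node -r esm "$0" "$@" to re-run this same file under Node.js with the esm loader, then exit. When Node.js parses the file it strips the shebang, and the same line reads as the no-op assignment Function=Function followed by a // comment. A minimal sketch of the pattern, with a hypothetical body:

#!/usr/bin/env bash
Function=Function//; node -r esm "$0" "$@"; exit

// From here down, only Node.js (with the esm loader) executes.
import {basename} from 'path';
console.log(basename(process.argv[1]), process.argv.slice(2));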

bin/archivator-archive.js

+22
@@ -0,0 +1,22 @@
+import {
+  catcher,
+  fetcher,
+  transformer,
+  analyzer,
+  iterateIntoArchivable
+} from '../src/module';
+
+const URL_LIST = 'archive/index.csv';
+
+(async () => {
+  /**
+   * Something is going somewhat as an anti-pattern here.
+   * We want Promise.all(...) at each step, and it's not how
+   * it is as of now. Needs rework here. TODO
+   */
+  for (const archivable of iterateIntoArchivable(URL_LIST)) {
+    await fetcher(archivable).catch(catcher);
+    await transformer(archivable).catch(catcher);
+    await analyzer(archivable).catch(catcher);
+  }
+})();
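
Note: as the TODO says, this loop drains each archivable through all three steps before starting the next one, so nothing runs concurrently. One possible rework the comment hints at, sketched on the assumption that iterateIntoArchivable stays a synchronous iterable and the three steps keep their current signatures:

(async () => {
  // Hypothetical rework: one promise chain per archivable, all started up front.
  const pipeline = archivable =>
    fetcher(archivable)
      .then(() => transformer(archivable))
      .then(() => analyzer(archivable))
      .catch(catcher);
  await Promise.all([...iterateIntoArchivable(URL_LIST)].map(pipeline));
})();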

package.json

+4 -1
@@ -4,7 +4,7 @@
   "description": "Fetch and stream body contents for URLs",
   "repository": {
     "type": "git",
-    "url": "git@github.com:renoirb/archivator.git"
+    "url": "https://github.com/renoirb/archivator.git"
   },
   "bugs": {
     "url": "https://github.com/renoirb/archivator/issues"
@@ -13,6 +13,8 @@
     "archivator-archive": "./src/archive.js",
     "archivator-analyze": "./src/analyze.js"
   },
+  "main": "index.js",
+  "module": "module.js",
   "scripts": {
     "dist": "node_modules/.bin/babel src/ --minified -d dist/ -s",
     "test": "node_modules/.bin/mocha --compilers js:babel-core/register",
@@ -30,6 +32,7 @@
     "babel-polyfill": "^6.23.0",
     "cheerio": "^0.22.0",
     "elasticsearch": "^12.1.3",
+    "esm": "^3.0.81",
     "gen-readlines": "^0.1.3",
     "html-md-2": "^3.0.0",
     "node-fetch": "^1.6.3",

src/analyze.js

+38 -46
@@ -3,37 +3,33 @@
 import * as fsa from 'async-file';
 
 import {
-  readLines,
-  handleIndexSourceErrors,
-  readCached,
-  figureOutTruncateAndSelector,
+  readFileWithErrorHandling,
   cheerioLoad
 } from './common';
 
+import dictionary from './lists/stopwords.en';
+
 /**
  * https://www.ranks.nl/stopwords
  * http://xpo6.com/list-of-english-stop-words/
  */
-const stopWords = new Set(require('./stopwords'));
-
-const URL_LIST = 'archive/index.csv';
-const OVERWRITE = true;
+const stopWords = new Set(dictionary);
 
-function transformText(input) {
+function normalize(input) {
   const dto = String(input) || '';
   return dto.replace(/[^\w\s]|_/g, '').toLowerCase();
 }
 
-async function extractWords(recv, source) {
+async function extractWords(recv, archivable) {
   const loaded = cheerioLoad(recv);
   return loaded.then(shard => {
-    const {_, truncate} = figureOutTruncateAndSelector(source);
+    const truncate = archivable.truncate;
     shard(truncate).remove();
     const text = shard.text().split(' ');
     const words = Object.create(null);
     const foundOnce = new Set();
     for (let i = 0; i < text.length; i++) {
-      const word = transformText(text[i]);
+      const word = normalize(text[i]);
       const withinCharRange = /^[a-zA-ZÀ-ÖØ-öø-ÿ]+$/.test(word);
       const isNotStopWord = stopWords.has(word) === false;
       const hasAtLeastTwo = word.length > 1;
@@ -51,20 +47,18 @@ async function extractWords(recv, source) {
   });
 }
 
-async function read(source) {
-  const path = `archive/${source.slug}`;
-  const cacheFile = `${path}/cache.html`;
-  const targetFileName = `${path}/analyze.json`;
+async function analyze(cacheFile, archivable) {
   const cacheExists = await fsa.exists(cacheFile);
   const data = Object.create(null);
   if (cacheExists === true) {
-    const cacheData = await readCached(cacheFile);
-    const words = await extractWords(cacheData, source);
-    data.words = words;
+    const cacheData = await readFileWithErrorHandling(cacheFile);
+    const words = await extractWords(cacheData, archivable);
+    const {sorted, keywords} = await sortedAndKeywords(words);
+    data.words = Object.assign({}, sorted);
+    data.keywords = Object.assign({}, keywords);
   }
 
-  console.log(`\nProcessing ${path}`);
-  return {file: targetFileName, data};
+  return data;
 }
 
 function sort(subject = {}) {
@@ -82,52 +76,50 @@ function sort(subject = {}) {
   return sortable; // array in format [ [ key1, val1 ], [ key2, val2 ], ... ]
 }
 
-async function analyze(recv) {
-  const words = recv.data.words || {};
+async function sortedAndKeywords(words = {}) {
   const keywords = Object.create(null);
-  const sorted = sort(words);
+  const sorted = Object.create(null);
+  const sorting = sort(words);
   const max = 10;
   let iter = 0;
-  for (const popular of sorted) {
+  for (const popular of sorting) {
     const used = popular[1]; // word has been used n times
     const word = popular[0];
     if (iter <= max && used > 3) {
       keywords[word] = used;
     }
+    sorted[word] = used;
     iter++;
   }
 
   const wordCount = Object.keys(words).length;
-  const paddedCounter = String(wordCount).padStart('5', ' ');
-  let logLine = paddedCounter + ' words found';
+  let logLine = ' analysis:';
+  console.log(logLine);
+  logLine = ' words: ' + wordCount;
   console.log(logLine);
   const firstThreeKeywords = Object.keys(keywords).slice(0, 3).join(', ');
-  logLine = ' Top keywords: ' + firstThreeKeywords;
+  logLine = ' keywords: ' + firstThreeKeywords;
   console.log(logLine);
 
-  recv.data.keywords = keywords;
-
-  return recv;
+  return {sorted, keywords};
 }
 
-async function write({file, data = {}}, boolOverwrite = false) {
+async function write(file, data = {}, boolOverwrite = true) {
   const destExists = await fsa.exists(file);
+  const contents = JSON.stringify(data);
   if (destExists === false || (destExists === true && boolOverwrite)) {
-    await fsa.writeTextFile(file, JSON.stringify(data), 'utf8');
+    await fsa.writeTextFile(file, contents, 'utf8');
   }
 
-  return {file, data};
+  return {file, contents};
 }
 
-/**
- * Something is going somewhat as an anti-pattern here.
- * We want Promise.all(...) at each step, and it's not how
- * it is as of now. Needs rework here. TODO
- */
-for (const url of readLines(URL_LIST)) {
-  Promise.resolve(url)
-    .then(u => read(u))
-    .then(descriptor => analyze(descriptor))
-    .then(descriptor => write(descriptor, OVERWRITE))
-    .catch(handleIndexSourceErrors);
-}
+export default async archivable => {
+  const slug = archivable.slug;
+  const path = `archive/${slug}`;
+  const cacheFile = `${path}/document.html`;
+  const file = `${path}/analyze.json`;
+  return Promise.resolve(cacheFile)
+    .then(cacheFile => analyze(cacheFile, archivable))
+    .then(analyzed => write(file, analyzed));
+};
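
Note: analyze.js no longer walks archive/index.csv on its own; it now exports a single async function that takes an Archivable, reads archive/<slug>/document.html, and writes archive/<slug>/analyze.json. A usage sketch (the import paths are assumptions):

import analyzer from './src/analyze';
import Archivable from './src/archivable';

(async () => {
  // Empty selector and truncate fields fall back to the defaults.
  const archivable = Archivable.fromLine('http://example.org/a/b.html;;');
  await analyzer(archivable);
})();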

src/archivable.js

+130
@@ -0,0 +1,130 @@
+import {URL} from 'url';
+
+import normalilzeUrlSlug from './normalizer/slugs';
+import {cheerioLoad, urlNotInBlacklist} from './common';
+import extractAssets from './assets-handler';
+
+function simplifyBody(truncate, selector, body) {
+  return cheerioLoad(body).then(shard => {
+    shard(truncate).remove();
+    return shard(selector);
+  });
+}
+
+async function extractLinks(body) {
+  const links = new Set();
+  return cheerioLoad(body).then(shard => {
+    shard('a[href]').each((_, element) => {
+      const href = shard(element).attr('href');
+      try {
+        const hrefObj = new URL(href);
+        const isHttpLink = hrefObj.protocol.startsWith('http');
+        if (isHttpLink) {
+          const rewritten = `${hrefObj.origin}${hrefObj.pathname}`;
+          if (urlNotInBlacklist(rewritten)) {
+            links.add(rewritten);
+          }
+        }
+      } catch (err) { }
+    });
+    return [...links];
+  });
+}
+
+function appendDefaultTruncate(truncateArg) {
+  // Truncate is to strip off any patterns we do not want
+  // as part of our archived article.
+  let truncate = (truncateArg.length === 0) ? '' : `${truncateArg},`;
+  truncate += 'script,style,noscript';
+  return truncate;
+}
+
+function appendDefaultSelector(selectorArg) {
+  // If we know exactly where the main content is, otherwise grab the whole
+  // document body.
+  return (selectorArg.length === 0) ? 'body' : `${selectorArg}`;
+}
+
+/**
+ * Given every row in source file .csv
+ * http://example.org/a/b.html;selector;truncate
+ *
+ * selector is the CSS selector where the main content is
+ * truncate is a list of CSS selectors to strip off
+ */
+class Archivable {
+  constructor(
+    url,
+    truncate = '',
+    selector = '',
+  ) {
+    if (typeof url === 'undefined') {
+      const message = `First argument is URL, and is Required`;
+      throw new Error(message);
+    }
+    this.url = url;
+    this.slug = normalilzeUrlSlug(url);
+    this.truncate = truncate;
+    this.selector = selector;
+    this.assets = [];
+    this.links = [];
+    this.body = '';
+  }
+
+  async setBody(body) {
+    const truncate = this.truncate;
+    const selector = this.selector;
+    const simplified = await simplifyBody(truncate, selector, body);
+    const html = simplified.html();
+    this.body = html;
+    const links = await extractLinks(html);
+    this.links = [...links];
+    const assets = await extractAssets(this);
+    this.assets = [...assets];
+  }
+
+  toJSON() {
+    const url = this.url;
+    const slug = this.slug;
+    const truncate = this.truncate;
+    const selector = this.selector;
+    const assets = [...this.assets];
+    const links = [...this.links];
+
+    return {
+      url,
+      slug,
+      truncate,
+      selector,
+      assets,
+      links
+    };
+  }
+
+  static fromJSON(arg) {
+    const argIsString = typeof arg === 'string';
+    if (argIsString === false) {
+      const message = 'Only String is supported';
+      throw new Error(message);
+    }
+    const {
+      url,
+      truncate,
+      selector
+    } = JSON.parse(arg);
+
+    return new Archivable(url, truncate, selector);
+  }
+
+  static fromLine(
+    line = 'http://localhost;;',
+  ) {
+    const [url, selectorArg = '', truncateArg = ''] = line.split(';');
+    const truncate = appendDefaultTruncate(truncateArg);
+    const selector = appendDefaultSelector(selectorArg);
+
+    return new Archivable(url, truncate, selector);
+  }
+}
+
+export default Archivable;
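
Note: fromLine is the bridge from the url;selector;truncate rows documented in the class docblock. A short illustration with hypothetical field values:

import Archivable from './src/archivable';

const archivable = Archivable.fromLine('http://example.org/a/b.html;article .post;.ads');
// archivable.selector === 'article .post'
// archivable.truncate === '.ads,script,style,noscript' (defaults appended)
// An empty selector field would have fallen back to 'body'.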

src/archive.js

-18
This file was deleted.
