import * as fsa from 'async-file';

import {
-   readLines,
-   handleIndexSourceErrors,
-   readCached,
-   figureOutTruncateAndSelector,
+   readFileWithErrorHandling,
  cheerioLoad
} from './common';

+ import dictionary from './lists/stopwords.en';
+
/**
 * https://www.ranks.nl/stopwords
 * http://xpo6.com/list-of-english-stop-words/
 */
- const stopWords = new Set(require('./stopwords'));
-
- const URL_LIST = 'archive/index.csv';
- const OVERWRITE = true;
+ const stopWords = new Set(dictionary);

- function transformText(input) {
+ function normalize(input) {
  const dto = String(input) || '';
  return dto.replace(/[^\w\s]|_/g, '').toLowerCase();
}

- async function extractWords(recv, source) {
+ async function extractWords(recv, archivable) {
  const loaded = cheerioLoad(recv);
  return loaded.then(shard => {
-     const {_, truncate} = figureOutTruncateAndSelector(source);
+     const truncate = archivable.truncate;
    shard(truncate).remove();
    const text = shard.text().split(' ');
    const words = Object.create(null);
    const foundOnce = new Set();
    for (let i = 0; i < text.length; i++) {
-       const word = transformText(text[i]);
+       const word = normalize(text[i]);
      const withinCharRange = /^[a-zA-ZÀ-ÖØ-öø-ÿ]+$/.test(word);
      const isNotStopWord = stopWords.has(word) === false;
      const hasAtLeastTwo = word.length > 1;
@@ -51,20 +47,18 @@ async function extractWords(recv, source) {
  });
}

- async function read(source) {
-   const path = `archive/${source.slug}`;
-   const cacheFile = `${path}/cache.html`;
-   const targetFileName = `${path}/analyze.json`;
+ async function analyze(cacheFile, archivable) {
  const cacheExists = await fsa.exists(cacheFile);
  const data = Object.create(null);
  if (cacheExists === true) {
-     const cacheData = await readCached(cacheFile);
-     const words = await extractWords(cacheData, source);
-     data.words = words;
+     const cacheData = await readFileWithErrorHandling(cacheFile);
+     const words = await extractWords(cacheData, archivable);
+     const {sorted, keywords} = await sortedAndKeywords(words);
+     data.words = Object.assign({}, sorted);
+     data.keywords = Object.assign({}, keywords);
  }

-   console.log(`\nProcessing ${path}`);
-   return {file: targetFileName, data};
+   return data;
}

function sort(subject = {}) {
@@ -82,52 +76,50 @@ function sort(subject = {}) {
  return sortable; // array in format [ [ key1, val1 ], [ key2, val2 ], ... ]
}

- async function analyze(recv) {
-   const words = recv.data.words || {};
+ async function sortedAndKeywords(words = {}) {
  const keywords = Object.create(null);
-   const sorted = sort(words);
+   const sorted = Object.create(null);
+   const sorting = sort(words);
  const max = 10;
  let iter = 0;
-   for (const popular of sorted) {
+   for (const popular of sorting) {
    const used = popular[1]; // word has been used n times
    const word = popular[0];
    if (iter <= max && used > 3) {
      keywords[word] = used;
    }
+     sorted[word] = used;
    iter++;
  }

  const wordCount = Object.keys(words).length;
-   const paddedCounter = String(wordCount).padStart('5', ' ');
-   let logLine = paddedCounter + ' words found';
+   let logLine = ' analysis:';
+   console.log(logLine);
+   logLine = ' words: ' + wordCount;
  console.log(logLine);
  const firstThreeKeywords = Object.keys(keywords).slice(0, 3).join(', ');
-   logLine = ' Top keywords: ' + firstThreeKeywords;
+   logLine = ' keywords: ' + firstThreeKeywords;
  console.log(logLine);

-   recv.data.keywords = keywords;
-
-   return recv;
+   return {sorted, keywords};
}

- async function write({file, data = {}}, boolOverwrite = false) {
+ async function write(file, data = {}, boolOverwrite = true) {
  const destExists = await fsa.exists(file);
+   const contents = JSON.stringify(data);
  if (destExists === false || (destExists === true && boolOverwrite)) {
-     await fsa.writeTextFile(file, JSON.stringify(data), 'utf8');
+     await fsa.writeTextFile(file, contents, 'utf8');
  }

-   return {file, data};
+   return {file, contents};
}

- /**
-  * Something is going somewhat as an anti-pattern here.
-  * We want Promise.all(...) at each step, and it's not how
-  * it is as of now. Needs rework here. TODO
-  */
- for (const url of readLines(URL_LIST)) {
-   Promise.resolve(url)
-     .then(u => read(u))
-     .then(descriptor => analyze(descriptor))
-     .then(descriptor => write(descriptor, OVERWRITE))
-     .catch(handleIndexSourceErrors);
- }
+ export default async archivable => {
+   const slug = archivable.slug;
+   const path = `archive/${slug}`;
+   const cacheFile = `${path}/document.html`;
+   const file = `${path}/analyze.json`;
+   return Promise.resolve(cacheFile)
+     .then(cacheFile => analyze(cacheFile, archivable))
+     .then(analyzed => write(file, analyzed));
+ };
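As a rough usage sketch (not part of this change): the new default export takes an `archivable` object, which this diff only shows carrying a `slug` and a `truncate` selector; the module path and the example field values below are assumptions.

// Hypothetical caller: analyzes one archived page whose fetched HTML
// is expected at archive/<slug>/document.html from an earlier step.
import analyze from './analyze';

const archivable = {
  slug: 'example-article',          // assumed field; used to build the archive/<slug> paths
  truncate: 'script, style, footer' // assumed field; selector of nodes removed before word counting
};

analyze(archivable)
  .then(({file, contents}) => console.log(`Wrote ${file} (${contents.length} characters)`))
  .catch(err => console.error(err));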