-
Notifications
You must be signed in to change notification settings - Fork 0
/
app.js
149 lines (128 loc) · 4.87 KB
/
app.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import path from 'path'
import { fileURLToPath } from 'url'
const __filename = fileURLToPath(import.meta.url)
const __dirname = path.dirname(__filename)
import fs from 'fs'
import * as pdfjs from 'pdfjs-dist'
/**
 * Collect the text items of every page of a PDF, in page order.
 *
 * @param {string} pdfPath - Handed to pdf.js as the document `url`.
 * @returns {Promise<Array>} The `items` arrays returned by each page's
 *   `getTextContent()`, concatenated from page 1 to `numPages`.
 */
const extractTextFromPDF = async (pdfPath) => {
  const pdf = await pdfjs.getDocument({ url: pdfPath, verbosity: 0 }).promise
  const textItems = []
  // Pages are 1-based. Appending in place keeps the work linear in the number
  // of items instead of re-copying the accumulated array for every page.
  for (let pageNum = 1; pageNum <= pdf.numPages; pageNum++) {
    const page = await pdf.getPage(pageNum)
    const { items } = await page.getTextContent()
    for (const item of items) {
      textItems.push(item)
    }
  }
  return textItems
}
// Base names (the `.js` suffix removed) of every `.js` file found directly
// inside the `bank_modules` directory that sits next to this script.
const availableBankModules = []
for (const fileName of fs.readdirSync(`${__dirname}/bank_modules`)) {
  if (fileName.endsWith('.js')) {
    availableBankModules.push(fileName.slice(0, -'.js'.length))
  }
}
/**
 * Print the command-line help text to stdout and, unless told otherwise,
 * terminate the process.
 *
 * @param {?number} [exitCode=0] - Status passed to `process.exit`. Pass `null`
 *   to print the help text without exiting (an omitted argument becomes 0).
 */
const usage = (exitCode = 0) => {
  // path.basename honours the platform's path separator, so the usage line
  // shows bare executable/script names on Windows as well as on POSIX.
  const runtimeName = path.basename(process.argv[0])
  const scriptName = path.basename(process.argv[1])
  console.log(
    `Usage: ${runtimeName} ${scriptName} [-h] [-o OUTPUT_DIR] FILE/DIR [FILE/DIR ...]
Description:
Parse transactions from supported bank statement PDFs into a JSON array.
Options:
-h Show this help message and exit.
-o Reorganize input files by moving them into a specified output directory
Arguments:
FILE/DIR The path to a PDF file or directory in which to search (recursively) for PDF files.
You can specify more than one.
Results from all files are merged into one array, sorted by date.`
  )
  if (exitCode !== null && exitCode !== undefined) {
    process.exit(exitCode)
  }
}
// Index of the next unconsumed command-line argument
// (argv[0] is the runtime, argv[1] is this script).
let argi = 2
if (process.argv[argi] === '-h') {
  usage()
}
// Options are only recognised in the first argument position.
const organizeRequested = process.argv[argi] === '-o'
// Directory that recognised PDFs are moved into, or null when -o was not given.
const ORGANIZE_TARGET = organizeRequested ? process.argv[argi + 1] : null
if (organizeRequested) {
  // A missing or empty value would otherwise leave "-o" in the argument list
  // and be reported later as an invalid input path.
  if (!ORGANIZE_TARGET) {
    console.error('Invalid -o directory!')
    usage(1)
  }
  argi += 2
  // A target that does not exist yet is fine (it is created on demand);
  // an existing non-directory is not.
  if (fs.existsSync(ORGANIZE_TARGET) && !fs.statSync(ORGANIZE_TARGET).isDirectory()) {
    console.error('Invalid -o directory!')
    usage(1)
  }
}
// Loaded bank modules, keyed by module name (file name without `.js`).
const bankModules = {}
for (const moduleName of availableBankModules) {
  // import() takes a URL-like specifier, not a filesystem path: a raw absolute
  // path fails on Windows, and characters such as `#`, `?` or `%` in a file
  // name would be misread. Resolving against this module's own URL avoids both.
  const moduleUrl = new URL(`./bank_modules/${encodeURIComponent(moduleName)}.js`, import.meta.url)
  bankModules[moduleName] = await import(moduleUrl.href)
}
// Whatever follows the consumed options is the list of files/directories to
// process; at least one is required.
const pathArgs = process.argv.slice(argi)
const hasNoPathArgs = pathArgs.length === 0
if (hasNoPathArgs) {
  console.error('No files/dirs specified!\n')
  usage(1)
}
/**
 * Expand one command-line path argument into the files it refers to.
 *
 * A path that is a regular file is returned as-is (whatever its extension).
 * Anything else is walked recursively as a directory, keeping only entries
 * whose path ends in `.pdf` (case-insensitive). A path that does not exist is
 * reported on stderr and `usage(1)` is invoked.
 *
 * @param {string} pathArg - File or directory path.
 * @returns {string[]} File paths, in directory-listing order.
 */
const getTargetFilesFromArg = (pathArg) => {
  if (!fs.existsSync(pathArg)) {
    console.error(`Invalid file/dir: ${pathArg}\n`)
    usage(1)
  }
  if (fs.statSync(pathArg).isFile()) {
    return [pathArg]
  }
  // Depth-first walk: sub-directories are expanded in place so results keep
  // the order in which entries are listed.
  const collectPdfs = (dirPath) =>
    fs.readdirSync(dirPath).flatMap((entryName) => {
      const entryPath = `${dirPath}/${entryName}`
      if (fs.statSync(entryPath).isDirectory()) {
        return collectPdfs(entryPath)
      }
      return entryPath.toLowerCase().endsWith('.pdf') ? [entryPath] : []
    })
  return collectPdfs(pathArg)
}
// Every file to process, in command-line order. flatMap concatenates the
// per-argument lists without spreading them into call arguments, so a very
// large directory tree cannot exceed the engine's argument-count limit.
const files = pathArgs.flatMap((pathArg) => getTargetFilesFromArg(pathArg))
/**
 * Format a Date as `YYYY-MM-DD` using its local-time fields.
 *
 * @param {Date} date
 * @returns {string} Year (unpadded), then two-digit month and day, joined by `-`.
 */
const formatDate = (date) => {
  const twoDigits = (value) => String(value).padStart(2, '0')
  // getMonth() is zero-based, hence the + 1.
  return [date.getFullYear(), twoDigits(date.getMonth() + 1), twoDigits(date.getDate())].join('-')
}
// Parse every input PDF with the first bank module that recognises it,
// optionally move the file into the -o directory, then print all collected
// entries as one JSON array sorted by their `date` field.
const entries = []
for (const file of files) {
  try {
    const fileText = await extractTextFromPDF(file)

    // Try the modules in order; the first truthy validateFile() result wins.
    let matchedModule = null
    let fileValidatedInfo = null
    for (const moduleName of availableBankModules) {
      fileValidatedInfo = bankModules[moduleName].validateFile(fileText)
      if (fileValidatedInfo) {
        matchedModule = bankModules[moduleName]
        break
      }
    }

    if (!fileValidatedInfo) {
      console.error(`Did not recognize PDF (skipped): ${file}`)
      continue
    }

    const { bank, account } = fileValidatedInfo
    // Tag each entry with its origin. Pushing one by one (rather than
    // push.apply) keeps very long statements clear of the argument-count limit.
    for (const entry of matchedModule.extractEntries(fileText)) {
      entry.bank = bank
      entry.account = account
      entries.push(entry)
    }

    if (ORGANIZE_TARGET) {
      // Spaces become dashes in both the directory names and the file name.
      const bankSlug = bank.replaceAll(' ', '-')
      const accountSlug = account.replaceAll(' ', '-')
      const targetDir = `${ORGANIZE_TARGET}/${bankSlug}/${accountSlug}`
      // With `recursive: true` this is a no-op when the directory already exists.
      fs.mkdirSync(targetDir, { recursive: true })
      const fn = `${bankSlug}_${accountSlug}_${formatDate(fileValidatedInfo.date)}.pdf`
      const targetPath = `${targetDir}/${fn}`
      // A file that already sits at its organised location (e.g. when the
      // tool is re-run on the output directory) must not be copied onto
      // itself and then deleted.
      // NOTE(review): a different source that maps to an existing target
      // name still overwrites it - confirm that is acceptable.
      if (path.resolve(file) !== path.resolve(targetPath)) {
        fs.cpSync(file, targetPath)
        fs.unlinkSync(file)
      }
    }
  } catch (e) {
    // Name the offending file before letting the error abort the run.
    console.error(file)
    throw e
  }
}
// Subtraction relies on `date` values coercing to numbers (as Date objects do).
entries.sort((a, b) => a.date - b.date)
console.log(JSON.stringify(entries, null, ' '))