-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpdf-parser.rb
159 lines (139 loc) · 3.84 KB
/
pdf-parser.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
#!/usr/bin/ruby
require 'pp'
require 'pdf/reader'
# Extracts information about a pay-sheet from its PDF file.
#
# PaySheetPdfParser.parse returns a nested Hash shaped like the example
# below (sensitive values are redacted). Every key of :pay, :tax and :extra,
# as well as :version in :meta, is only present when the corresponding
# pattern was found on the first page of the document.
#
#   {:meta=>
#     {:title=>"PAY18E",
#      :creator=>"OpenText Exstream Version 9.5.304 64-bit",
#      :date=>"12/14/2018 20:41:59",
#      :author=>"Registered to: DGCP",
#      :pages=>1,
#      :filename=>"docs/2018_12_BP_decembre.pdf",
#      :version=>"V2.0 - 26062018"},
#    :pay=>
#     {:month=>"decembre",
#      :year=>"2018",
#      :net=>XXXX,
#      :brut=>XXXX,
#      :employeur=>XXXX},
#    :tax=>{:year=>XXXX, :month=>XXXX.X},
#    :extra=>
#     {:account=>"FR76 XXXXXXX",
#      :heures=>"+.DE.120.H",
#      :date_paiement=>"18 decembre 2018 ",
#      :secu=>"1.XXXX",
#      :secu_cle=>"58",
#      :grade=>"XXXX",
#      :indice=>123,
#      :indice_nbi=>75}}
class PaySheetPdfParser
  # Converts a space-grouped, comma-decimal amount such as "1 234,56" to a
  # Float. Both the grouping spaces and the decimal comma must be normalised,
  # otherwise String#to_f stops at the first character it cannot parse.
  TO_AMOUNT = lambda { |s| s.delete(' ').tr(',', '.').to_f }
  # Plain integer value.
  TO_INTEGER = lambda { |s| s.to_i }
  # Integer value that may be prefixed by the literal "NBI ".
  TO_NBI = lambda { |s| s.sub('NBI ', '').to_i }
  # Replaces every space by a dot.
  TO_DOTTED = lambda { |s| s.tr(' ', '.') }
  # Keeps the captured text unchanged.
  TO_TEXT = lambda { |s| s }
  # Collapses each run of spaces into a single space and lowercases the text.
  TO_PAY_DATE = lambda { |s| s.gsub(/ +/, ' ').downcase }

  # Net amount to pay, searched in the extracted page text.
  NET_PAY_PATTERN = /NET À PAYER\s+(\d[ \d,]+\d)\s+€/
  # Pay period ("<month> <year>"), searched in the extracted page text.
  PERIOD_PATTERN = /MOIS DE\s+(.+)$/
  # Bank account: two consecutive strings of the raw content stream.
  ACCOUNT_PATTERN = /\/F243 83.3333 Tf 89 312 Td \(\s*(.*?)\) Tj\s0 -95 Td \(\s+(.*)\)/

  # Single-capture fields read from the raw content stream of the page.
  # Each entry is [section, key, pattern, converter]. Apart from the version,
  # the patterns rely on the exact font size and coordinates used by the
  # pay-sheet layout.
  RAW_FIELDS = [
    # pay-sheet version, e.g.
    # ... (PAY18E) Tj 0.000 -1.000 1.000 0.000 4832 1236 Tm ( - V1.4 - 25102016) ...
    [:meta, :version, /PAY18E.*\( - (.*)\) /, TO_TEXT],
    # gross salary
    [:pay, :brut, /\/F243 75.0000 Tf 321[24] 6011 Td \(\s*(.*)\) Tj/, TO_AMOUNT],
    # employer cost
    [:pay, :employeur, /\/F243 83.3333 Tf 1836 1894 Td \(\s*(.*)\) Tj/, TO_AMOUNT],
    # taxable amount for the year
    [:tax, :year, /\/F243 83.3333 Tf 183 1188 Td \(\s*(.*)\) Tj/, TO_AMOUNT],
    # taxable amount for the month
    [:tax, :month, /\/F243 83.3333 Tf 989 1188 Td \(\s*(.*)\) Tj/, TO_AMOUNT],
    # index
    [:extra, :indice, /\/F243 83.3333 Tf 3462 6252 Td \(\s*(.*)\) Tj/, TO_INTEGER],
    # NBI index
    [:extra, :indice_nbi, /\/F243 83.3333 Tf 3860 6252 Td \(\s*(.*)\) Tj/, TO_NBI],
    # grade
    [:extra, :grade, /\/F243 83.3333 Tf 1784 6252 Td \(\s*(.*)\) Tj/, TO_TEXT],
    # social security number
    [:extra, :secu, /\/F243 83.3333 Tf 370 6252 Td \(\s*(.*)\) Tj/, TO_DOTTED],
    # social security number key
    [:extra, :secu_cle, /\/F243 83.3333 Tf 1386 6252 Td \(\s*(.*)\) Tj/, TO_DOTTED],
    # working time (hours)
    [:extra, :heures, /\/F243 83.3333 Tf 4293 6885 Td \(\s*(.*)\) Tj/, TO_DOTTED],
    # payment date
    [:extra, :date_paiement, /\/F243 83.3333 Tf 484 586 Td \(\s*(.*)\) Tj/, TO_PAY_DATE]
  ].freeze

  # Parses the first page of a pay-sheet PDF.
  #
  # file:: path (or any source accepted by PDF::Reader.new) of the PDF.
  #
  # Returns a Hash with the :meta, :pay, :tax and :extra sections described
  # in the class header. Metadata entries missing from the PDF are nil.
  # When a pattern matches several lines, the last match wins.
  def self.parse(file)
    pdf = PDF::Reader.new(file)
    page = pdf.pages[0]
    text_lines = page.text.split("\n")
    # A PDF is not required to carry an Info dictionary.
    info = pdf.info || {}
    infos = {
      :meta => {
        :title => clean_meta(info[:Title]),
        :creator => clean_meta(info[:Creator]),
        :date => clean_meta(info[:CreationDate]),
        :author => clean_meta(info[:Author]),
        :pages => pdf.pages.size,
        :filename => file
      },
      :pay => {},
      :tax => {},
      :extra => {}
    }
    text_lines.each { |line| scan_text_line(line, infos[:pay]) }
    page.raw_content.split("\n").each { |line| scan_raw_line(line, infos) }
    infos
  end

  # Returns the raw content stream of the first page of the PDF, as given by
  # PDF::Reader. Useful to find the patterns of new fields.
  def self.parse_raw(file)
    pdf = PDF::Reader.new(file)
    pdf.pages[0].raw_content
  end

  # Strips a metadata value when it supports it; nil stays nil.
  def self.clean_meta(value)
    value.respond_to?(:strip) ? value.strip : value
  end

  # Looks for the net amount and the pay period in one line of page text and
  # stores what it finds in the +pay+ Hash.
  def self.scan_text_line(line, pay)
    net = line.match(NET_PAY_PATTERN)
    pay[:net] = TO_AMOUNT.call(net[1]) if net

    period = line.match(PERIOD_PATTERN)
    return unless period

    month, year = period[1].downcase.split
    pay[:month] = month
    pay[:year] = year
  end

  # Applies every raw-content pattern to one line of the content stream and
  # stores the converted captures in the matching section of +infos+.
  def self.scan_raw_line(line, infos)
    RAW_FIELDS.each do |section, key, pattern, convert|
      found = line.match(pattern)
      infos[section][key] = convert.call(found[1]) if found
    end
    # The account is the only field made of two captures, joined by '-'.
    account = line.match(ACCOUNT_PATTERN)
    infos[:extra][:account] = "#{account[1]}-#{account[2]}" if account
  end

  private_class_method :clean_meta, :scan_text_line, :scan_raw_line
end
# Ad-hoc driver executed when the file is run: pick one action through +test+.
#   :origami    - print the class of every page object read by Origami
#   :pdf_parser - parse the single document +doc+ and pretty-print the result
#   :parse      - parse every docs/*BP*.pdf file and pretty-print each result
#   :parse_raw  - dump the raw first-page content of every docs/*BP*.pdf file
#                 into a sibling "<file>.raw" file
doc = "docs/2017_12_BP_decembre.pdf"
# test = :pdf_parser
test = :parse
# test = :origami
case test
when :origami
  require 'origami'
  pdf = Origami::PDF.read doc
  # No leading `pp` here: a do...end block binds to the leftmost call, so
  # `pp pdf.pages.each do ... end` would hand the block to `pp` and never run it.
  pdf.pages.each do |p|
    pp p.class
  end
when :pdf_parser
  pp PaySheetPdfParser.parse(doc)
when :parse
  Dir.glob('docs/*BP*.pdf').sort.each do |f|
    pp PaySheetPdfParser.parse(f)
  end
when :parse_raw
  Dir.glob('docs/*BP*.pdf').sort.each do |f|
    IO.write(f + '.raw', PaySheetPdfParser.parse_raw(f).to_s)
  end
end