-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcleancsv.py
107 lines (91 loc) · 3.11 KB
/
cleancsv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
from __future__ import print_function
import os
import glob
from datetime import datetime
from datatoolbox import logg_info, makedir
def rm_temp_files():
    ''' Delete every file matching ``tmp/vcf*.csv``.
    '''
    pattern = "tmp/vcf*.csv"
    stale_paths = glob.glob(pattern)
    for stale in stale_paths:
        os.remove(stale)
# The presence of 'tmp/historical' requests a re-clean of the historical
# CSVs; previously generated chunks are deleted first.
clean_hist = os.path.exists('tmp/historical')
if clean_hist:
    rm_temp_files()

# The presence of 'tmp/year' requests a re-clean of the current-year CSVs;
# previously generated chunks are deleted first.
clean_year = os.path.exists('tmp/year')
if clean_year:
    rm_temp_files()

# Calendar year at import time; used to build the raw CSV file names.
year = datetime.now().year
def histcsv_files(year):
    ''' Return the historical CSV file names for funds A-E.

    Each name has the form ``vcf<fund><first year>-<year - 1>.csv``,
    where the first year is fixed per fund.
    '''
    until = year - 1
    # (fund letter, first year covered by that fund's historical file)
    first_year_by_fund = (
        ('A', 2002),
        ('B', 2002),
        ('C', 1981),
        ('D', 2002),
        ('E', 2000),
    )
    template = 'vcf%s%s-%s.csv'
    return [template % (fund, since, until)
            for fund, since in first_year_by_fund]
def yearcsv_files(year):
    ''' Return the single-year CSV file names for funds A-E.

    Each name has the form ``vcf<fund><year>-<year>.csv``.
    '''
    template = 'vcf%s%s-%s.csv'
    names = []
    for fund in 'ABCDE':
        names.append(template % (fund, year, year))
    return names
# Raw CSV file names (historical range and current year) computed once at
# import time from the module-level `year`.
histcsv, yearcsv = histcsv_files(year), yearcsv_files(year)
def fillheader(linea):
    ''' Build a header row usable as a pandas multi-level index.

    The line is split on ';'. Every even-positioned cell from index 2
    onward is overwritten with the cell to its left, so each name is
    repeated. The newline is removed from the last cell, and a copy of
    that cell terminated by a newline is appended.

    Returns the cells joined back with ';'.
    '''
    cells = linea.split(';')
    # Odd positions are never rewritten, so reading the left neighbour from
    # the original list is equivalent to updating in place.
    filled = [
        cells[pos - 1] if pos >= 2 and pos % 2 == 0 else cell
        for pos, cell in enumerate(cells)
    ]
    # Strip the newline from the final cell, then duplicate that cell as
    # the new line terminator.
    last = filled.pop().replace('\n', '')
    filled.extend([last, last + '\n'])
    return ';'.join(filled)
def cleancsv(filecsv):
    ''' Split a raw CSV from ``rawdata/`` into one valid CSV per chunk.

    The raw file holds several chunks of data separated by blank lines.
    The line right after each blank line is that chunk's first header
    row; it is passed through ``fillheader``. The remaining lines up to
    the next blank line (or the end of the file for the last chunk) are
    copied unchanged. Anything before the first blank line is not
    written. Chunk ``j`` goes to ``tmp/<name>_<j><ext>``.

    filecsv -- file name (not a path) of the raw CSV inside ``rawdata/``.
    '''
    import sys

    # splitext copes with names that contain no dot or several dots, where
    # unpacking filecsv.split('.') would raise ValueError.
    file_name, file_ext = os.path.splitext(filecsv)

    # The 'U' flag was removed from open() in Python 3.11. Python 3 text
    # mode already translates newlines, so 'rU' is only needed on Python 2.
    read_mode = 'r' if sys.version_info[0] >= 3 else 'rU'
    with open('rawdata/' + filecsv, read_mode) as f_in:
        lines = f_in.readlines()

    # Indexes of the blank lines that separate the chunks. Chunks cover
    # different AFPs, so their headers are not homogeneous.
    blanks = [i for (i, s) in enumerate(lines) if s == "\n"]
    # Each chunk ends at the next blank line; the last one runs to EOF.
    ends = blanks[1:] + [len(lines)]

    for j, (blank, end) in enumerate(zip(blanks, ends)):
        header = blank + 1
        if header >= len(lines):
            # Trailing blank line at the end of the file: there is no
            # header after it, so there is no chunk to write.
            continue
        # The context manager closes the output even if a write fails.
        with open('tmp/%s_%s%s' % (file_name, j, file_ext), 'w') as f_out:
            # First header row, with the empty cells filled in.
            f_out.write(fillheader(lines[header]))
            # Second header row and data, up to the end of the chunk.
            f_out.writelines(lines[header + 1:end])
def main():
    ''' Clean the raw CSVs requested by the marker files under ``tmp/``.

    For each enabled job (historical and/or current year) every raw CSV is
    split with ``cleancsv``, an OK entry is logged, and the job's marker
    file is removed.
    '''
    logg_info('cleancsv', tipo='INFO', status='INIT')
    # Create the temporary folder
    makedir('tmp')

    # (enabled flag, raw CSV names, log label, marker file to remove)
    jobs = (
        (clean_hist, histcsv, 'cleancsv historical', 'tmp/historical'),
        (clean_year, yearcsv, 'cleancsv year', 'tmp/year'),
    )
    for enabled, csv_names, label, marker in jobs:
        if not enabled:
            continue
        for csv_name in csv_names:
            cleancsv(csv_name)
        # Job finished without errors
        logg_info(label, tipo='INFO', status='OK')
        # Remove the marker that requested this job
        os.remove(marker)

    logg_info('cleancsv', tipo='INFO', status='DONE')
# Run the cleaning job only when this file is executed as a script.
if __name__ == "__main__":
    main()