-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsplit_csv.py
86 lines (66 loc) · 2.86 KB
/
split_csv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# This file splits the GLBRT_processed data into smaller files named county_name_year.csv inside a 'split_files' directory.
# Entries with no county name are written to "No County_year.csv".
# Pass the relative path of the processed data as the only command-line argument.
import csv
import os
import sys
# Directory (relative to the current working directory) that receives the
# per-county/year files.
OUTPUT_DIR = 'split_files'

# Every header and data row written out is truncated to this many columns.
# A data row with MORE columns than this gets its land-use field re-joined
# (see _group_rows).
EXPECTED_COLUMNS = 23

# Label used in place of a blank county name.
NO_COUNTY_LABEL = "No County"

# Cleaned (spaces removed, lower-cased) header names the script depends on.
REQUIRED_COLUMNS = ('land_use', 'county_name', 'year')


def _filename_safe(text):
    """Return *text* with path separators replaced by '_'.

    Keeps a value such as ``N/A`` from being interpreted as a sub-directory
    when it becomes part of an output file name.
    """
    for separator in (os.sep, os.altsep):
        if separator:
            text = text.replace(separator, '_')
    return text


def _count_data_rows(input_file):
    """Return the number of non-blank CSV records after the header row.

    Records are counted with ``csv.reader`` rather than by physical line so
    that a quoted field containing a newline counts as one row.
    """
    # newline='' lets the csv module handle line endings itself.
    with open(input_file, 'r', newline='') as infile:
        reader = csv.reader(infile)
        next(reader, None)  # skip the header; tolerate an empty file
        return sum(1 for row in reader if row)


def _group_rows(input_file):
    """Read *input_file* and group its rows by (county, year).

    Returns:
        dict mapping ``(county, year)`` to a list of rows whose first entry
        is the cleaned header row; both key parts are file-name safe.

    Raises:
        ValueError: the file has no header row or lacks a required column.
        IndexError: a non-blank row is too short to hold county or year.
    """
    groups = {}
    with open(input_file, 'r', newline='') as infile:
        reader = csv.reader(infile)
        original_headers = next(reader, None)
        if original_headers is None:
            raise ValueError(f"{input_file} is empty: no header row found")

        clean_headers = [h.replace(' ', '').lower() for h in original_headers]
        print("Original headers:", original_headers)
        print("Cleaned headers:", clean_headers)

        missing = [name for name in REQUIRED_COLUMNS if name not in clean_headers]
        if missing:
            raise ValueError(f"Missing required column(s): {', '.join(missing)}")

        land_use_index = clean_headers.index('land_use')
        county_index = clean_headers.index('county_name')
        year_index = clean_headers.index('year')
        header_row = clean_headers[:EXPECTED_COLUMNS]

        for row in reader:
            if not row:
                # csv.reader yields [] for a blank line; it carries no data.
                continue

            if len(row) > EXPECTED_COLUMNS:
                # Fold the cell after land_use back into land_use, with the
                # trailing piece placed first.
                # NOTE(review): presumably the land-use value contained an
                # unquoted comma; confirm against the upstream export.
                row[land_use_index] = f"{row[land_use_index + 1]} {row[land_use_index]}"
                row = row[:land_use_index + 1] + row[land_use_index + 2:]

            county = row[county_index]
            if not county.strip():
                county = NO_COUNTY_LABEL

            key = (_filename_safe(county), _filename_safe(row[year_index]))
            if key not in groups:
                groups[key] = [list(header_row)]
            groups[key].append(row[:EXPECTED_COLUMNS])
    return groups


def _write_groups(groups, output_dir):
    """Write one ``<county>_<year>.csv`` per group; return (filename, rows) pairs."""
    file_counts = []
    for (county, year), rows in groups.items():
        rows_in_file = len(rows) - 1  # first entry is the header
        filename = f"{county}_{year}.csv"
        output_file = os.path.join(output_dir, filename)
        with open(output_file, 'w', newline='') as outfile:
            csv.writer(outfile).writerows(rows)
        file_counts.append((filename, rows_in_file))
        print(f"Created {output_file} with {rows_in_file} rows")
    return file_counts


def split_csv(input_file, output_dir=OUTPUT_DIR):
    """Split *input_file* into per-county/year CSV files under *output_dir*.

    Args:
        input_file: path of the CSV file to split.
        output_dir: directory for the output files; created if absent.

    Returns:
        ``(total_input_rows, file_counts)`` where ``file_counts`` is a list
        of ``(filename, data_row_count)`` tuples in creation order.

    Raises:
        ValueError: the input has no header row or lacks a required column.
    """
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir, exist_ok=True)
        print(f"Created '{output_dir}' directory")

    print("Counting total input rows")
    total_input_rows = _count_data_rows(input_file)
    print(f"Total input rows (excluding header): {total_input_rows}")

    print("\nReading and splitting file")
    groups = _group_rows(input_file)

    print("\nCreating county/year files")
    file_counts = _write_groups(groups, output_dir)
    return total_input_rows, file_counts


def main(argv=None):
    """Command-line entry point; returns the process exit status.

    Args:
        argv: argument list including the program name; defaults to
            ``sys.argv``.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) != 2:
        print('Usage: python split_csv.py giant-file.csv')
        return 1  # non-zero so callers can detect the misuse

    total_input_rows, file_counts = split_csv(argv[1])
    total_output_rows = sum(count for _, count in file_counts)

    print("\nRow Count Summary:")
    print(f"Total input rows: {total_input_rows}")
    print(f"Total output rows: {total_output_rows}")
    if total_input_rows == total_output_rows:
        print("Counts match for the rows")
    else:
        print(f"! Row count mismatch: Difference of {abs(total_input_rows - total_output_rows)} rows")

    print(f"\nCreated {len(file_counts)} files in '{OUTPUT_DIR}' directory")
    print("\nDetailed file counts (sorted by row count):")
    for filename, count in sorted(file_counts, key=lambda x: x[1], reverse=True):
        print(f"{filename}: {count} rows")
    return 0


if __name__ == '__main__':
    sys.exit(main())