-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy patheva_data_analysis.py
148 lines (116 loc) · 4.04 KB
/
eva_data_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
"""
Copyright (c) 2024 Amber Walsh
MIT License
Details found in LICENSE
"""
import os
import matplotlib.pyplot as plt
import pandas as pd
import sys
import re
def read_json_to_dataframe(input_file):
    """
    Read a JSON file into a Pandas dataframe, then clean and order it.

    The 'date' column is parsed as dates and the 'eva' column is cast to
    float. Rows containing any missing value are dropped, and the rows
    that remain are sorted by 'date'.

    Args:
        input_file (str): the path to the JSON file.

    Returns:
        eva_df (pd.DataFrame): The cleaned and sorted data as a dataframe
    """
    raw = pd.read_json(input_file, convert_dates=['date'])
    raw['eva'] = raw['eva'].astype(float)
    # Drop incomplete rows first so the sort only sees rows that are kept
    complete_rows = raw.dropna(axis=0)
    return complete_rows.sort_values('date')
def write_dataframe_to_csv(df, output_file):
    """
    Write a dataframe to a CSV file, leaving out the index column.

    A message naming the destination is printed before the file is written.

    Args:
        df (pd.DataFrame): dataframe to be written out
        output_file (str): name of CSV file to be saved to
    """
    message = 'Saving to CSV file {}'.format(output_file)
    print(message)
    df.to_csv(output_file, index=False)
def plot_cumulative_time_in_space(df, graph_file):
    """
    Plot the running total of duration (in hours) against date.

    A duration_hours column is derived with add_duration_hours_variable and
    accumulated with a cumulative sum. The result is plotted against the
    'date' column as black circles joined by a line, saved to graph_file,
    and then displayed.

    Args:
        df (pd.DataFrame): data with 'date' and 'duration' columns
        graph_file (str): name of the file the graph is saved to
    """
    # Work on the dataframe returned by the helper, with durations in hours
    hours_df = add_duration_hours_variable(df)
    running_total = hours_df['duration_hours'].cumsum()
    hours_df['cumulative_time'] = running_total

    plt.plot(hours_df['date'], hours_df['cumulative_time'], 'ko-')
    plt.xlabel('Year')
    plt.ylabel('Total time spent in space to date (hours)')
    plt.tight_layout()

    # Save before showing the figure
    plt.savefig(graph_file)
    plt.show()
def text_to_duration(duration):
    """
    Turn an "HH:MM" text duration into a number of hours.

    Args:
        duration (str): The duration in HH:MM

    Returns:
        duration_hours (float): The duration in hours
    """
    hour_text, minute_text = duration.split(":")
    whole_hours = int(hour_text)
    # Minutes contribute a fraction of an hour
    fractional_hours = int(minute_text) / 60
    return whole_hours + fractional_hours
def add_duration_hours_variable(df):
    """
    Return a copy of the dataframe with a duration_hours column added.

    Each value in the 'duration' column is converted with text_to_duration.
    The input dataframe is left unmodified.

    Args:
        df (pd.DataFrame): The input dataframe

    Returns:
        df_copy (pd.DataFrame): A copy of df with the new duration_hours variable added
    """
    hours = df['duration'].apply(text_to_duration)
    result = df.copy()
    result['duration_hours'] = hours
    return result
def calculate_crew_size(crew):
    """
    Calculate the size of the crew for a single crew entry

    Crew member names are separated by semicolons. The size is the number
    of non-blank names, so the result is the same whether or not the entry
    ends with a trailing semicolon (e.g. "A;B;" and "A;B" both give 2).

    Args:
        crew (str): The text entry in the crew column containing a list of crew member names

    Returns:
        int | None: The crew size, or None when the entry is empty or
            contains only whitespace
    """
    # An empty / whitespace-only entry carries no crew information at all
    if not crew.strip():
        return None

    # Count actual names rather than delimiters: subtracting one from the
    # number of split pieces is only right when a trailing ';' is present.
    return sum(1 for name in crew.split(';') if name.strip())
def add_crew_size_column(df):
    """
    Return a copy of the dataframe with a crew_size column added.

    Each value in the 'crew' column is passed to calculate_crew_size and
    the results are stored as crew_size. A progress message is printed
    first. The input dataframe is left unmodified.

    Args:
        df (pd.DataFrame): The input data frame.

    Returns:
        df_copy (pd.DataFrame): A copy of df with the new crew_size variable added
    """
    print('Adding crew size variable (crew_size) to dataset')
    sizes = df["crew"].apply(calculate_crew_size)
    with_sizes = df.copy()
    with_sizes["crew_size"] = sizes
    return with_sizes
if __name__ == "__main__":
    # Script entry point: read the JSON data, add the crew size column,
    # write the result to CSV, and plot cumulative time in space.
    print("--START--")

    # exist_ok makes this a no-op when the results directory already exists
    os.makedirs('./results/', exist_ok=True)

    if len(sys.argv) < 3:
        # less than 3 command line arguments, assuming we use default file names
        # https://data.nasa.gov/resource/eva.json (with modifications)
        # Plain paths are used (not open file handles) so that pandas opens
        # and closes the files itself and both branches pass the same type.
        input_file = 'data/eva-data.json'
        output_file = 'results/eva-data.csv'  # JSON data output to this CSV file
    else:
        input_file = sys.argv[1]
        output_file = sys.argv[2]

    graph_file = 'results/cumulative_eva_graph.png'  # name that graph will be saved to

    # Read in and clean up data - including sorting by date
    eva_data = read_json_to_dataframe(input_file)
    eva_data = add_crew_size_column(eva_data)

    # convert and save data to CSV file
    write_dataframe_to_csv(eva_data, output_file)

    # plot cumulative time vs date and save output graph
    plot_cumulative_time_in_space(eva_data, graph_file)

    print("--END--")