-
Notifications
You must be signed in to change notification settings - Fork 62
/
Copy pathdata_processing.py
68 lines (52 loc) · 2.4 KB
/
data_processing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import comet_ml
import pandas as pd
from io import StringIO
import os
def get_raw_data(workspace_name: str, artifact_name: str) -> pd.DataFrame:
    """
    Return the raw credit scoring dataset as a pandas DataFrame.

    The function asks the running Comet experiment for the ``<artifact_name>_raw`` artifact and downloads it into the
    current directory. If that fails for any reason, the local ``./credit_scoring_dataset.csv`` file is logged as a
    new ``dataset`` artifact instead. In both cases the data is then read from ``./credit_scoring_dataset.csv``.

    Args:
        workspace_name: Comet workspace name. Kept for interface compatibility; this function does not use it.
        artifact_name: Base name of the artifact; the suffix ``_raw`` is appended to it.

    Returns:
        The contents of ``./credit_scoring_dataset.csv``.

    Raises:
        RuntimeError: If there is no running Comet experiment.
        FileNotFoundError: If the artifact could not be downloaded and the local CSV file does not exist either.
    """
    raw_csv_path = "./credit_scoring_dataset.csv"
    raw_artifact_name = f"{artifact_name}_raw"

    exp = comet_ml.get_running_experiment()
    if exp is None:
        # Without this check the AttributeError on `None` would be reported as a download error below and then
        # raised again, unexplained, from the fallback path.
        raise RuntimeError("No running Comet experiment found; create a comet_ml.Experiment before calling get_raw_data().")

    try:
        artifact = exp.get_artifact(artifact_name=raw_artifact_name)
        # Download the artifact
        artifact.download(path="./")
    except Exception as e:
        # Deliberately best-effort: any failure to fetch the artifact falls back to publishing the local copy.
        print(f"Error downloading artifact: {e}")
        if not os.path.isfile(raw_csv_path):
            # Fail before logging an artifact that has nothing to upload.
            raise FileNotFoundError(
                f"Could not download artifact '{raw_artifact_name}' and local file '{raw_csv_path}' does not exist."
            ) from e
        artifact = comet_ml.Artifact(name=raw_artifact_name, artifact_type="dataset")
        artifact.add(raw_csv_path)
        exp.log_artifact(artifact)

    # NOTE(review): assumes the downloaded artifact contains a file named credit_scoring_dataset.csv - confirm.
    return pd.read_csv(raw_csv_path)
def preprocess_data(df: pd.DataFrame, artifact_name: "str | None" = None) -> pd.DataFrame:
    """
    Preprocess the raw data to make it ready for the model and log the result as a new Comet Artifact.

    The relevant columns are selected, the target column ``probdefault`` is renamed to ``probability_default`` and
    the categorical columns are converted to the pandas ``category`` dtype. The result is serialised to CSV in
    memory and logged as the ``<artifact_name>_preprocessed`` dataset artifact under the logical path
    ``preprocessed_data.csv``.

    Args:
        df: Raw data; it must contain all of the selected columns. The input frame is not modified.
        artifact_name: Base name of the artifact; the suffix ``_preprocessed`` is appended. When omitted, the
            module-level ``artifact_name`` is used if it is defined (it is set when this file runs as a script),
            otherwise the ``COMET_PROJECT_NAME`` environment variable.

    Returns:
        The preprocessed DataFrame.

    Raises:
        KeyError: If a selected column is missing from ``df``, or if no artifact name was given and neither the
            module-level name nor ``COMET_PROJECT_NAME`` is available.
        RuntimeError: If there is no running Comet experiment.
    """
    if artifact_name is None:
        # Earlier versions read a module-level global that only exists when the file runs as a script, so importing
        # the module and calling this function raised NameError. Keep honouring that global for existing callers.
        artifact_name = globals().get("artifact_name")
    if artifact_name is None:
        artifact_name = os.environ["COMET_PROJECT_NAME"]

    exp = comet_ml.get_running_experiment()
    if exp is None:
        raise RuntimeError("No running Comet experiment found; create a comet_ml.Experiment before calling preprocess_data().")

    # Select the relevant columns; the explicit copy keeps the later transformations away from the caller's frame
    selected_columns = ['CustAge', 'CustIncome', 'EmpStatus', 'UtilRate', 'OtherCC', 'ResStatus', 'TmAtAddress',
                        'TmWBank', 'probdefault']
    df = df.loc[:, selected_columns].copy()

    # Rename the target column
    df = df.rename(columns={'probdefault': 'probability_default'})

    # Convert the categorical columns to category type
    categorical_columns = ['EmpStatus', 'OtherCC', 'ResStatus']
    df = df.astype({c: 'category' for c in categorical_columns})

    # Save the preprocessed data to a new Comet Artifact
    csv_buffer = StringIO()
    df.to_csv(csv_buffer, index=False)
    csv_buffer.seek(0)  # rewind so the artifact reads the CSV from the start

    artifact = comet_ml.Artifact(name=f"{artifact_name}_preprocessed", artifact_type="dataset")
    artifact.add(local_path_or_data=csv_buffer, logical_path="preprocessed_data.csv")
    exp.log_artifact(artifact)

    return df
if __name__ == "__main__":
    # Comet settings are read from the environment; a missing variable raises KeyError.
    workspace_name = os.environ["COMET_WORKSPACE"]
    project_name = os.environ["COMET_PROJECT_NAME"]
    # The artifact base name reuses the project name (same environment variable).
    # Keep this name at module level: preprocess_data is called without an artifact name and looks it up from here.
    artifact_name = project_name

    # Create the experiment first; both steps below fetch it through comet_ml.get_running_experiment().
    exp = comet_ml.Experiment(project_name=project_name, workspace=workspace_name)

    df = get_raw_data(workspace_name=workspace_name, artifact_name=artifact_name)
    processed_df = preprocess_data(df)

    print("Data preprocessing complete.")