# data_cleaning_script.py
# Cleans layoffs.csv (missing values, outliers, duplicates, encoding, scaling)
# and writes the result to cleaned_data.csv.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
# Loading the data into a DataFrame using pandas
dataset = pd.read_csv(r"layoffs.csv")

# Exploring the data
# Overall share of missing cells: total nulls / (rows * columns), as a percentage.
print("The percentage of null values overall:\n", (dataset.isnull().sum().sum() / (dataset.shape[0] * dataset.shape[1])) * 100)
print("View the first few rows\n", dataset.head())  # View the first few rows
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called on its own; passing it to print() would emit a stray "None".
print("Get information about data types and missing values")
dataset.info()
print("Get summary statistics\n", dataset.describe())  # Get summary statistics
# HANDLING MISSING VALUES
# Percentage of nulls per column; none are above 50% so we can choose to fill
# them using mean, mode or other methods instead of dropping columns.
print("Null Values per Column before handling them:\n", (dataset.isnull().sum() / dataset.shape[0]) * 100)

# Filling in categorical columns with their most frequent value (mode)
for categorical_column in ('industry', 'stage'):
    most_frequent = dataset[categorical_column].mode()[0]
    dataset[categorical_column] = dataset[categorical_column].fillna(most_frequent)

# Forward filling the dates: each missing date takes the previous row's date.
# Series.ffill() replaces fillna(method='ffill'), which is deprecated in
# recent pandas releases.
dataset['date'] = dataset['date'].ffill()

# Filling in numerical columns
# percentage_laid_off: treated as normally distributed with little chance of
# outliers, so the mean is used.
dataset['percentage_laid_off'] = dataset['percentage_laid_off'].fillna(dataset['percentage_laid_off'].mean())
# total_laid_off / funds_raised_millions: not normally distributed, so the
# median is used.
dataset['total_laid_off'] = dataset['total_laid_off'].fillna(dataset['total_laid_off'].median())
dataset['funds_raised_millions'] = dataset['funds_raised_millions'].fillna(dataset['funds_raised_millions'].median())

print("Null Values per Column After handling them:\n", (dataset.isnull().sum() / dataset.shape[0]) * 100)
# REPLACING AND HANDLING DATATYPES
# Parse the date column into pandas datetime values, then store it back.
parsed_dates = pd.to_datetime(dataset['date'])
dataset['date'] = parsed_dates
# OUTLIER DETECTION
# Total Laid Off - Before
# histplot(kde=True, stat="density") is the replacement for the deprecated
# sns.distplot: a density-normalised histogram with a KDE curve on top.
plt.subplot(1, 2, 1)
sns.histplot(dataset['total_laid_off'], kde=True, stat="density")
plt.title("Before")

# Remove outliers from total_laid_off using z_score: keep only rows whose
# value lies within 3 standard deviations of the column mean.
z_score = np.abs(stats.zscore(dataset['total_laid_off']))  # computing z_scores
# .copy() detaches the filtered frame so later column assignments operate on
# an independent DataFrame rather than on a slice of the original.
dataset = dataset[z_score < 3].copy()

# Total Laid Off - After
plt.subplot(1, 2, 2)
sns.histplot(dataset['total_laid_off'], kde=True, stat="density")
plt.title("After")
plt.show()
# Funds Raised Millions - Before
# histplot(kde=True, stat="density") is the replacement for the deprecated
# sns.distplot: a density-normalised histogram with a KDE curve on top.
plt.subplot(1, 2, 1)
sns.histplot(dataset['funds_raised_millions'], kde=True, stat="density")
plt.title("Before")

# IQR method for funds_raised_millions: values outside
# [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] are treated as outliers.
first_quartile = dataset['funds_raised_millions'].quantile(0.25)
third_quartile = dataset['funds_raised_millions'].quantile(0.75)
IQR = third_quartile - first_quartile
min_range = first_quartile - (1.5 * IQR)
max_range = third_quartile + (1.5 * IQR)
# Apply both fences (inclusive); previously min_range was computed but only
# the upper fence was enforced. .copy() detaches the result from the slice.
dataset = dataset[dataset['funds_raised_millions'].between(min_range, max_range)].copy()

# Funds Raised Millions - After
plt.subplot(1, 2, 2)
sns.histplot(dataset['funds_raised_millions'], kde=True, stat="density")
plt.title("After")
plt.show()
# HANDLING DUPLICATES
# Rebind to the de-duplicated frame instead of mutating in place: `dataset`
# was produced by boolean filtering above, and in-place edits on such a frame
# can trigger pandas' SettingWithCopyWarning.
dataset = dataset.drop_duplicates()
# ENCODING THE DATASET
# Initialize LabelEncoder
label_encoder = LabelEncoder()
# Apply Label Encoding to categorical columns: each column's values are
# replaced by integer codes.
categorical_columns = ['company', 'location', 'industry', 'country', 'stage']
for column in categorical_columns:
    # The same encoder is refit on every column, so after the loop it only
    # retains the class mapping of the last column in the list.
    dataset[column] = label_encoder.fit_transform(dataset[column])
# FEATURE SCALING
# One MinMaxScaler instance is shared and refit for each column in turn.
min_max_scaler = MinMaxScaler()

# For each numeric column: draw a boxplot of the raw values (left panel),
# rescale the column with the scaler, draw a boxplot of the scaled values
# (right panel), then show the figure.
for scaled_column in ('total_laid_off', 'funds_raised_millions'):
    plt.subplot(1, 2, 1)
    sns.boxplot(x=dataset[scaled_column])
    plt.title("Before")

    scaled_values = min_max_scaler.fit_transform(dataset[[scaled_column]])
    dataset[scaled_column] = scaled_values

    plt.subplot(1, 2, 2)
    sns.boxplot(x=dataset[scaled_column])
    plt.title("After")
    plt.show()
# Final summary of the cleaned data: descriptive statistics and (rows, columns).
print(dataset.describe())
print(dataset.shape)
# Write the CSV first and report success afterwards, so the message is never
# printed for a save that failed. index=False leaves the row index out of the file.
dataset.to_csv('cleaned_data.csv', index=False)
print("Cleaned data now saved to a file.")