-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathclean_dataset_test.py
More file actions
151 lines (118 loc) · 6.91 KB
/
clean_dataset_test.py
File metadata and controls
151 lines (118 loc) · 6.91 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
from clean_dataset import *
import numpy as np
import pandas as pd
import pdb
import unittest
class Test_get_numeric_outliers(unittest.TestCase):
    """Compare the report from ``get_numeric_outliers`` with expected dicts.

    Every case wraps a small integer array in a ``pd.Series`` and asserts that
    the returned mapping of value -> reason string equals the expected one.
    """

    def test_basic(self):
        """Expects only the value 1 to be reported, with the 'Lower' reason."""
        series = pd.Series(
            np.array([10, 10, 11, 12, 13, 10, 11, 11, 14, 14, 1, 15, 19, 20, 17, 17])
        )
        self.assertEqual(
            get_numeric_outliers(series),
            {1: "Lower abs dist 3 sigma from mean-clean-avg-dist"},
        )

    def test_max(self):
        """Expects only the value 30 to be reported, with the 'Upper' reason."""
        series = pd.Series(
            np.array([10, 10, 11, 12, 13, 10, 11, 11, 14, 14, 15, 19, 20, 17, 17, 30])
        )
        self.assertEqual(
            get_numeric_outliers(series),
            {30: "Upper abs dist 3 sigma from mean-clean-avg-dist"},
        )

    def test_double(self):
        """Expects both 1 ('Lower') and 30 ('Upper') to be reported together."""
        series = pd.Series(
            np.array([10, 10, 11, 12, 13, 10, 1, 11, 11, 14, 14, 15, 19, 20, 17, 17, 30])
        )
        both_ends = {
            30: "Upper abs dist 3 sigma from mean-clean-avg-dist",
            1: "Lower abs dist 3 sigma from mean-clean-avg-dist",
        }
        self.assertEqual(get_numeric_outliers(series), both_ends)

    # Disabled NaN case, kept for reference.
    # NOTE(review): the expected dict below lists 30, which does not occur in
    # this data set; it looks copied from test_double -- confirm the intended
    # expectation before re-enabling.
    # def test_nan(self) :
    #     data = np.array([10,10,11,12,13,10,1,11,11,14,14,15,19,20,17,17,np.NaN])
    #     testcase = pd.Series( data )
    #     expected = { 30 : "Upper abs dist 3 sigma from mean-clean-avg-dist",
    #                  1 : "Lower abs dist 3 sigma from mean-clean-avg-dist"}
    #     self.assertEqual( get_numeric_outliers(testcase), expected )
class Test_numeric_if_cleaned(unittest.TestCase):
    """Exercise ``is_numeric_if_cleaned`` on string-typed series."""

    def test_if_cleaned(self):
        """Mostly number-like strings (plus '$100' and '?') should yield True."""
        raw = np.array(['$100', '1.1', '2', '3', '4', '4', '4', '?'])
        column = pd.Series(raw)
        self.assertEqual(is_numeric_if_cleaned(column), True)

    def test_if_cl_strs(self):
        """Mostly spelled-out number words should yield False."""
        raw = np.array(
            ['$100', 'one', 'two', 'one', 'two', 'three', 'four', 'four', '4']
        )
        column = pd.Series(raw)
        self.assertEqual(is_numeric_if_cleaned(column), False)
class Test_get_string_outliers(unittest.TestCase):
    """Exercise ``get_string_outliers`` on a column of the cars data set."""

    def setUp(self):
        """Load the cars CSV and keep its 'num-of-doors' column as the fixture."""
        # https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data
        # The file is read with explicit column names supplied here.
        column_names = [
            "symboling", "normalized-losses", "make", "fuel-type",
            "aspiration", "num-of-doors", "body-style", "drive-wheels",
            "engine-location", "wheel-base", "length", "width", "height",
            "curb-weight", "engine-type", "num-of-cylinders", "engine-size",
            "fuel-system", "bore", "stroke", "compression-ratio",
            "horsepower", "peak-rpm", "city-mpg", "highway-mpg", "price",
        ]
        cars = pd.read_csv("DATA/imports-85.data", names=column_names)
        # self.tc_short = df['engine-type']
        self.tc_short = cars['num-of-doors']

    def test_very_short(self):
        """Expects only the '?' entry to be flagged, as suspiciously short."""
        self.assertEqual(
            get_string_outliers(self.tc_short),
            {"?": "Suspiciously low string length"},
        )
class Test_find_numeric_columns(unittest.TestCase):
    """Exercise ``find_numeric_columns`` on the cars and WNBA data frames."""

    def setUp(self):
        """Load both CSV fixtures: cars into ``df1`` and WNBA stats into ``df2``."""
        # https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data
        column_names = [
            "symboling", "normalized-losses", "make", "fuel-type",
            "aspiration", "num-of-doors", "body-style", "drive-wheels",
            "engine-location", "wheel-base", "length", "width", "height",
            "curb-weight", "engine-type", "num-of-cylinders", "engine-size",
            "fuel-system", "bore", "stroke", "compression-ratio",
            "horsepower", "peak-rpm", "city-mpg", "highway-mpg", "price",
        ]
        self.df1 = pd.read_csv("DATA/imports-85.data", names=column_names)
        # https://www.kaggle.com/jinxbe/wnba-player-stats-2017
        self.df2 = pd.read_csv("DATA/WNBA Stats.csv")

    def test_wnba_experience(self):
        """Expects exactly ['Experience'] for the WNBA frame."""
        self.assertEqual(find_numeric_columns(self.df2), ['Experience'])

    def test_cars(self):
        """Expects the six listed column names, in this order, for the cars frame."""
        wanted = [
            'normalized-losses', 'bore', 'stroke',
            'horsepower', 'peak-rpm', 'price',
        ]
        self.assertEqual(find_numeric_columns(self.df1), wanted)
class Test_get_ID_col(unittest.TestCase):
    """Exercise ``get_ID_col`` on the WNBA and cars data frames."""

    def setUp(self):
        """Load both CSV fixtures: WNBA stats into ``df`` and cars into ``df1``."""
        # https://www.kaggle.com/jinxbe/wnba-player-stats-2017
        self.df = pd.read_csv("DATA/WNBA Stats.csv")
        # https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data
        column_names = [
            "symboling", "normalized-losses", "make", "fuel-type",
            "aspiration", "num-of-doors", "body-style", "drive-wheels",
            "engine-location", "wheel-base", "length", "width", "height",
            "curb-weight", "engine-type", "num-of-cylinders", "engine-size",
            "fuel-system", "bore", "stroke", "compression-ratio",
            "horsepower", "peak-rpm", "city-mpg", "highway-mpg", "price",
        ]
        self.df1 = pd.read_csv("DATA/imports-85.data", names=column_names)

    def test_wnba_name(self):
        """Expects 'Name' to be returned for the WNBA frame."""
        self.assertEqual(get_ID_col(self.df), 'Name')

    def test_cars(self):
        """Expects 'make' to be returned for the cars frame."""
        self.assertEqual(get_ID_col(self.df1), 'make')
class Test_clean_num_col(unittest.TestCase):
    """Check how many missing values ``clean_numeric_col`` leaves in a column."""

    def setUp(self):
        """Load both CSV fixtures: WNBA stats into ``df`` and cars into ``df1``."""
        # https://www.kaggle.com/jinxbe/wnba-player-stats-2017
        self.df = pd.read_csv("DATA/WNBA Stats.csv")
        # https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data
        column_names = [
            "symboling", "normalized-losses", "make", "fuel-type",
            "aspiration", "num-of-doors", "body-style", "drive-wheels",
            "engine-location", "wheel-base", "length", "width", "height",
            "curb-weight", "engine-type", "num-of-cylinders", "engine-size",
            "fuel-system", "bore", "stroke", "compression-ratio",
            "horsepower", "peak-rpm", "city-mpg", "highway-mpg", "price",
        ]
        self.df1 = pd.read_csv("DATA/imports-85.data", names=column_names)

    def test_cars(self):
        """Expects 41 NaN entries after cleaning the 'normalized-losses' column."""
        cleaned = clean_numeric_col(self.df1['normalized-losses'])
        nan_count = cleaned.isna().sum()
        self.assertEqual(nan_count, 41)  # number of NaN in the series

    def test_wnba_experience(self):
        """Expects no NaN entries after cleaning the 'Experience' column."""
        cleaned = clean_numeric_col(self.df['Experience'])
        nan_count = cleaned.isna().sum()
        self.assertEqual(nan_count, 0)
class Test_rank_in_col(unittest.TestCase):
    """Check the rank ``rank_in_col`` reports for named WNBA players.

    Every case ranks by the 'Experience' column and looks the player up
    through the 'Name' column.
    """

    def setUp(self):
        """Load the WNBA stats CSV into ``df``."""
        # https://www.kaggle.com/jinxbe/wnba-player-stats-2017
        self.df = pd.read_csv("DATA/WNBA Stats.csv")

    def test_wnba_exp1(self):
        """Expects rank 1 for 'Plenette Pierson'."""
        rank = rank_in_col(self.df, 'Experience', 'Name', 'Plenette Pierson')
        self.assertEqual(rank, 1)

    def test_wnba_exp2(self):
        """Expects rank 6 for 'Diana Taurasi'."""
        rank = rank_in_col(self.df, 'Experience', 'Name', 'Diana Taurasi')
        self.assertEqual(rank, 6)

    def test_wnba_saniya(self):
        """Expects rank 121 for 'Saniya Chong'."""
        rank = rank_in_col(self.df, 'Experience', 'Name', 'Saniya Chong')
        self.assertEqual(rank, 121)

    def test_wnba_reverse(self):
        """Expects rank 121 for 'Saniya Chong' when ``reverse=True`` is passed."""
        # NOTE(review): this is the same expected rank as test_wnba_saniya
        # (without reverse) -- confirm that 121 is really intended here.
        rank = rank_in_col(
            self.df, 'Experience', 'Name', 'Saniya Chong', reverse=True
        )
        self.assertEqual(rank, 121)
# Run the suite only when this file is executed as a script, so that importing
# the module (for example from a test runner doing discovery) neither executes
# the tests nor exits the interpreter via unittest.main().
# The unconditional pdb.set_trace() breakpoint that used to precede this call
# was debugging residue that stopped execution at a (Pdb) prompt and breaks
# non-interactive runs; it has been removed.
if __name__ == "__main__":
    unittest.main()