-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathrandom_forest_utilities.py
More file actions
111 lines (90 loc) · 4.4 KB
/
random_forest_utilities.py
File metadata and controls
111 lines (90 loc) · 4.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import numpy as np
import sklearn.ensemble
import matplotlib.pyplot as plt
import pandas as pd
from pointers import *
import pdb
def load_results(filename = location_hardness_models, sep = '\t', header = 40):
'''
This function loads sources from our CSC results tsv and outputs a Pandas data
structure to be used in the random forest training.
'''
Cat = pd.read_csv(filename, sep = sep, header = header)
# usable_sources = Cat[(np.abs(Cat['hard_hs']) < 0.995) & (np.abs(Cat['hard_hm']) < 0.995) & \
# (np.abs(Cat['hard_ms'])<0.995)]
usable_sources = Cat
return usable_sources
def sig_source_subset(catalog, min_sig, max_sig):
selection_index = np.logical_and(catalog['significance'] >= min_sig, \
catalog['significance'] < max_sig)
return catalog[selection_index].copy()
def source_subset(usable_sources, N = 6e3, seed = None):
source_total = float(len(usable_sources['hard_hs']))
crit_val = N/source_total
np.random.seed(seed)
selection_val = np.random.random(int(source_total))
selection_index = selection_val < crit_val
return usable_sources[selection_index].copy()
def feature_list():
feature_list = ['gal_l','gal_b','err_ellipse_r0','err_ellipse_r1','err_ellipse_ang',\
'significance','hard_hm','hard_hs','nh_gal','acis_num','acis_time','likelihood',\
'flux_powlaw','powlaw_gamma','powlaw_nh','powlaw_ampl','powlaw_stat',\
'flux_bb','bb_kt','bb_nh','bb_ampl','bb_stat','flux_brems','brems_kt','brems_nh',\
'brems_norm','brems_stat']
return feature_list
def extract_values(sources, features = feature_list()):
training_features = sources.loc[:,features].values
# The random forest tools in scikit-learn are in 32 bit, so we need to make
# sure there are no numbers that can't be represented in 32 bits.
np.clip(training_features, -9999, 2e6, out = training_features)
return training_features
def save_significance_bins():
catalog = load_results()
sig = np.copy(catalog['significance'])
bin_walls = np.array([3, 4.5, 6, 7.5, max(sig) + 1])
for i in np.arange(0,len(bin_walls)-1):
binmin = bin_walls[i]; binmax = bin_walls[i+1]
filename = 'significance_bin__' + str(binmin) + '--' + str(binmax)
sig_bin = catalog[np.logical_and(sig >= binmin, sig < binmax)]
sig_bin.to_csv(path_or_buf = results_path + filename + '.csv')
def clean_data(data):
'''
Function that cleans the input Pandas data frame. The process looks for features
that have missing values (represented by -9999), creates a new feature named
missing_[feature name] consisting of 0 (False) or 1 (True), then replaces the
missing values (-9999) with a 0.
Input:
data: A Pandas dataframe with missing values represented by -9999
Output:
cleaned_data: A copy of the input data that has been cleaned using the
described process
cleaned_feature_list: The updated feature list including the new features
'''
# Checking to make sure the input data hasn't already been cleaned. Without this
# step, cleaning cleaned data would erase the missing_feature-name features
current_features = data.columns.to_numpy()
if any('missing' in header for header in current_features):
return data, current_features
cleaned_data = data.loc[:, feature_list()].copy()
cleaned_feature_list = feature_list()
headers = cleaned_data.columns.to_numpy()
model_list = ['flux_powlaw','powlaw_gamma','powlaw_nh','powlaw_ampl','powlaw_stat',\
'flux_bb','bb_kt','bb_nh','bb_ampl','bb_stat','flux_brems','brems_kt','brems_nh',\
'brems_norm','brems_stat']
model_checked = False
for header in headers:
if sum(cleaned_data[header] == -9999) > 0:
if header in model_list:
if model_checked is False:
missing_feature = 'missing_models'
model_checked = True
else:
continue
else:
missing_feature = 'missing_' + header
cleaned_data[missing_feature] = pd.Series([0 for x in range(len(cleaned_data.index))], \
index=cleaned_data.index)
cleaned_data.at[cleaned_data[header] == -9999, missing_feature] = 1
cleaned_feature_list.append(missing_feature)
cleaned_data[cleaned_data == -9999] = 0
return cleaned_data, cleaned_feature_list