gradient_boost_peaks.py
import fnmatch
import os
import os.path
import sys

import numpy as np
from astropy.table import Column, Table, join, vstack
from sklearn import ensemble
from sklearn import preprocessing as skpp
def load_all_in_dir(path):
    pattern = "stacked*-continuum-peaks.csv"
    tables_list = []
    exp_list = []
    lines_list = None

    for file in os.listdir(path):
        if fnmatch.fnmatch(file, pattern):
            data = Table.read(os.path.join(path, file), format="ascii.csv")

            # Exposure ID: third '-'-delimited token of the filename, with a
            # three-character prefix stripped off.
            exp = int(file.split("-")[2][3:])
            exp_col = Column([exp] * len(data), name="EXP_ID")
            data.add_column(exp_col)

            tables_list.append(data)
            exp_list.append(exp)

            # Record the line/peak identifiers once; assumed identical across files.
            if lines_list is None:
                lines_list = Table()
                lines_list.add_column(data['source'])
                lines_list.add_column(data['wavelength_target'])

    return vstack(tables_list), exp_list, lines_list
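# A hypothetical filename consistent with the glob pattern and parsing above (the
# real naming scheme may differ): "stacked-sky-exp117-continuum-peaks.csv" matches
# "stacked*-continuum-peaks.csv" and yields EXP_ID 117 from the "exp117" token.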
def load_observation_metadata(path, file="annotated_metadata.csv"):
    data = Table.read(os.path.join(path, file), format="ascii.csv")
    return data
def trim_observation_metadata(data, copy=False):
    if copy:
        data = data.copy()

    kept_columns = ['EXP_ID', 'RA', 'DEC', 'AZ', 'ALT', 'AIRMASS',
                    'LUNAR_MAGNITUDE', 'LUNAR_ELV', 'LUNAR_SEP', 'SOLAR_ELV',
                    'SOLAR_SEP', 'GALACTIC_CORE_SEP', 'GALACTIC_PLANE_SEP']
                    # 'EPHEM_DATE',
    removed_columns = [name for name in data.colnames if name not in kept_columns]
    data.remove_columns(removed_columns)

    return data
def get_X_and_y(data_subset, obs_metadata, use_con_flux=False):
    # X: (n_samples, n_features), y: (n_samples,).
    # Here X is the observation metadata for each exposure and y is the total
    # flux for a given line/peak: either 'total_flux' or 'total_con_flux'.
    y_col = 'total_con_flux' if use_con_flux else 'total_flux'
    full_table = join(obs_metadata, data_subset['EXP_ID', y_col])

    labels = full_table['EXP_ID']
    full_table.remove_column('EXP_ID')
    y = full_table[y_col]
    full_table.remove_column(y_col)

    return full_table, y, labels
def ndarrayidze(table):
    # Convert an astropy Table into a plain (n_rows, n_cols) float64 ndarray.
    # np.asarray copies only if needed; np.array(..., copy=False) raises under
    # NumPy >= 2.0 when a copy is required.
    temp_arr = np.asarray(table)  # structured array, one field per column
    length = len(temp_arr)
    width = 0
    if length > 0:
        width = len(temp_arr[0])
    return temp_arr.view('<f8').reshape((length, width))
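# Note: the '<f8' view assumes every column of the input Table is float64; columns
# of any other dtype would make the view fail or silently reinterpret bytes. A
# minimal sketch of the intended behaviour (hypothetical data, not part of the pipeline):
#
#     t = Table({'a': [1.0, 2.0], 'b': [3.0, 4.0]})
#     ndarrayidze(t)   # -> array([[1., 3.], [2., 4.]]), shape (2, 2)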
def get_feature_importances(data_table, obs_metadata, lines_table, use_con_flux=False):
    feature_importances_list = []
    X_colnames = None

    for line_name, line_wavelength in lines_table['source', 'wavelength_target']:
        subset = data_table[(data_table['source'] == line_name) &
                            (data_table['wavelength_target'] == line_wavelength)]
        X, y, labels = get_X_and_y(subset, obs_metadata, use_con_flux)
        if X_colnames is None:
            X_colnames = X.colnames

        # Older scikit-learn releases accepted min_samples_split=1 and loss='lad';
        # current releases require min_samples_split >= 2 and name the same
        # objective loss='absolute_error'.
        params = {'n_estimators': 500, 'max_depth': 4, 'min_samples_split': 2,
                  'learning_rate': 0.01, 'loss': 'absolute_error'}
        clf = ensemble.GradientBoostingRegressor(**params)

        X = ndarrayidze(X)

        # Scaling is optional, but apply it (for now) for all methods, so that
        # comparing the values here with e.g. ICA shows fewer differences.
        X = skpp.scale(X)
        y = skpp.scale(y)

        clf.fit(X, y)
        feature_importances_list.append(clf.feature_importances_)

    fi = np.array(feature_importances_list)
    fi_table = Table(fi, names=X_colnames)
    fi_table.add_column(lines_table['source'])
    fi_table.add_column(lines_table['wavelength_target'])

    return fi_table
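# The returned table has one row per (source, wavelength_target) line/peak and one
# column per metadata feature, holding that feature's importance from the fitted
# GradientBoostingRegressor, with the 'source' and 'wavelength_target' identifier
# columns appended at the end.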
def main():
    path = "."
    if len(sys.argv) == 2:
        path = sys.argv[1]

    data_table, exposure_list, lines_table = load_all_in_dir(path)
    obs_metadata = trim_observation_metadata(load_observation_metadata(path))

    fi_flux_table = get_feature_importances(data_table, obs_metadata, lines_table)
    fi_flux_table.write("gradient_boost_flux_fi.csv", format="ascii.csv")

    fi_con_flux_table = get_feature_importances(data_table, obs_metadata, lines_table,
                                                use_con_flux=True)
    fi_con_flux_table.write("gradient_boost_con_flux_fi.csv", format="ascii.csv")


if __name__ == '__main__':
    main()
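# Usage sketch (the data directory path is whatever holds the input CSVs):
#
#     python gradient_boost_peaks.py /path/to/peak/data
#
# The directory must contain the "stacked*-continuum-peaks.csv" files and
# "annotated_metadata.csv"; the two feature-importance tables are written to
# "gradient_boost_flux_fi.csv" and "gradient_boost_con_flux_fi.csv" in the
# current working directory.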