diff --git a/nmma/em/injection.py b/nmma/em/injection.py index e6c61fc5b..06b6e81dd 100644 --- a/nmma/em/injection.py +++ b/nmma/em/injection.py @@ -326,68 +326,23 @@ def create_light_curve_data( sim.to_csv(args.outdir + "/too.csv", index=False) if rubin_ToO: - print("Using rubin observing strategy.") start = tmin + tc - if args.rubin_ToO_type == "platinum": - # platinum means 90% GW skymap <30 sq deg - # I made this name up, this is the gold strategy for an event similar to GW170817 (close and well localized) - # Three observations Night0 with grizy filters - # One scan Night 1,2,3 w/ same filters + if args.rubin_ToO_type == "BNS": strategy = [ - [1 / 24.0, ["ps1__g", "ps1__r", "ps1__i", "ps1__z", "ps1__y"]], - [2 / 24.0, ["ps1__g", "ps1__r", "ps1__i", "ps1__z", "ps1__y"]], - [4 / 24.0, ["ps1__g", "ps1__r", "ps1__i", "ps1__z", "ps1__y"]], - [1.0, ["ps1__g", "ps1__r", "ps1__i", "ps1__z", "ps1__y"]], - [2.0, ["ps1__g", "ps1__r", "ps1__i", "ps1__z", "ps1__y"]], - [3.0, ["ps1__g", "ps1__r", "ps1__i", "ps1__z", "ps1__y"]], + [1 / 24.0, ["sdssu", "ps1__g", "ps1__r", "ps1__i", "ps1__z", "ps1__y"]], + [2 / 24.0, ["sdssu", "ps1__g", "ps1__r", "ps1__i", "ps1__z", "ps1__y"]], + [4 / 24.0, ["sdssu", "ps1__g", "ps1__r", "ps1__i", "ps1__z", "ps1__y"]], + [1.0, ["ps1__g", "ps1__z", "ps1__i"]], ] - elif args.rubin_ToO_type == "gold": - # gold means 90% GW skymap <100 sq deg - # Three pointings Night 0 with gri (possibly grz if more sensitive to KNe) - # One scan Night 1,2,3 w/ r+i + elif args.rubin_ToO_type == "NSBH": strategy = [ - [1 / 24.0, ["ps1__g", "ps1__r", "ps1__i"]], - [2 / 24.0, ["ps1__g", "ps1__r", "ps1__i"]], - [4 / 24.0, ["ps1__g", "ps1__r", "ps1__i"]], - [1.0, ["ps1__r", "ps1__i"]], - [2.0, ["ps1__r", "ps1__i"]], - [3.0, ["ps1__r", "ps1__i"]], - ] - elif args.rubin_ToO_type == "gold_z": - # gold means 90% GW skymap <100 sq deg - # Three pointings Night 0 with gri (possibly grz if more sensitive to KNe) - # One scan Night 1,2,3 w/ r+i - strategy = [ - [1 / 24.0, ["ps1__g", "ps1__r", "ps1__z"]], - [2 / 24.0, ["ps1__g", "ps1__r", "ps1__z"]], - [4 / 24.0, ["ps1__g", "ps1__r", "ps1__z"]], - [1.0, ["ps1__r", "ps1__i"]], - [2.0, ["ps1__r", "ps1__i"]], - [3.0, ["ps1__r", "ps1__i"]], - ] - elif args.rubin_ToO_type == "silver": - # silver means 90% GW skymap <500 sq deg - # One scan Night 0 w/ g+i or g+z - # One scan each Night 1,2,3 w/ same filters - strategy = [ - [1 / 24.0, ["ps1__g", "ps1__i"]], - [1.0, ["ps1__g", "ps1__i"]], - [2.0, ["ps1__g", "ps1__i"]], - [3.0, ["ps1__g", "ps1__i"]], - ] - elif args.rubin_ToO_type == "silver_z": - # silver means 90% GW skymap <500 sq deg - # One scan Night 0 w/ g+i or g+z - # One scan each Night 1,2,3 w/ same filters - strategy = [ - [1 / 24.0, ["ps1__g", "ps1__z"]], - [1.0, ["ps1__g", "ps1__z"]], - [2.0, ["ps1__g", "ps1__z"]], - [3.0, ["ps1__g", "ps1__z"]], + [1 / 24.0, ["sdssu", "ps1__g", "ps1__r", "ps1__i", "ps1__z", "ps1__y"]], + [4 / 24.0, ["sdssu", "ps1__g", "ps1__r", "ps1__i", "ps1__z", "ps1__y"]], + [1.0, ["sdssu", "ps1__g", "ps1__r", "ps1__i", "ps1__z", "ps1__y"]], + [2.0, ["ps1__g", "ps1__z", "ps1__i"]], ] else: - raise ValueError("args.rubin_ToO_type should be either platinum, gold, or silver") - # took type names from Rubin 2024 Workshop write-up + raise ValueError("args.rubin_ToO_type should be either BNS or NSBH") mjds, passbands = [], [] sim = pd.DataFrame() @@ -414,7 +369,6 @@ def create_light_curve_data( assume_sorted=True, ) times = group["mjd"].tolist() - #print("The times of observation are: ", times) data_per_filt = np.vstack([times, lc(times), lcerr(times)]).T data[filt] = data_per_filt passbands_to_keep.append(filt) diff --git a/nmma/mlmodel/__init__.py b/nmma/mlmodel/__init__.py new file mode 100644 index 000000000..8b1378917 --- /dev/null +++ b/nmma/mlmodel/__init__.py @@ -0,0 +1 @@ + diff --git a/nmma/mlmodel/dataprocessing.py b/nmma/mlmodel/dataprocessing.py index 947165187..d91bbd11b 100644 --- a/nmma/mlmodel/dataprocessing.py +++ b/nmma/mlmodel/dataprocessing.py @@ -4,410 +4,518 @@ import json import warnings from tqdm import tqdm -import nflows.utils as torchutils -from IPython.display import clear_output -from time import time -from time import sleep import torch from torch.utils.data import Dataset, DataLoader, TensorDataset, random_split from os.path import exists -from .resnet import ResNet device = torch.device("cuda" if torch.cuda.is_available() else "cpu") -bands = ['ztfg', 'ztfr', 'ztfi'] -detection_limit = 22.0 -num_repeats = 50 -num_channels = 3 -num_points = 121 - -t_zero = 44242.00021937881 -t_min = 44240.00050450478 -t_max = 44269.99958898723 -days = int(round(t_max - t_min)) -time_step = 0.25 - -def open_json(file_name, dir_path): +def open_json( + file_name, dir_path +): ''' - Opens a json file, loads the data as a dictionary, and closes the file + Opens a json file, loads data as a dictionary, and closes the file Inputs: - file_name = /name of json file.json - dir_path = directory containing json files - Returns: - data = dictionary containing json content + file_name: /name of json file.json + dir_path: directory containing json files + Outputs: + data: dictionary containing json content ''' f = open(dir_path + file_name) data = json.load(f) f.close() return data -def get_names(path, label, set, num): +def get_names( + path, label, set, num +): ''' Gets the file path for the fixed data Inputs: - path = string, directory to point to - label = string, label assigned during nmma light curve generation - set = int, number in directory name - num = int, number of files to unpack - Returns: - file_names = list, contains full path file names + path: string, directory to point to + label: string, label assigned during nmma light curve generation + set: int, number in directory name + num: int, number of files to unpack + Outputs: + file_names: list, contains full path file names ''' file_names = [0] * num for i in range(0, num): - one_name = path + '/{}_batch_{}/{}_{}_{}.json'.format(label, set, label, set, i) + one_name = path + '/{}_batch_{}/{}_{}_{}.json'.format( + label, set, label, set, i + ) file_names[i] = one_name return file_names -def json_to_df(file_names, num_sims, detection_limit=detection_limit, bands=bands): +def json_to_df( + file_name, dir_path, detection_limits, bands, +): ''' - Flattens json files into a dataframe + Flattens a light curve json file into a DataFrame Inputs: - file_names = list, contains full path file names as strings - num_sims = int, number of files to unpack - detection_limit = float, photometric detection limit - bands = list, contains the json photometry keys as strings - Returns: - df_list = list of dataframes containing the photometry data, time, and number of total detections across all bands + file_name: light curve json file name + detection_limits: list, photometric detection limits per band + bands: list, contains the json photometry keys as strings + Outputs: + df_unpacked: DataFrame containing the photometry data, time, + and number of total detections across all bands ''' - df_list = [0] * num_sims - for i in tqdm(range(num_sims)): - data = json.load(open(file_names[i], "r")) - df = pd.DataFrame.from_dict(data, orient="columns") - df_unpacked = pd.DataFrame(columns=bands) - counter = 0 - for j in range(len(bands)): - df_unpacked[['t', bands[j], 'x']] = pd.DataFrame(df[bands[j]].tolist(), index= df.index) - for val in df_unpacked[bands[j]]: - if val != detection_limit: - counter += 1 - else: - pass - df_unpacked['num_detections'] = np.full(len(df_unpacked), counter) - df_unpacked['sim_id'] = np.full(len(df_unpacked), i) - df_unpacked = df_unpacked.drop(columns=['x']) - df_list[i] = df_unpacked - return df_list + data = open_json(file_name, dir_path) + df = pd.DataFrame.from_dict(data, orient="columns") + df_unpacked = pd.DataFrame(columns=bands) + counter = 0 + for j in range(len(bands)): + df_unpacked[['t', bands[j], 'x']] = pd.DataFrame( + df[bands[j]].tolist(), index= df.index + ) + for val in df_unpacked[bands[j]]: + if val != detection_limits[j]: + counter += 1 + else: + pass + df_unpacked['num_detections'] = np.full(len(df_unpacked), counter) + df_unpacked = df_unpacked.drop(columns=['x']) + return df_unpacked -def gen_prepend_filler(column_list, detection_limit, t_min, t_max, step = 0.25): +def extract_number( + file_name +): ''' - front end padding + Gets the number in a file name Inputs: - column_list: list of columns of the dataframe - detection_limit: number that is used as the filler, ie the detection limit - t_min: minimum time - t_max: maximum time - step: time increment + file_name: string, name of file that has a number Outputs: - filler_df: dataframe to pad the existing data + Number in file name, or inf if none ''' - # Fill according to step size, regardless of count - pre = np.arange(start=t_min, stop=t_max, step=step) - prefill_dict = {} - for col in column_list: - if col == 't': - prefill_dict['t'] = pre - else: - prefill_dict[col] = [detection_limit]*len(pre) - prefill_df = pd.DataFrame(prefill_dict) - return prefill_df + try: + return int("".join(filter(str.isdigit, file_name))) + except ValueError: + return float("inf") -def gen_append_filler(column_list, detection_limit, t_min, count, step=0.25): +def directory_json_to_df( + dir_path, label, detection_limits, bands +): ''' - back end padding + Takes a directory of light curves and converts them to DataFrames Inputs: - column_list: list of columns of the dataframe - detection_limit: number that is used as the filler, ie the detection limit - t_min: minimum time - t_max: maximum time - step: time increment + dir_path: directory containing light curve files + label: string, label used when generating the light curve data + detection_limits: list, photometric detection limit per band + bands: list, contains the json photometry keys as strings Outputs: - filler_df: dataframe to pad the existing data + df_list: list, contains all files as DataFrames ''' - # Fill according to min value and specified count - aft = np.arange(start=t_min, stop=t_min+(count*step), step=step) - aftfill_dict = {} - for col in column_list: - if col == 't': - aftfill_dict['t'] = aft - else: - aftfill_dict[col] = [detection_limit]*len(aft) - aftfill_df = pd.DataFrame(aftfill_dict) - return aftfill_df + df_list = [] + for file in sorted(os.listdir(dir_path), key=extract_number): + if file.endswith(".json") and file.startswith(label): + df = json_to_df( + file, dir_path, detection_limits, bands + ) + df['simulation_id'] = extract_number(file) + df_list.append(df) + return df_list -def pad_the_data(actual_df, column_list, desired_count=num_points, filler_time_step=time_step, filler_data=detection_limit): +def pad_the_data(df, t_min, t_max, step, data_fillers, bands): ''' - pads both ends of the light curve dataframe to ensure consistent number of data points across all data + Takes DataFrames and adds filler values to both ends, preserves + original time information. Inputs: - actual_df: existing light curve data - desired_count: desired length of data, ie number of points for the light curve - filler_time_step: time increment - filler_data: number that is used as the filler, ie the detection limit + df: DataFrame with 't' and photometric columns + t_min: float, global minimum start time + t_max: float, global maximum end time + step: float, time step between rows + data_fillers: list, to use in the filler rows + bands: list of photometric columns to fill + (e.g. ['ztfg', 'ztfr', 'ztfi']) Outputs: - cat_df: padded dataframe + df_padded: DataFrame with original data and padded rows, + covering full time range ''' - actual_df.iloc[:, actual_df.columns.get_loc('t')] = actual_df.iloc[:, actual_df.columns.get_loc('t')].apply(lambda x: x - t_min) - cat_df = actual_df - cat_count = len(cat_df) - prepended_count = 0 - if (actual_df['t'].min() >= filler_time_step): - filler_max_time = actual_df['t'].min() - filler_time_step # stop one time step before current min - prepend_filler_df = gen_prepend_filler(column_list, filler_data, 0, filler_max_time, filler_time_step) - prepended_count = len(prepend_filler_df) - cat_df = pd.concat([prepend_filler_df, actual_df], ignore_index=True) - cat_count = len(cat_df) - append_count = desired_count - cat_count - if append_count > 0: - max_t = cat_df['t'].max() - steps_per_count = 1/filler_time_step - filler_min_time = int(max_t*steps_per_count)/steps_per_count + filler_time_step # start at next time step - append_filler_df = gen_append_filler(column_list, filler_data, filler_min_time, append_count) - cat_df = pd.concat([cat_df, append_filler_df], ignore_index=True) - cat_count = len(cat_df) - assert(len(cat_df) == desired_count) - return cat_df + df = df.copy() + df = df.sort_values('t').reset_index(drop=True) + full_time = np.arange(t_min, t_max, step) + num_points = len(full_time) + t_start = df['t'].min() + t_end = df['t'].max() + prepend_times = full_time[full_time < t_start] + append_times = full_time[full_time > t_end] + + def make_filler(times): + return pd.DataFrame({ + 't': times, + **{band: [val] * len(times) for band, val in zip(bands, data_fillers)}, + 'simulation_id': np.nan, + 'num_detections': np.nan + }) + + prepend_df = make_filler(prepend_times) + append_df = make_filler(append_times) + df_padded = pd.concat([prepend_df, df, append_df], ignore_index=True) + df_padded = df_padded.sort_values('t').reset_index(drop=True) + assert np.isclose(df_padded['t'].min(), t_min), \ + f"Start time is {df_padded['t'].min()}, expected {t_min}" + assert np.isclose(df_padded['t'].max(), t_max - step), \ + f"End time is {df_padded['t'].max()}, expected {t_max - step}" + try: + assert len(df_padded) == num_points+1, \ + f"Lengthb is {len(df_padded)}, expected {num_points+1}" + except AssertionError as e: + count = num_points + 1 - len(df_padded) + addt_times = np.arange(t_max, t_max+(step*count), step) + addt_append = make_filler(addt_times) + df_padded = pd.concat([df_padded, addt_append], ignore_index=True) + df_padded = df_padded.sort_values('t').reset_index(drop=True) + assert len(df_padded) == num_points+1, \ + f"Length is {len(df_padded)}, expected {num_points+1}" + return df_padded -def pad_all_dfs(df_list): +def pad_all_dfs( + df_list, t_min, t_max, step, data_fillers, bands +): ''' - Pads multiple dataframes at a time + Pads multiple DataFrames at a time Inputs: - df_list: list of dataframes to pad + df_list: list of DataFrames to pad Outputs: - padded_df_list: list of dataframes after padding + padded_df_list: list of DataFrames after padding ''' padded_df_list = [] for i in tqdm(range(len(df_list))): df = df_list[i] - sim_num = df.iloc[0, df.columns.get_loc('sim_id')] + sim_num = df.iloc[0, df.columns.get_loc('simulation_id')] det_num = df.iloc[0, df.columns.get_loc('num_detections')] - df = pad_the_data(df) - df['sim_id'] = np.full(len(df), sim_num) + df = pad_the_data( + df, t_min, t_max, step, data_fillers, bands + ) + df['simulation_id'] = np.full(len(df), sim_num) df['num_detections'] = np.full(len(df), det_num) padded_df_list.append(df) return padded_df_list -def load_in_data(data_dir, name, csv_no, num_points=num_points, num_repeats=num_repeats): +def find_min_max_t( + df_list +): ''' - Loading in multiple saved csv files containing light curve data as one dataframe + Finds the maximum and minimum time of all provided DataFrames Inputs: - data_dir: directory containing the csv files - csv_no: number of csv files to load in - num_points: number of data points per light curve - num_repeats: repeats of injection parameters to determine batches + df_list: list of DataFrames Outputs: - data_df: single dataframe containing the data + t_min: float, minimum time across all DataFrames + t_max: float, maximum time across all DataFrames ''' - data_list = [] - for i in range (0, csv_no): - data_list.append(pd.read_csv(data_dir + '{}_{}.csv'.format(name, i))) - data_df = pd.concat(data_list) - num_sims = int(len(data_df)/num_points) - sim_list = [] - sim_no = 0 - for i in range(0, num_sims): - for j in range(0, num_points): - sim_list.append(sim_no) - sim_no += 1 - data_df['sim_id'] = sim_list - batch_list = [] - batch_no = 0 - num_batches = int((len(data_df)/num_points)/num_repeats) - data_df = data_df.iloc[0:(num_batches*num_points*num_repeats), :].copy() - for i in range(0, num_batches): - for j in range(0, num_points*num_repeats): - batch_list.append(batch_no) - batch_no += 1 - data_df['batch_id'] = batch_list - return data_df - -def match_fix_to_var(data_dir, name1, name2, start, stop, num_points=num_points, num_repeats=num_repeats): + t_mins = [] + t_maxs = [] + for i in range(len(df_list)): + t_mins.append(df_list[i]['t'].min()) + t_maxs.append(df_list[i]['t'].max()) + t_min = min(t_mins) + t_max = max(t_maxs) + return t_min, t_max + +def grab_injection( + inj_file, dir_path +): ''' - Matches the shifted injection light curve data to its fixed counterpart + Reads in the injection file Inputs: - data_dir: directory containing the data - name1: label of the varied csv files - name2: label of the fixed csv files - start: starting csv number - stop: ending csv number - num_points: number of points in the light curve - num_repeats: number of repeated mass, velocity, lanthanide injections + inj_file: string, injection file name + dir_path: string, directory path Outputs: - fixed_data_df: returns the fixed portion of the light curve data - varied_data_df: returns the shifted/varied portion of the light curve data + inj_df: DataFrame containing the injection parameters ''' - # initiate list for dataframes - fixed_list = [] - varied_list = [] - # do all data processing for a given number of dataframes - for i in range (start, stop): - # load in the data - df_var = pd.read_csv(data_dir + '{}_{}.csv'.format(name1, i)) - df_fix = pd.read_csv(data_dir + '{}_{}.csv'.format(name2, i)) - # match the two dataframes to each other based on sim id - matched = df_var.merge(df_fix, left_on=['sim_id', df_var.groupby('sim_id').cumcount()], - right_on=['sim_id', df_fix.groupby('sim_id').cumcount()]) - # grab the fixed and varied portions of the dataframe - fix_df = matched.iloc[:, 12:] - var_df = matched.iloc[:, :12] - # adjust columns and column names - fix_df.columns = fix_df.columns.str.rstrip('_y') - var_df.columns = var_df.columns.str.rstrip('_x') - var_df = var_df.drop(columns=['key_1']) - # add to list of dataframes - fixed_list.append(fix_df) - varied_list.append(var_df) - # concatenate the list of dataframes together - fixed_data_df = pd.concat(fixed_list) - varied_data_df = pd.concat(varied_list) - # overwrite the simulation id's and add batch id's - num_sims = int(len(fixed_data_df)/num_points) - sim_list = [] - sim_no = 0 - for i in range(0, num_sims): - for j in range(0, num_points): - sim_list.append(sim_no) - sim_no += 1 - fixed_data_df['sim_id'] = sim_list - varied_data_df['sim_id'] = sim_list - batch_list = [] - batch_no = 0 - num_batches = int((len(fixed_data_df)/num_points)/num_repeats) - fixed_data_df = fixed_data_df.iloc[0:(num_batches*num_points*num_repeats), :].copy() - varied_data_df = varied_data_df.iloc[0:(num_batches*num_points*num_repeats), :].copy() - for i in range(0, num_batches): - for j in range(0, num_points*num_repeats): - batch_list.append(batch_no) - batch_no += 1 - fixed_data_df['batch_id'] = batch_list - varied_data_df['batch_id'] = batch_list - - return fixed_data_df, varied_data_df + data = open_json(inj_file, dir_path) + content = data['injections']['content'] + inj_df = pd.DataFrame.from_dict(content) + return inj_df -def matched(data_dir, name1, name2, start, stop, num_points=num_points, num_repeats=num_repeats): +def load_light_curves_df( + dir_path, + inj_file, + label, + detection_limits, + bands, + step, + data_fillers, + num_repeats=1, + add_batch_id=False, +): ''' - Matches light curves with the same injection parameters + Converts NMMA generated light curves to a DataFrame Inputs: - data_dir: file path for data - name1: label of the varied csv files - name2: label of the fixed csv files - start: starting csv number - stop: ending csv number - num_points: number of points in the light curve - num_repeats: number of repeated mass, velocity, lanthanide injections + dir_path: string, directory path + inj_file: string, injection file name + label: string, label assigned during nmma light curve generation + detection_limits: list, photometric detection limit per band + bands: list of photometric columns to fill + (e.g. ['ztfg', 'ztfr', 'ztfi']) + step: float, time step between rows + data_fillers: list, value to use in the filler rows per band + num_repeats: int, number of repeated injections + add_batch_id: bool, adds batching based on repeats Outputs: - matched_df: combined dataframe of the shifted and fixed light curves + lc_df: DataFrame, contains light curve and injection data ''' - # initiate list for dataframes - matched_list = [] - # do all data processing for a given number of dataframes - for i in range (start, stop): - # load in the data - df_var = pd.read_csv(data_dir + '{}_{}.csv'.format(name1, i)) - df_fix = pd.read_csv(data_dir + '{}_{}.csv'.format(name2, i)) - # match the two dataframes to each other based on sim id - matched = df_var.merge(df_fix, left_on=['sim_id', df_var.groupby('sim_id').cumcount()], - right_on=['sim_id', df_fix.groupby('sim_id').cumcount()]) - matched_list.append(matched) - matched_df = pd.concat(matched_list) - return matched_df + df_list = directory_json_to_df( + dir_path=dir_path, + label=label, + detection_limits=detection_limits, + bands=bands) + t_min, t_max = find_min_max_t(df_list) + num_points = len(np.arange(t_min, t_max, step)) + 1 + padded_list = pad_all_dfs( + df_list, + t_min=t_min, + t_max=t_max, + step=step, + data_fillers=data_fillers, + bands=bands) + all_padded_lcs = pd.concat(padded_list).reset_index(drop=True) + inj_df = grab_injection(inj_file=inj_file, dir_path=dir_path) + lc_df = all_padded_lcs.merge(inj_df, on='simulation_id') + if num_repeats <= 0: + print('Warning: num_repeats must be at least 1 (for one lc!).' + + 'Defaulting to 1.') + num_repeats = 1 + if add_batch_id == True: + lc_df['batch_id'] = lc_df.index // (num_points * num_repeats) + return lc_df -def add_batch_sim_nums_all(df, num_points=num_points, num_repeats=num_repeats): +def df_to_tensor( + lc_df, params, bands, num_repeats, num_points +): ''' - Adds a simulation and batch id number to each light curve + Converts DataFrames into pytorch tensors Inputs: - df: dataframe containing light curve data - num_points: number of points in the light curve - num_repeats: number of repeated mass, velocity, lanthanide injections + lc_df: DataFrame, contains data and injection parameters + params: list of injection parameters + bands: list of photometric columns to fill + (e.g. ['ztfg', 'ztfr', 'ztfi']) + num_repeats: int, number of repeated injections + num_points: int, length of each light curve Outputs: - None + tensor_data: list of tensors containing lc data + tensor_params: list of tensors containing lc params ''' - num_batches_split = int((len(df)/num_points)/num_repeats) - batch_list_split = [] - batch_no = 0 - for i in range(0, num_batches_split): - for j in range(0, num_repeats*num_points): - batch_list_split.append(batch_no) - batch_no += 1 - df['batch_id'] = batch_list_split + num_channels = len(bands) + num_batches = len(lc_df['batch_id'].unique()) + tensor_data = [] + tensor_params = [] + for idx in tqdm(range(0, num_batches)): + lc_data = torch.tensor( + lc_df[bands].loc[lc_df['batch_id'] == idx].values.reshape( + num_repeats, num_points, num_channels + ), + dtype=torch.float32 + ).transpose(1, 2) + lc_params = torch.tensor( + lc_df[params].loc[lc_df['batch_id'] == idx].iloc[::num_points].values, + dtype=torch.float32 + ).unsqueeze(2).transpose(1,2) + tensor_data.append(lc_data) + tensor_params.append(lc_params) + return tensor_data, tensor_params - num_sims_split = int(len(df)/num_points) - sim_list_split = [] - sim_no = 0 - for i in range(0, num_sims_split): - for j in range(0, num_points): - sim_list_split.append(sim_no) - sim_no += 1 - df['sim_id'] = sim_list_split +def load_embedding_dataset( + dir_path_var, + inj_file_var, + label_var, + dir_path_fix, + inj_file_fix, + label_fix, + detection_limits, + bands, + step, + data_fillers, + params, + num_repeats=1, +): + ''' + Loads NMMA generated lc's into tensors suitable for training + Inputs: + dir_path_fix: string, directory path for fixed lcs + dir_path_var: string, directory path for varied lcs + inj_file_fix: string, injection file name for fixed lcs + inj_file_var: string, injection file name for varied lcs + label_fix: string, label assigned to fixed lcs + label_var: string, label assigned to varied lcs + detection_limits: list, photometric detection limit per band + bands: list of photometric columns to fill + (e.g. ['ztfg', 'ztfr', 'ztfi']) + step: float, time step between rows + data_fillers: list, values to use for data padding per band + params: list of injection parameters + num_repeats: int, number of repeated injections + Outputs: + lc_data_fix: tensor containing fixed lc data + lc_params_fix: tensor containing fixed lc params + lc_data_var: tensor containing varied lc data + lc_params_var: tensor containing varied lc params + ''' + if num_repeats <= 0: + print('Warning: num_repeats must be at least 1 (for one lc!).' + + 'Defaulting to 1.') + num_repeats = 1 + + df_list_var = directory_json_to_df( + dir_path=dir_path_var, + label=label_var, + detection_limits=detection_limits, + bands=bands) + t_min, t_max = find_min_max_t(df_list_var) + num_points = len(np.arange(t_min, t_max, step)) + 1 + padded_list_var = pad_all_dfs( + df_list_var, + t_min=t_min, + t_max=t_max, + step=step, + data_fillers=data_fillers, + bands=bands) + all_padded_lcs_var = pd.concat(padded_list_var).reset_index(drop=True) + inj_df_var = grab_injection(inj_file=inj_file_var, dir_path=dir_path_var) + lc_df_var = all_padded_lcs_var.merge(inj_df_var, on='simulation_id') + lc_df_var['batch_id'] = lc_df_var.index // (num_points * num_repeats) + lc_data_var, lc_params_var = df_to_tensor( + lc_df=lc_df_var, + params=params, + bands=bands, + num_repeats=num_repeats, + num_points=num_points + ) + lc_data_var = torch.stack(lc_data_var, dim=0) + lc_params_var = torch.stack(lc_params_var, dim=0) -def get_test_names(path, label, set, num): - ''' - Gets the file path for the fixed data + if dir_path_fix and inj_file_fix and label_fix: + df_list_fix = directory_json_to_df( + dir_path=dir_path_fix, + label=label_fix, + detection_limits=detection_limits, + bands=bands) + padded_list_fix = pad_all_dfs( + df_list_fix, + t_min=t_min, + t_max=t_max, + step=step, + data_fillers=data_fillers, + bands=bands) + all_padded_lcs_fix = pd.concat(padded_list_fix).reset_index(drop=True) + inj_df_fix = grab_injection(inj_file=inj_file_fix, dir_path=dir_path_fix) + lc_df_fix = all_padded_lcs_fix.merge(inj_df_fix, on='simulation_id') + lc_df_fix['batch_id'] = lc_df_fix.index // (num_points * num_repeats) + lc_data_fix, lc_params_fix = df_to_tensor( + lc_df=lc_df_fix, + params=params, + bands=bands, + num_repeats=num_repeats, + num_points=num_points + ) + lc_data_fix = torch.stack(lc_data_fix, dim=0) + lc_params_fix = torch.stack(lc_params_fix, dim=0) + else: + lc_data_fix, lc_params_fix = None, None + + return lc_data_var, lc_params_var, lc_data_fix, lc_params_fix + +def min_max_params(lc_params): + ''' + Gets the minimum and maximum value of all parameters in a tensor Inputs: - path = string, directory to point to - label = string, label assigned during nmma light curve generation - set = int, number in directory name - num = int, number of files to unpack - Returns: - list, contains full path file names + lc_params: tensor, shape [batch, repeats, 1, num_params] + Outputs: + param_mins: tensor, shape [num_params], minimum values + param_maxs: tensor, shape [num_params], maximum values ''' - file_names = [0] * num - for i in range(0, num): - one_name = path + '/{}{}_{}.json'.format(label, set, i) - file_names[i] = one_name - return file_names + flat_params = lc_params.reshape(-1, lc_params.shape[-1]) + param_mins = flat_params.min(dim=0).values + param_maxs = flat_params.max(dim=0).values + return param_mins, param_maxs + +def normalize_params(lc_params, param_mins, param_maxs): + ''' + Applies min-max normalization to lc_params using provided per-param min/max + Inputs: + lc_params: tensor, shape [batch, repeats, 1, num_param] + param_mins: tensor, shape [num_params], minimum values + param_maxs: tensor, shape [num_params], maximum values + Returns: + lc_params_normed: tensor, shape [batch, repeats, 1, num_params] + ''' + param_range = param_maxs - param_mins + param_range = torch.where( + param_range == 0, + torch.ones_like(param_range), + param_range + ) + lc_params_normed = (lc_params - param_mins) / param_range + return lc_params_normed -def repeated_df_to_tensor(df_varied, df_fixed, batches): +def mean_std_lc(lc_data): ''' - Converts dataframes into pytorch tensors + Gets the minimum and maximum value of all parameters in a tensor Inputs: - df_varied: dataframe containing the shifted light curve information - df_fixed: dataframe containing the analagous fixed light curve information - batches: number of unique mass, velocity, and lanthanide injections + lc_data: tensor, shape [batch, repeats, channels, num_points] Outputs: - data_shifted_list: list of tensors of shape [repeats, channels, num_points] containing the shifted light curve photometry - data_unshifted_list: list of tensors of shape [repeats, channels, num_points] containing the fixed light curve photometry - param_shifted_list: list of tensors of shape [repeats, 1, 5] containing the injection parameters of the shifted light curves - param_unshifted_list: list of tensors of shape [repeats, 1, 5] containing the injection parameters of the fixed light curves + param_mins: tensor, shape [num_params], minimum values + param_maxs: tensor, shape [num_params], maximum values ''' - data_shifted_list = [] - data_unshifted_list = [] - param_shifted_list = [] - param_unshifted_list = [] - for idx in tqdm(range(0, batches)): - data_shifted = torch.tensor(df_varied.loc[df_varied['batch_id'] == idx].iloc[:, 1:4].values.reshape(num_repeats, num_points, num_channels), - dtype=torch.float32).transpose(1, 2) - data_unshifted = torch.tensor(df_fixed.loc[df_fixed['batch_id'] == idx].iloc[:, 1:4].values.reshape(num_repeats, num_points, num_channels), - dtype=torch.float32).transpose(1, 2) - param_shifted = torch.tensor(df_varied.loc[df_varied['batch_id'] == idx].iloc[::num_points, 6:11].values, - dtype=torch.float32).unsqueeze(2).transpose(1,2) - param_unshifted = torch.tensor(df_fixed.loc[df_fixed['batch_id'] == idx].iloc[::num_points, 5:10].values, - dtype=torch.float32).unsqueeze(2).transpose(1,2) - data_shifted_list.append(data_shifted) - data_unshifted_list.append(data_unshifted) - param_shifted_list.append(param_shifted) - param_unshifted_list.append(param_unshifted) - return data_shifted_list, data_unshifted_list, param_shifted_list, param_unshifted_list + flat_params = lc_params.reshape(-1, lc_params.shape[-1]) + param_mins = flat_params.min(dim=0).values + param_maxs = flat_params.max(dim=0).values + return param_mins, param_maxs -class Paper_data(Dataset): - def __init__(self, data_shifted_paper, data_unshifted_paper, - param_shifted_paper, param_unshifted_paper, - num_batches_paper_sample): +def global_mean_std_lc(lc_data, detection_limits, data_fillers): + ''' + Computes a single global mean and std across all bands and time points + Inputs: + lc_data: tensor, shape [batch, repeats, channels, num_points] + detection_limits: list, photometric detection limits per band + data_fillers: list, values used for padding the data + Outputs: + global_mean: tensor, [mean] of the entire set of light curves + global_std: tensor, [std] of the entire set of light curves + ''' + bands = lc_data.shape[2] + mask = torch.ones_like(lc_data, dtype=bool) + + for b in range(bands): + mask[:, :, b, :] &= (lc_data[:, :, b, :] != detection_limits[b]) + mask[:, :, b, :] &= (lc_data[:, :, b, :] != data_fillers[b]) + + valid_vals = lc_data[mask] + global_mean = valid_vals.mean() + global_std = valid_vals.std(unbiased=False) if valid_vals.numel() > 1 else 1.0 + + return global_mean, global_std + +class Embedding_Data(Dataset): + def __init__( + self, + lc_data_var, + lc_params_var, + lc_data_fix, + lc_params_fix, + detection_limits, + data_fillers, + ): super().__init__() - self.data_shifted_paper = data_shifted_paper - self.data_unshifted_paper = data_unshifted_paper - self.param_shifted_paper = param_shifted_paper - self.param_unshifted_paper = param_unshifted_paper - self.num_batches_paper_sample = num_batches_paper_sample + self.lc_data_var = lc_data_var + self.lc_params_var = lc_params_var + self.lc_data_fix = lc_data_fix + self.lc_params_fix = lc_params_fix + + param_mins, param_maxs = min_max_params(self.lc_params_var) + self.lc_params_var = normalize_params(lc_params_var, param_mins, param_maxs) + self.lc_params_fix = normalize_params(lc_params_fix, param_mins, param_maxs) + global_mean, global_std = global_mean_std_lc(lc_data_var, detection_limits, data_fillers) + self.lc_data_var = self.lc_data_var.sub_(global_mean).div_(global_std) + self.lc_data_fix = self.lc_data_fix.sub_(global_mean).div_(global_std) def __len__(self): - return self.num_batches_paper_sample + return len(self.lc_data_var) def __getitem__(self, idx): - if torch.is_tensor(idx): - idx = idx.tolist() - return ( - self.param_shifted_paper[idx].to(device), - self.param_unshifted_paper[idx].to(device), - self.data_shifted_paper[idx].to(device), - self.data_unshifted_paper[idx].to(device) + self.lc_data_var[idx], + self.lc_params_var[idx], + self.lc_data_fix[idx], + self.lc_params_fix[idx] ) diff --git a/nmma/mlmodel/embedding.py b/nmma/mlmodel/embedding.py index 98fbafe7c..c09eabc13 100644 --- a/nmma/mlmodel/embedding.py +++ b/nmma/mlmodel/embedding.py @@ -4,16 +4,10 @@ from torch.utils.data import Dataset, DataLoader, TensorDataset, random_split import torch.nn.functional as F from torch.utils.tensorboard import SummaryWriter -from .resnet import ResNet +from resnet import ResNet device = torch.device("cuda" if torch.cuda.is_available() else "cpu") -bands = ['ztfg', 'ztfr', 'ztfi'] -detection_limit = 22.0 -num_repeats = 50 -num_channels = 3 -num_points = 121 - class VICRegLoss(nn.Module): ''' Variance-Invariance-Covariance Regularization Loss Function @@ -58,7 +52,8 @@ def __init__(self, activation=F.relu, dropout_probability=0.1, use_batch_norm=True, - zero_initialization=True): + zero_initialization=True + ): ''' Inputs: channels: number of separate dimensions, here it is photometric bands @@ -104,7 +99,7 @@ def __init__(self, activation=F.relu, dropout_probability=0.1, use_batch_norm=True, - ): + ): ''' Inputs: in_channels: starting number of channels, ie number of photometric bands @@ -148,21 +143,28 @@ def __init__(self, num_blocks=4, kernel_size=5, num_dim_final=10, + expander_dim=150, activation=torch.tanh, - num_channels=num_channels, - num_points=num_points - ): + num_channels=3, + num_points=121, + ): + self.expander_dim = expander_dim super(SimilarityEmbedding, self).__init__() self.layer_norm = nn.LayerNorm([num_channels, num_points]) self.num_hidden_layers_f = num_hidden_layers_f self.num_hidden_layers_h = num_hidden_layers_h - self.layers_f = ResNet(num_ifos=[3,None], layers=[2,2], kernel_size=kernel_size, context_dim=100) + self.layers_f = ResNet( + num_ifos=[3,None], + layers=[2,2], + kernel_size=kernel_size, + context_dim=100 + ) self.contraction_layer = nn.Linear(in_features=100, out_features=num_dim) - # self.layers_f = ConvResidualNet(in_channels=num_channels, out_channels=1, hidden_channels=20, num_blocks=num_blocks, kernel_size=kernel_size) - # self.contraction_layer = nn.Linear(in_features=in_features, out_features=num_dim) - self.expander_layer = nn.Linear(num_dim, 20) - self.layers_h = nn.ModuleList([nn.Linear(20, 20) for _ in range(num_hidden_layers_h)]) - self.final_layer = nn.Linear(20, num_dim_final) + self.expander_layer = nn.Linear(num_dim, self.expander_dim) + self.layers_h = nn.ModuleList( + [nn.Linear(self.expander_dim, self.expander_dim) for _ in range(num_hidden_layers_h)] + ) + self.final_layer = nn.Linear(self.expander_dim, num_dim_final) self.activation = activation def forward(self, x): @@ -185,7 +187,7 @@ def train_one_epoch_se(epoch_index, verbose, vicreg_loss, **vicreg_kwargs, - ): +): ''' Training function Inputs: @@ -240,7 +242,8 @@ def val_one_epoch_se(epoch_index, data_loader, similarity_embedding, vicreg_loss, - **vicreg_kwargs): + **vicreg_kwargs +): ''' Validation training function Inputs: diff --git a/nmma/mlmodel/lightcurves_to_tensors.py b/nmma/mlmodel/lightcurves_to_tensors.py new file mode 100644 index 000000000..438e7cf69 --- /dev/null +++ b/nmma/mlmodel/lightcurves_to_tensors.py @@ -0,0 +1,135 @@ +import numpy as np +import pandas as pd +import os, sys, time, glob +import json +import warnings +from tqdm import tqdm +import torch +from torch.utils.data import Dataset, DataLoader, TensorDataset, random_split +from os.path import exists +import argparse + +from data_processing import load_embedding_dataset + +def main(): + parser = argparse.ArgumentParser( + description='Load NMMA light curves into training tensors' + ) + parser.add_argument( + '--dir_path_var', type=str, required=True, + help='Directory containing varied light curves.', + ) + parser.add_argument( + '--inj_file_var', type=str, required=True, + help='Name of varied injection file in dir-path-var.', + ) + parser.add_argument( + '--label_var', type=str, required=True, + help='Label of the varied light curves.', + ) + parser.add_argument( + '--dir_path_fix', type=str, required=True, + help='Directory containing fixed light curves. Use None to skip.', + ) + parser.add_argument( + '--inj_file_fix', type=str, required=True, + help='Name of fixed injection file in dir-path-fix. Use None to skip.', + ) + parser.add_argument( + '--label_fix', type=str, required=True, + help='Label of the fixed light curves. Use None to skip.', + ) + parser.add_argument( + '--detection_limits', + type=lambda s: [f.strip() for f in s.split(',')], + required=True, + help='Photometric detection limit.', + ) + parser.add_argument( + '--filters', + type=lambda s: [f.strip() for f in s.split(',')], + required=True, + help='Comma-separated list of photometric bands, e.g. ztfg,ztfr,ztfi', + ) + parser.add_argument( + '--dt', type=float, + help='Time step in day (default: 0.1)', + default=0.1, + ) + parser.add_argument( + '--data_fillers', type=float, + help='Values to use for data padding per band.', + default=22.0, + ) + parser.add_argument( + '--params', + type=lambda s: [f.strip() for f in s.split(',')], + help='Comma-separated list of inference parameters.', + ) + parser.add_argument( + '--num_repeats', type=int, + help='Number of repeated injections.', + default=1, + ) + parser.add_argument( + '--save_lc_data_fix', type=str, + help='Optionally save fixed lc data in tensor form.', + ) + parser.add_argument( + '--save_lc_params_fix', type=str, + help='Optionally save fixed parameters in tensor form.', + ) + parser.add_argument( + '--save_lc_data_var', type=str, + help='Optionally save varied lc data in tensor form.', + ) + parser.add_argument( + '--save_lc_params_var', type=str, + help='Optionally save varied lc parameters in tensor form.', + ) + + args = parser.parse_args() + + if args.dir_path_fix and args.inj_file_fix and args.label_fix: + lc_data_var, lc_params_var, lc_data_fix, lc_params_fix = load_embedding_dataset( + dir_path_var=args.dir_path_var, + inj_file_var=args.inj_file_var, + label_var=args.label_var, + dir_path_fix=args.dir_path_fix, + inj_file_fix=args.inj_file_fix, + label_fix=args.label_fix, + detection_limits=args.detection_limits, + bands=args.filters, + step=args.dt, + data_fillers=args.data_fillers, + params=args.params, + num_repeats=args.num_repeats, + ) + else: + lc_data_fix, lc_params_fix = None, None + lc_data_var, lc_params_var, _, _ = load_embedding_dataset( + dir_path_var=args.dir_path_var, + inj_file_var=args.inj_file_var, + label_var=args.label_var, + dir_path_fix=None, + inj_file_fix=None, + label_fix=None, + detection_limits=args.detection_limits, + bands=args.filters, + step=args.dt, + data_fillers=args.data_fillers, + params=args.params, + num_repeats=args.num_repeats, + ) + + if args.save_lc_data_fix: + torch.save(lc_data_fix, args.save_lc_data_fix) + if args.save_lc_params_fix: + torch.save(lc_params_fix, args.save_lc_params_fix) + if args.save_lc_data_var: + torch.save(lc_data_var, args.save_lc_data_var) + if args.save_lc_params_var: + torch.save(lc_params_var, args.save_lc_params_var) + +if __name__ == "__main__": + main()