LSSTDESC · dkirkby · Sep 1, 2020 · Sep 1, 2020 · Sep 1, 2020 · Sep 15, 2020
diff --git a/example/zotbin_dc2.yaml b/example/zotbin_dc2.yaml
@@ -0,0 +1,47 @@
+metrics:  [SNR_3x2, FOM_3x2, FOM_DETF_3x2]
+bands: riz
+training_file: data/training.hdf5
+validation_file: data/validation.hdf5
+output_file: example/zotbin_dc2_output.txt
+# Backend implementing the metrics, either: "firecrown" (default), "jax-cosmo"
+metrics_impl: jax-cosmo
+
+run:
+  # This is a class name which will be looked up
+  ZotBin:
+    run_4:
+      # This setting is sent to the classifier
+      bins: 4
+      # These special settings decide whether the
+      # color and error colums are passed to the classifier
+      # as well as the magnitudes
+      colors: False
+      errors: False
+      # ZotBin initialization data used for fast metric calculations.
+      # Available from https://portal.nersc.gov/cfs/lsst/dkirkby/zotbin/
+      # and /global/cfs/cdirs/lsst/www/dkirkby/zotbin at nersc.
+      init: binned_3_80.npz
+      # ZotBin grouping parameters...
+      # The number of percentile bins to use along each feature axis.
+      # Use ncpt=10 for a quick test or npct=20 for a better result.
+      npct: 10
+      # The number of groups to build from the initial lattice of ncpt ** nband cells.
+      # Groups are built iteratively by maximizing the joint similarity of cells
+      # in feature and redshift distribution space.
+      ngrp: 150
+      # The similarity measure to use for redshift distributions.
+      # Must be one of: cosine, weighted, EMD (Earth mover's distance = Wasserstein W1 metric)
+      similarity: cosine
+      # ZotBin optimization parameters...
+      # Which metric to optimize, must be one of FOM_DETF_3x2, FOM_3x2, SNR_3x2.
+      metric: FOM_DETF_3x2
+      # Number of trials with different random initializations.
+      ntrial: 1
+      # Number of optimization steps per trial.
+      nsteps: 500
+      # Print progress every interval steps of each trial.
+      interval: 250
+      # Initial learning rate for the Adam optimizer.
+      eta: 0.02
+      # Seed for random initializations.
+      seed: 123
diff --git a/example/zotnet_buzzard.yaml b/example/zotnet_buzzard.yaml
@@ -0,0 +1,46 @@
+metrics:  [SNR_3x2, FOM_3x2, FOM_DETF_3x2]
+bands: riz
+training_file: data_buzzard/training.hdf5
+validation_file: data_buzzard/validation.hdf5
+output_file: example/zotnet_buzzard_output.txt
+# Backend implementing the metrics, either: "firecrown" (default), "jax-cosmo"
+metrics_impl: jax-cosmo
+
+run:
+  # This is a class name which will be looked up
+  ZotNet:
+    run_4:
+      # This setting is sent to the classifier
+      bins: 4
+      # These special settings decide whether the
+      # color and error colums are passed to the classifier
+      # as well as the magnitudes
+      colors: False
+      errors: False
+      # ZotBin initialization data used for fast metric calculations.
+      # Available from https://portal.nersc.gov/cfs/lsst/dkirkby/zotbin/
+      # and /global/cfs/cdirs/lsst/www/dkirkby/zotbin at nersc.
+      init: binned_3_80.npz
+      # ZotNet training parameters...
+      # Number of nodes per hidden layer.
+      nhidden: 64
+      # Number of hidden layers.
+      nlayer: 2
+      # Maximum number of samples to use.
+      ndata: 500000
+      # Fraction of samples to use for training vs validation.
+      trainfrac: 0.9
+      # Batch size for SGD.
+      batchsize: 10000
+      # Which metric to optimize, must be one of FOM_DETF_3x2, FOM_3x2, SNR_3x2.
+      metric: FOM_DETF_3x2
+      # Number of trials with different random initializations.
+      ntrial: 1
+      # Number of training epochs per trial.
+      nepoch: 20
+      # Print progress every interval steps of each trial.
+      interval: 5
+      # Initial learning rate for the Adam optimizer.
+      eta: 0.02
+      # Seed for random initializations.
+      seed: 123
diff --git a/tomo_challenge/.gitignore b/tomo_challenge/.gitignore
@@ -0,0 +1 @@
+__pycache__
diff --git a/tomo_challenge/classifiers/zotbin.py b/tomo_challenge/classifiers/zotbin.py
@@ -0,0 +1,172 @@
+from pathlib import Path
+
+from .base import Tomographer
+import numpy as np
+
+try:
+    import jax.experimental.optimizers
+except ImportError:
+    print('The ZotBin classifier needs the jax and jax-cosmo packages.')
+
+try:
+    #from zotbin.group import groupbins, load_groups, save_groups, fdigitize
+    #from zotbin.binned import get_zedges_chi
+    from zotbin.util import prepare, get_signature, get_file
+    from zotbin.flow import learn_flow
+    from zotbin.binned import load_binned
+    from zotbin.group import groupbins, load_groups, fdigitize, assign_bins
+    from zotbin.optimize import optimize
+except ImportError:
+    print('The ZotBin classifierr needs the zotbin package:\n  pip install git+https://github.com/dkirkby/zotbin.git')
+
+
+class ZotBin(Tomographer):
+    """ ZotBin method.
+    """
+
+    # valid parameter -- see below
+    valid_options = [
+        'bins', 'init', 'npct', 'ngrp', 'similarity',
+        'metric', 'ntrial', 'nsteps', 'interval', 'eta', 'seed']
+    # this settings means arrays will be sent to train and apply instead
+    # of dictionaries
+    wants_arrays = True
+
+    def __init__ (self, bands, options):
+        """Constructor
+
+        Parameters:
+        -----------
+        bands: str
+          string containg valid bands, like 'riz' or 'griz'
+        options: dict
+          options come through here. Valid keys are listed as valid_options
+          class variable.
+
+        Note:
+        -----
+        Valiad options are:
+            'bins' - number of tomographic bins
+
+        """
+        self.bands = bands
+        self.opt = options
+        self.preprocessor = None
+        self.fedges = None
+        similarity = options['similarity']
+        if similarity not in ('cosine', 'weighted', 'EMD'):
+            raise ValueError(f'Invalid similarity: "{similarity}".')
+        metric = options['metric']
+        if metric not in ('SNR_3x2', 'FOM_3x2', 'FOM_DETF_3x2'):
+            raise ValueError(f'Invalid optimization metric: "{metric}".')
+        self.init_data = load_binned(get_file(options['init']))
+
+    def train (self, data, z):
+        """Trains the classifier
+
+        Parameters:
+        -----------
+        training_data: numpy array, size Ngalaxes x Nbands
+          training data, each row is a galaxy, each column is a band as per
+          band defined above
+        training_z: numpy array, size Ngalaxies
+          true redshift for the training sample
+
+        """
+        print(f'train: input data shape is {data.shape}.')
+        # Prepare input features.
+        features, detected = prepare(data, self.bands)
+        features = features[detected]
+        z = z[detected]
+        # Use cached preprocessed data if available.
+        signature = get_signature(features)
+        pname = Path('preprocessed_{0}.npy'.format(signature))
+        if pname.exists():
+            print('Using cached preprocessed data.')
+            U = np.load(pname)
+        else:
+            # Learn a preprocessing transform to an approximately uniform distribution of features.
+            print('Learning preprocessor normalizing flow...')
+            self.preprocessor = learn_flow(features[:400000])
+            # Proprocess the input features.
+            U = self.preprocessor(features)
+            # Cache the preprocessed data for next time.
+            np.save(pname, U)
+            print('Cached preprocessed data.')
+        # Load or calculate groups in feature space.
+        method = self.opt['similarity']
+        npct = self.opt['npct']
+        ngrp = self.opt['ngrp']
+        fname = f'groups_{method}_{npct}_{ngrp}_{signature}.npz'
+        if not Path(fname).exists():
+            print(f'Calculating {ngrp} feature space groups with npct={npct}...')
+            groupbins(U, z, self.init_data[0], npct, ngrp_save=[ngrp], method=method,
+                      plot_interval=None, savename=fname)
+        _, self.fedges, self.grpid, self.zhist, _ = load_groups(fname)
+        print(f'Loaded {ngrp} groups with npct={npct}.')
+        # Optimize the weights for combining groups into the requested number of nbins
+        # for the specified metric.
+        args = {k: self.opt[k] for k in ('metric', 'ntrial', 'interval', 'seed')}
+        args['nbin'] = self.opt['bins']
+        args['opt_args'] = dict(
+            optimizer=jax.experimental.optimizers.adam(self.opt['eta']),
+            nsteps=self.opt['nsteps'])
+        print(f'Optimizing final bins with {args}...')
+        best_scores, self.weights, self.dndz_bin, _ = optimize(
+            mixing_matrix=self.zhist, init_data=self.init_data, **args)
+        print(f'Best scores after optimization: {best_scores}')
+
+    def apply (self, data):
+        """Applies training to the data.
+
+        Note that a bin number of -1 indicates that a galaxy should not be used.
+
+        Parameters:
+        -----------
+        Data: numpy array, size Ngalaxes x Nbands
+          testing data, each row is a galaxy, each column is a band as per
+          band defined above
+
+        Returns:
+        tomographic_selections: numpy array, int, size Ngalaxies
+          tomographic selection for galaxies return as bin number for
+          each galaxy.
+        """
+        print(f'apply: input data shape is {data.shape}.')
+        features, detected = prepare(data, self.bands)
+        tomo_sel = np.full(len(features), -1, int)
+        # Use cached preprocessed data if available.
+        features = features[detected]
+        signature = get_signature(features)
+        pname = Path('preprocessed_{0}.npy'.format(signature))
+        if pname.exists():
+            print('Using cached preprocessed data.')
+            U = np.load(pname)
+        elif self.preprocessor is None:
+            raise RuntimeError('No preprocessor defined: has the train step been run?')
+        else:
+            # Apply the learned transform.
+            print('Preprocessing...')
+            U = self.preprocessor(features)
+            # Cache the preprocessed data for next time.
+            np.save(pname, U)
+            print('Cached preprocessed data.')
+        # Assign galaxies to feature groups.
+        if self.fedges is None:
+            raise RuntimeError('No groups defined: has the train step been run?')
+        feature_bin = fdigitize(U, self.fedges)
+        feature_grp = self.grpid[feature_bin]
+        nempty = np.count_nonzero(feature_grp == -1)
+        print(f'Found {nempty} galaxies outside the training feature space.')
+        # Assign feature groups to output bins.
+        tomo_sel[detected] = assign_bins(feature_grp, self.weights, self.opt['seed'])
+        # Save results before returning.
+        nbin, ngrp = self.weights.shape
+        metric = self.opt['metric']
+        method = self.opt['similarity']
+        npct = self.opt['npct']
+        fname = f'zotbin_{metric}_{nbin}_{method}_{npct}_{ngrp}_{signature}.npz'
+        np.savez(fname, idx=tomo_sel.astype(np.uint8), weights=self.weights, dndz=self.dndz_bin)
+        print(f'Saved {fname}')
+
+        return tomo_sel