diff --git a/.gitignore b/.gitignore index 5c09f44..44207ce 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,11 @@ data/*.zip data/submission.csv.gz !data/2017-08-15_2017-09-11.csv.zip +data/* +*/.DS_STORE +.DS_STORE +images/ +ex_figs/ + +*.png +output/ diff --git a/Adam_HD_optimizer.py b/Adam_HD_optimizer.py new file mode 100755 index 0000000..341f44b --- /dev/null +++ b/Adam_HD_optimizer.py @@ -0,0 +1,92 @@ +#Copy paste from https://github.com/zadaianchuk/HyperGradientDescent/blob/master/Adam_HD_optimizer.py +#Hypergradient Descent Optimizer + + + + +from __future__ import division + +import tensorflow as tf + +class AdamHDOptimizer(tf.train.GradientDescentOptimizer): + + def __init__(self, alpha_0, beta =10**(-7), name="HGD", mu=0.99, eps = 10**(-8),type_of_learning_rate ="global"): + super(AdamHDOptimizer, self).__init__(beta, name=name) + + self._mu = mu + self._alpha_0 = alpha_0 + self._beta = beta + self._eps = eps + self._type = type_of_learning_rate + + + def minimize(self, loss, global_step): + + # Algo params as constant tensors + mu = tf.convert_to_tensor(self._mu, dtype=tf.float32) + alpha_0=tf.convert_to_tensor(self._alpha_0, dtype=tf.float32) + beta=tf.convert_to_tensor(self._beta, dtype=tf.float32) + eps = tf.convert_to_tensor(self._eps, dtype=tf.float32) + + var_list = tf.trainable_variables() + + # create and retrieve slot variables for: + # direction of previous step + ds = [self._get_or_make_slot(var, + tf.constant(0.0, tf.float32, var.get_shape()), "direction", "direction") + for var in var_list] + # current learning_rate alpha + if self._type == "global": + alpha = self._get_or_make_slot(alpha_0, alpha_0, "learning_rate", "learning_rate") + else: + alphas = [self._get_or_make_slot(var, + tf.constant(self._alpha_0, tf.float32, var.get_shape()), "learning_rates", "learning_rates") + for var in var_list] + # moving average estimation + ms = [self._get_or_make_slot(var, + tf.constant(0.0, tf.float32, var.get_shape()), "m", "m") + for var in var_list] + vs = [self._get_or_make_slot(var, + tf.constant(0.0, tf.float32, var.get_shape()), "v", "v") + for var in var_list] + # power of mu for bias-corrected first and second moment estimate + mu_power = tf.get_variable("mu_power", shape=(), dtype=tf.float32, trainable=False, initializer=tf.constant_initializer(1.0)) + + # update moving averages of first and second moment: + grads = tf.gradients(loss, var_list) + grads_squared = [tf.square(g) for g in grads] + m_updates = [m.assign(mu*m + (1.0-mu)*g) for m, g in zip(ms, grads)] #new means + v_updates = [v.assign(mu*v + (1.0-mu)*g2) for v, g2 in zip(vs, grads_squared)] + mu_power_update = [tf.assign(mu_power,tf.multiply(mu_power,mu))] + # bais correction of the estimates + with tf.control_dependencies(v_updates+m_updates+mu_power_update): + ms_hat = [tf.divide(m,tf.constant(1.0) - mu_power) for m in ms] + vs_hat = [tf.divide(v,tf.constant(1.0) - mu_power) for v in vs] + + #update of learning rate alpha, main difference between ADAM and ADAM-HD + if self._type == "global": + hypergrad = sum([tf.reduce_sum(tf.multiply(d,g)) for d,g in zip(ds, grads)]) + alphas_update = [alpha.assign(alpha-beta*hypergrad)] + else: + hypergrads = [tf.multiply(d,g) for d,g in zip(ds, grads)] + alphas_update = [alpha.assign(alpha-beta*hypergrad) for alpha,hypergrad in zip(alphas,hypergrads)] + + # update step directions + with tf.control_dependencies(alphas_update): #we want to be sure that alphas calculated using previous step directions + ds_updates=[d.assign(-tf.divide(m, tf.sqrt(v) + self._eps)) 
+                                for (m,v,d) in zip(ms_hat,vs_hat,ds)]
+
+        # update parameters of the model
+        with tf.control_dependencies(ds_updates):
+            if self._type == "global":
+                dirs = [alpha*d for d in ds]
+                alpha_norm = alpha
+            else:
+                dirs = [alpha*d for d, alpha in zip(ds,alphas)]
+                alpha_norm = sum([tf.reduce_mean(alpha**2) for alpha in alphas])
+            variable_updates = [v.assign_add(d) for v, d in zip(var_list, dirs)]
+            global_step.assign_add(1)
+        # add summaries (track alpha changes)
+        with tf.name_scope("summaries"):
+            with tf.name_scope("per_iteration"):
+                alpha_norm_sum = tf.summary.scalar("alpha", alpha_norm, collections=[tf.GraphKeys.SUMMARIES, "per_iteration"])
+        return tf.group(*variable_updates)
\ No newline at end of file
diff --git a/MAKEFEATURES_TRAIN_ALL.sh b/MAKEFEATURES_TRAIN_ALL.sh
new file mode 100644
index 0000000..3b2b37d
--- /dev/null
+++ b/MAKEFEATURES_TRAIN_ALL.sh
@@ -0,0 +1,61 @@
+#When doing the chunked backtest approach, the model needs to be retrained after
+#each new chunk of training data comes in.
+
+#For this setup, just retrain from scratch (not starting from the last checkpoint of
+#the previous training chunk; completely starting over again)
+
+
+# ==============================================================================
+# PARAMETERS
+# ==============================================================================
+#For each of the N training sets: train a model
+#true/false: whether to remake the feature sets vs. skip directly to training
+MAKE_FEATURESETS=false
+#Make some cached features for all the training/test sets
+makefeats_names="TRAINset1 TRAINset2 TRAINset3 TRAINset4 TESTset1 TESTset2 TESTset3 TESTset4"
+train_names="TRAINset1 TRAINset2 TRAINset3 TRAINset4"
+#In training, the max number of epochs to run. By 25-50 epochs things have usually plateaued.
+MAX_EPOCH=50
+
+
+
+
+if $MAKE_FEATURESETS; then
+
+    echo 'Cleaning up, then remaking feature sets'
+    #Clean up between feature sets
+    cd data
+    rm -R TRAIN*
+    rm -R TEST*
+    rm -R cpt/
+    rm -R cpt_tmp/
+    rm -R logs/
+    rm *.pkl
+    cd ..
+    ls -l data/
+
+
+    # =============================================================================
+    # make_features.py
+    # =============================================================================
+    for v in $makefeats_names; do
+        #Create the features for our data
+        echo 'running make_features.py'
+        echo $v
+        python3 make_features.py data/$v ours daily full --add_days=0
+    done
+fi
+
+
+# =============================================================================
+# trainer.py
+# =============================================================================
+for v in $train_names; do
+    echo 'running trainer.py'
+    echo $v
+    #By default trainer.py already does the forward split; a side split could be added with --side_split (left disabled, see below)
+    python3 trainer.py full daily --name=$v --hparam_set='encdec' --n_models=3 --asgd_decay=0.99 --max_steps=11500 --save_from_step=10 --max_epoch=$MAX_EPOCH --patience=5 --verbose --save_epochs_performance
+    # --side_split  #using the side_split option gives unrealistic values for SMAPE:
+    #it reports training, side-split, and forward-step SMAPEs of only 3-8%, which is clearly unrealistic.
+    #Not sure whether the original Kaggle code computes these differently for the side_eval option; leave it off for now and only do forward eval.
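+    #For reference (illustrative only, assuming the defaults set above are unchanged), on the first
+    #chunk the command above expands to:
+    #  python3 trainer.py full daily --name=TRAINset1 --hparam_set='encdec' --n_models=3 \
+    #     --asgd_decay=0.99 --max_steps=11500 --save_from_step=10 --max_epoch=50 --patience=5 \
+    #     --verbose --save_epochs_performance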
+done \ No newline at end of file diff --git a/PERFORMANCE_HEATMAPS.py b/PERFORMANCE_HEATMAPS.py new file mode 100755 index 0000000..164818d --- /dev/null +++ b/PERFORMANCE_HEATMAPS.py @@ -0,0 +1,227 @@ +#Analyze the different performance metrics +#Make the performance heatmaps + +#There will be 4 different TRAIN-TEST sets, +#each has a model trained on that train set and tested on that test set. +#So asssume to simulate production environment where we would retrain model +#every so often, we have e.g. 4 tests of the model, each with say 3 months more +#data appended to it. So, assume we will just do 4 separate analyses. + +import matplotlib +matplotlib.use('Agg') +import matplotlib.pyplot as plt + +import os +import numpy as np +import pickle +from collections import defaultdict + + +# ============================================================================= +# PARAMETERS +# ============================================================================= +OUTDIR = 'output' +NAMES = ['TESTset1', 'TESTset2', 'TESTset3', 'TESTset4'] + + + +# ============================================================================= +# MAIN +# ============================================================================= + +def load_dict(path): + with open(path,'rb') as gg: + d = pickle.load(gg) +# print(d) + return d + + +def aggregate__overall(data_dict, real_only, id_subsets, bad_ids): + """ + For each (history,horizon) pair, marginalized over all id's and dates + + format is (history,horizon,backoffset,id) : {'SMAPE':smape, 'bias':bi, #'MAE':mae, 'predict_start_date':dates[0], 'predict_end_date':dates[-1]} + """ +# print(data_dict.items()) + agg_dict = defaultdict(lambda:[]) + for k,v in data_dict.items(): + series_id = k[3] + #Only use the real series, ignore the synthetic ones + #(synthetic series have name like {id}__... ) + if real_only: + if '__' in series_id: + continue + #If have a set of holdout id's: + if id_subsets: + if series_id not in id_subsets: + continue + #Regardless of mode, if this is one of the corrupted time series, ignore it: + if series_id in bad_ids: + continue + + history = k[0] + horizon = k[1] + smape = v['SMAPE'] + agg_dict[(history,horizon)] += [smape] + + + + #Now get mean SMAPE + metrics_dict = {} + for k,v in agg_dict.items(): + mean = np.nanmean(v) + median = np.nanmedian(v) + sd = np.nanstd(v) + pctl_5 = np.percentile([i for i in v if np.isfinite(i)],5)#nanpercentile + pctl_95 = np.percentile([i for i in v if np.isfinite(i)],95) + metrics_dict[k] = {'mean':mean, 'median':median, 'sd':sd, '5pctl':pctl_5, '95pctl':pctl_95} + + histories = list(np.unique([i[0] for i in metrics_dict.keys()])) + horizons = list(np.unique([i[1] for i in metrics_dict.keys()])) +# print(metrics_dict) +# print(histories) +# print(horizons) + + metrics_arrays = {} + for metric in ['mean','median', 'sd','5pctl','95pctl']: + _array = np.nan * np.ones((len(histories),len(horizons))) + for k,v in metrics_dict.items(): + i = histories.index(k[0]) + j = horizons.index(k[1]) + _array[i,j] = v[metric] + metrics_arrays[metric] = _array + print(metrics_arrays) + return metrics_dict, histories, horizons, metrics_arrays + + + + +def make_heatmap(metrics_arrays, histories, horizons, outdir, name): + """ + Visualize the SMAPE values + """ + #For scale, get highest value for heatmap. + #Use 200 (worst possible SMAPE), vs. 
+ #to improve dynamic range use the highest measured SMAPE value from the heatmaps + +# print('metrics_arrays') +# print(metrics_arrays) + for k,v in metrics_arrays.items(): + + savename = k+'_'+name + + vmax = np.nanmin([200.,np.nanmax(np.ceil(v))]) + + plt.figure() + plt.imshow(v,vmin=0.,vmax=vmax) + plt.title(savename,fontsize=15) + plt.colorbar() + plt.xlabel('Horizon',fontsize=15) + plt.ylabel('History',fontsize=15) + plt.xticks(np.arange(len(horizons)),horizons,fontsize=15) + plt.yticks(np.arange(len(histories)),histories,fontsize=15) + + for x, hor in enumerate(np.arange(len(horizons))): + for y, hist in enumerate(np.arange(len(histories))): + s = np.round(v[y,x],1) + plt.text(x-.25, y, s) + # plt.grid() + savepath = os.path.join(outdir,f'{savename}.png') + plt.savefig(savepath) + + + + +if __name__=='__main__': +# parser = argparse.ArgumentParser() +# parser.add_argument('--logdir', default='data/logs', help="Directory where numpy arrays of performance are") +# parser.add_argument('--K_last', default=3, dest='K_last', help='Save out per EPOCH metrics (NOT per step, only per EPOCH') +# args = parser.parse_args() +# param_dict = dict(vars(args)) +# +# make_heatmaps(**param_dict) + + + #for each of the 4 dicts: + + + #HOLLYWOOD + #Make list of id's that were held out from training, to assess transfer ability + HOLD_OUTS = [str(i) for i in range(500)] #Not actually held out, but just get an idea of performance on earlier ids + special_ids = [str(i) for i in [531, 1007, 143, 130, 197, 203, 209, 215, 342, 476, 328, 182, 200, 145, 242, 44, 94, 147, 1, 5, 6, 7, 8, 12, 387, 429, 1005, 943]] + id_dict = {'allIDs':[], + 'special_ids':special_ids, + 'holdout_ids':HOLD_OUTS} + + #Some of the ID's are just bad, have multiple month long gaps from corrupted data, etc., so can ignore them + #For now just use everything to get conservative estimate of performance + BAD_IDs = []#['44','46','581','582','583','584'] + + + + + # ============================================================================= + # Aggregated over all 4 test sets + # ============================================================================= + all_data = {} + for chunkname in NAMES: + print('chunkname: ',chunkname) + path = os.path.join(OUTDIR,f'hist_horiz__{chunkname}.pickle') + data = load_dict(path) + new_data = {k+(chunkname,): v for k,v in data.items()} + all_data.update(new_data) + + for real_only in [True,False]: + for k, id_subsets in id_dict.items(): + + r = 'real' if real_only else 'realAndsynthetic' + name = '4Ave' + '_' + r + '_' + k + print(name) + + + metrics_dict, histories, horizons, metrics_arrays = aggregate__overall(all_data, real_only, id_subsets, BAD_IDs) + make_heatmap(metrics_arrays, histories, horizons, OUTDIR, name) + + #Save out the metrics dict + dict_savename = os.path.join(OUTDIR,f"hist_horiz__{name}__allchunks__metrics.pickle") + with open(dict_savename, "wb") as outp: + pickle.dump(metrics_dict, outp) + + + + + + # ============================================================================= + # Individual test sets + # ============================================================================= + #For the 4 chunk backtesting performance assessment + for chunkname in NAMES: + print('chunkname: ',chunkname) + path = os.path.join(OUTDIR,f'hist_horiz__{chunkname}.pickle') + data = load_dict(path) + + for real_only in [True,False]: + for k, id_subsets in id_dict.items(): + + r = 'real' if real_only else 'realAndsynthetic' + name = chunkname + '_' + r + '_' + k + print(name) + + + metrics_dict, 
histories, horizons, metrics_arrays = aggregate__overall(data, real_only, id_subsets, BAD_IDs) + make_heatmap(metrics_arrays, histories, horizons, OUTDIR, name) + + #Save out the metrics dict + dict_savename = os.path.join(OUTDIR,f"hist_horiz__{name}__metrics.pickle") + with open(dict_savename, "wb") as outp: + pickle.dump(metrics_dict, outp) + + + + + + + + + + \ No newline at end of file diff --git a/PREDICT.py b/PREDICT.py new file mode 100755 index 0000000..d51d207 --- /dev/null +++ b/PREDICT.py @@ -0,0 +1,230 @@ +""" + + + + + +NOT USED ANYMORE + + + +Created on Mon Jun 18 14:03:35 2018 + +@author: gk +""" + + + +#After training, do the predictions [but here as a script instead of .ipynb] + + +import tensorflow as tf + +import matplotlib +matplotlib.use('Agg') +import matplotlib.pyplot as plt + +import os +import pandas as pd +import numpy as np +from trainer import predict +from hparams import build_hparams +import hparams + + + +# ============================================================================= +# PARAMETRS +# ============================================================================= +FEATURES_SET = 'full'# 'arturius' 'simple' 'full' +SAMPLING_PERIOD = 'daily' +DATA_TYPE = 'ours' #'kaggle' #'ours' +Nmodels = 3 +PARAM_SETTING = 's32' #Which of the parameter settings to use [s32 is the default Kaggle one, with a few thigns modified as I want] +PARAM_SETTING_FULL_NAME = hparams.params_s32 #Which of the parameter settings to use corresponding to the PARAM_SETTING. The mapping is defined in hparams.py at the end in "sets = {'s32':params_s32,..." +OUTPUT_DIR = 'output' + + + + + + +# ============================================================================= +# Performance Metrics +# ============================================================================= +def smape(true, pred): + summ = np.abs(true) + np.abs(pred) + smape = np.where(summ == 0, 0, np.abs(true - pred) / summ) + #return np.mean(kaggle_smape) * 200 + return smape * 200 + +def mean_smape(true, pred): + raw_smape = smape(true, pred) + masked_smape = np.ma.array(raw_smape, mask=np.isnan(raw_smape)) + return masked_smape.mean() + + + +def bias(true, pred): + """ + Check if the forecasts are biased up or down + """ + return np.sum(true - pred) / np.sum(true + pred) + + + +# ============================================================================= +# +# ============================================================================= +#read_all funcion loads the (hardcoded) file "data/all.pkl", or otherwise train2.csv +print('loading data...') +from make_features import read_all +df_all = read_all(DATA_TYPE,SAMPLING_PERIOD) +print('df_all.columns') +print(df_all.columns) + + +# ============================================================================= +# +# ============================================================================= +prev = df_all#.loc[:,:'2017-07-08'] +paths = [p for p in tf.train.get_checkpoint_state(f'data/cpt/{PARAM_SETTING}').all_model_checkpoint_paths] + +#tf.reset_default_graph() +#preds = predict(paths, default_hparams(), back_offset=0, +# n_models=3, target_model=0, seed=2, batch_size=2048, asgd=True) +t_preds = [] +for tm in range(Nmodels): + tf.reset_default_graph() + t_preds.append(predict(FEATURES_SET, SAMPLING_PERIOD, paths, build_hparams(PARAM_SETTING_FULL_NAME), history_window_size, horizon_window_size, back_offset=0, + n_models=Nmodels, target_model=tm, seed=2, batch_size=2048, asgd=True)) +#def predict(features_set, sampling_period, checkpoints, hparams, 
history_window_size, horizon_window_size, return_x=False, verbose=False, back_offset=0, n_models=1, +# target_model=0, asgd=False, seed=1, batch_size=1024): #For predict: allow horizon_window_size to be fixed + + +# ============================================================================= +# average the N models predictions +# ============================================================================= +preds = sum(t_preds)/float(Nmodels) + + +# ============================================================================= +# look at missing +# ============================================================================= +missing_pages = prev.index.difference(preds.index) +print('missing_pages',missing_pages) +# Use zeros for missing pages +rmdf = pd.DataFrame(index=missing_pages, + data=np.tile(0, (len(preds.columns),len(missing_pages))).T, columns=preds.columns) +if DATA_TYPE=='kaggle': + f_preds = preds.append(rmdf).sort_index() +elif DATA_TYPE=='ours': + f_preds = preds +# Use zero for negative predictions +f_preds[f_preds < 0.5] = 0 +# Rouns predictions to nearest int +f_preds = np.round(f_preds).astype(np.int64) + + + + +print(f_preds) + +# ============================================================================= +# save out all predictions all days (for our stuff will be relevant, for his Kaggle maybe just needed one day) +# ============================================================================= +#firstK = 1000 #for size issues, for now while dev, just a few to look at +#ggg = f_preds.iloc[:firstK] +#ggg.to_csv('data/all_days_submission.csv.gz', compression='gzip', index=False, header=True) +f_preds.to_csv(f'{OUTPUT_DIR}/all_predictions_ours.csv.gz', compression='gzip', index=False, header=True) + + + + +# ============================================================================= +# visualize to do wuick check +# ============================================================================= +""" +pages = ['(236984)_Astier_fr.wikipedia.org_all-access_all-agents', \ + '龍抬頭_zh.wikipedia.org_mobile-web_all-agents',\ + "'Tis_the_Season_(Vince_Gill_and_Olivia_Newton-John_album)_en.wikipedia.org_mobile-web_all-agents",\ + 'Peter_Townsend_(RAF_officer)_en.wikipedia.org_mobile-web_all-agents',\ + "Heahmund_en.wikipedia.org_desktop_all-agents"] +""" + +randomK = 1000 +print('Saving figs of {} time series as checks'.format(randomK)) +pagenames = list(f_preds.index) +pages = np.random.choice(pagenames, size=min(randomK,len(pagenames)), replace=False) +N = pages.size +for jj, page in enumerate(pages): + print(f"{jj} of {N}") + plt.figure() + if DATA_TYPE=='kaggle': + prev.loc[page].fillna(0).plot()#logy=True) + f_preds.loc[page].fillna(0).plot(logy=True) + elif DATA_TYPE=='ours': + prev.loc[int(page)].plot() + f_preds.loc[page].plot() + plt.title(page) + if not os.path.exists(OUTPUT_DIR): + os.mkdir(OUTPUT_DIR) + pathname = os.path.join(OUTPUT_DIR, 'fig_{}.png'.format(jj)) + plt.savefig(pathname) + plt.close() + + + +#Cannot view on the AWS so move to local: +#zip -r output.zip output +#cp output.zip /home/...../sync + + + + + + + + + + +#For the Kaggle data, can also output compeition submission format: +if DATA_TYPE=='kaggle': + # ============================================================================= + # load, maniupalte test data + # ============================================================================= + def read_keys(): + import os.path + key_file = 'data/keys2.pkl' + if os.path.exists(key_file): + return pd.read_pickle(key_file) + else: + print('Reading 
keys...') + raw_keys = pd.read_csv('data/key_2.csv.zip') + print('Processing keys...') + pagedate = raw_keys.Page.str.rsplit('_', expand=True, n=1).rename(columns={0:'page',1:'date_str'}) + keys = raw_keys.drop('Page', axis=1).assign(page=pagedate.page, date=pd.to_datetime(pagedate.date_str)) + del raw_keys, pagedate + print('Pivoting keys...') + pkeys = keys.pivot(index='page', columns='date', values='Id') + print('Storing keys...') + pkeys.to_pickle(key_file) + return pkeys + keys = read_keys() + + # ============================================================================= + # + # ============================================================================= + subm_preds = f_preds.loc[:, '2017-09-13':] + assert np.all(subm_preds.index == keys.index) + assert np.all(subm_preds.columns == keys.columns) + answers = pd.DataFrame({'Id':keys.values.flatten(), 'Visits':np.round(subm_preds).astype(np.int64).values.flatten()}) + answers.to_csv(f'{OUTPUT_DIR}/submission.csv.gz', compression='gzip', index=False, header=True) + + + + print('f_preds') + print(f_preds) + + print('missing') + print(prev.loc[missing_pages, '2016-12-15':]) \ No newline at end of file diff --git a/PREPROCESS.py b/PREPROCESS.py new file mode 100755 index 0000000..01fdb55 --- /dev/null +++ b/PREPROCESS.py @@ -0,0 +1,805 @@ +# -*- coding: utf-8 -*- +""" +Created on Thu Jun 21 12:55:54 2018 + +@author: gk +""" + +#Do some basic preprocessing to get my data in same format as Kaggle code + + +#import matplotlib +#matplotlib.use('Agg') +import matplotlib.pyplot as plt + +import os +import pandas as pd +import numpy as np + +#from statsmodels.tsa.seasonal import seasonal_decompose +#stl = seasonal_decompose(x) + +#from sklearn.preprocessing import Imputer +from collections import Counter + +from copy import deepcopy +from scipy.signal import medfilt + + +def load_my_data(myDataDir): + """ + Load my data + """ + files = os.listdir(myDataDir) + files = [i for i in files if i.endswith('.csv')] + files = sorted(files, key=lambda x: int(x.split(".")[0])) + #Exclude certain cities + #ignore_list = [] #id's of the cities to ignore + #files = [i for i in files if i.split(".")[0] not in ignore_list] + dflist = [] + for ii, ff in enumerate(files): + df = pd.read_csv(os.path.join(myDataDir,ff)) + #For the test set data, csvs slightly diff format, so need these 2 steps + if "id" not in df.columns: + df['id'] = ff.split('.')[0] + df.rename(columns={'dt':'date'},inplace=True) + dflist += [df] + df = pd.concat(dflist,sort=False) + df = df[['id','date','y']] + df['id'] = df['id'].astype(int) + return df + + +def remove_cities(df,remove_id_list): + """ + Remove blacklisted id's [since some downloaded id's no longer relevant, + or suspected to not be useful, or be corrupted] + + Or just ignore these files when loading data and don't need this + """ + return df.loc[~df['id'].isin(remove_id_list)] + + +def get_earliest_latest_dates(df): + """ + Get first and last dates seen across any time series + """ + earliest = min(df['date']) + latest = max(df['date']) + print('earliest date',earliest) + print('latest date',latest) + return earliest, latest + + + + +#def __keep_btwn_dates(df,start_date,test_end_date): +# """ +# Excerpt only the data between [inclusive] start and end date. 
+# Both dates are formatted as 'YYYY-mm-DD' +# """ +# len1 = len(df) +# df = df.loc[(df['date']>=start_date) & (df['date']<=test_end_date)] +# df.reset_index(inplace=True,drop=True) +# len2 = len(df) +# rows_removed = len1 - len2 +# print('rows_removed:',rows_removed,'of',len1) +# return df + + + + + +def __missing_vals_distribution(df,out_of_range_fill_value): + """ + Look at two things: + - What fraction of our time series are desne vs. have >= 1 missing value? + - Of the series that have missing values, what is distribution of gap lengths? + [important to know since will be doing imputation on it] + + df - in format like Kaggle competition: cols are dates, rows are series + start/end missing, nd intermedite gaps have been filled with -1 + """ + + def make_cdf(v,out_of_range_fill_value): + c = Counter(v) + x = list(c.keys()) + x = np.array(x) -1 #-1 to go from diff in days from present data -> gap length + y = list(c.values()) + # print(c) + plt.figure() + #plt.plot(x,y,drawstyle='steps')#,marker='o') + plt.plot(x,y,linestyle='None',marker='o') + plt.title('Distribution of Missing Data Gap Length',fontsize=20) + plt.xlabel('Gap Length [days]',fontsize=20) + plt.ylabel('Count',fontsize=20) + # plt.axis([-1,10,0,550]) + plt.show() + + #get fraction dense vs sparse: + dd = df.values[:,1:] + sparse = (dd==out_of_range_fill_value).sum(axis=1) + Nsparse = float((sparse>0).sum()) + print(Nsparse) + Ntotal = float(dd.shape[0]) + fraction_dense = (Ntotal - Nsparse) / Ntotal + print('Nsparse', Nsparse) + print('Ntotal', Ntotal) + print('fraction_dense', fraction_dense) + + #Look at distribution of INTERMEDIATE gap lengths + #ignore the leading / lagging unfilled since could just be from the series + #not officially starting yet, or it got closed out. + all_gaps = [] + for row in dd: + inds = np.where(row!=out_of_range_fill_value)[0] + x = np.diff(inds) + t = list(x[x>1]) + if len(t)>0: + all_gaps.extend(t) + make_cdf(all_gaps,out_of_range_fill_value) + + + + + + +def remove_seasonal_blocks(df): + """ + For places in the data where there are missing gaps of length > 1 seasonality, + remove + """ + return + + + + + + + +#def do_imputation(df,imputation_method): +# """ +# For places in the data where missing gaps are smalle (<7 days), +# just fill in those few missing days with a basic +# remove +# """ +# +# +# def imputation_small_gaps(df,imputation_method): +# """ +# Do missing data imputation using the given forecasting method +# Only use this for short missing segments; do not use for longer ones. +# """ +# if imputation_method == 'STL': +# #stl = seasonal_decompose(x) +# df_filled = df +# pass +# else: +# raise Exception('That method not implemented yet') +# return df_filled +# +# +# def imputation_big_gaps(df): +# """ +# Do missing data imputation / removal +# For big gaps [gaps bigger than 1 seasonality] +# """ +# df_filled = df +# return df_filled +# +# +# def imputation__simple(df,imputation_method): +# """ +# Juat as placeholder for now, +# fill all missing with zeros, +# or mean or median imputation +# """ +# missing_values = [-1]#['NaN', -1] +# imp = Imputer(missing_values=missing_values, +# strategy=imputation_method, +# axis=1) +# vals = imp.fit_transform(df.values)#[:,1:]) #The data is only [:,1:]. +# #"Some rows only contain missing values: [ 35 251 281]" +# #But get some rows with all missing vals. Since we don't actualyl care about this and never will use this +# #for now just use the "Page" number as well to avoid this. 
+# +# +# cols = df.columns +# new_df = pd.DataFrame({cols[i]:vals[:,i] for i in range(vals.shape[1])}) +# new_df['Page'] = df['Page'] +# #Put "Page" at left +# cols = new_df.columns.tolist() +# new_df = new_df[cols[-1:]+cols[:-1]] +# new_df.reset_index(drop=True,inplace=True) +# return new_df +# +# +# +# +# +# if (imputation_method == 'median') or (imputation_method == 'mean'): +# df = imputation__simple(df,imputation_method) +# +## if imputation_method == 'lagKmedian': +## #First get rid of the big blocks of mising values [more than 1 seasonality long] +### df = imputation_big_gaps(df) +## #Then deal with the short missing holes +## N_seasons = 4 +## df = imputation_lagKmedian(df,N_seasons) +# +# else: +# raise Exception('not implemented other methods yet') +# +# #First deal with small gaps (missing gaps fewer than e.g. 7 days): +# #df = imputation_small_gaps(df,imputation_method) +# +# #Deal with longer gaps [e.g. by removing enough blocks of length S, where +# #S is the seasonality, to completely get rid of gaps] +# #... +# #df = imputation_big_gaps(df) +# +# #Trim start and end of each series/ to align to get in phase +# #df = +# #... +# +# return df + + + + + +def imputation_lagKmedian_single_series(df,seasonality,N_seasons,out_of_range_fill_value): + """ + Fill in short missing gaps by replacing missing value with: + median over last K weeks for that day. + E.g. Monday is missing, so use median count over 4 previous Mondays + + Intended for short holes. Remove longer ones in chunks of length seasonality. + + For now assuming that big chunk removal is done AFTER this step. + """ + #If the whole series is empty (all -1/NAN): + if np.alltrue(df.drop(columns='Page').values==out_of_range_fill_value): + return df + + max_block_length = seasonality - 1 + offsets = np.arange(-max_block_length,1,1) + + cols = list(df.columns) + cols = cols[:-1]#only the date cols., not the "Page" col +# N_timesteps = len(cols) +# print(cols) +# print(N_timesteps) + c = df['Page'].values + _ = df.drop(columns=['Page']) +# print(_.values) + missing_inds = np.where(_<0.)[1] + + + if missing_inds.size > 0: + #Means there are some missing values + #So scan through the data and fill in bad values, + #starting after the first real data [ignore all -1's that occur before + #time series starts for real] + first_real_ind = np.where(_>=0.)[1][0] + missing_inds = missing_inds[missing_inds>first_real_ind] +# print(missing_inds) + + for mi in missing_inds: + #Only fill in those gaps that are small holes (less than 1 seasonality) + #Check that this particular missing val is not in a missing block + #that has >= 1 seasonality size: +# print(mi) + in_block = False + + for off in offsets: +# print(_.values) + block_inds = np.arange(mi+off,mi+off+seasonality,1) +# print(block_inds) +# print(block_inds, [i in missing_inds for i in block_inds]) + if np.alltrue([i in missing_inds for i in block_inds]): + in_block = True + break + if in_block: + continue + #If it is not in a completely missing block [at least 1 value is recorded], then do lag K median: + prev_K_inds = np.arange(mi-seasonality, max(0, mi - N_seasons*(seasonality+1)), -seasonality).tolist() + t = _[_.columns[prev_K_inds]].values + t = t[t>=0] + imputed_val = np.median(t) + #If all K previous timesteps were -1, then would give nan, so set manually to -1: + if np.isnan(imputed_val):# == np.nan: + imputed_val = out_of_range_fill_value + _[_.columns[mi]] = imputed_val + +# g = np.where(_<0.)[1] +# g = g[g>first_real_ind] +# print(g) +# print('\n'*3) + _['Page'] = c + 
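+    #NOTE: the missing-value checks in this function look for negative sentinels (np.where(_<0.)),
+    #so imputation only actually runs when gaps are encoded as -1; if the caller fills gaps with
+    #np.nan instead (as format_like_Kaggle does with OUT_OF_RANGE_FILL_VALUE = np.NaN), those
+    #comparisons are all False and the series comes back unchanged.
+    #Intended behavior with -1 sentinels on daily data (seasonality=7, N_seasons=4): a single
+    #missing Monday is replaced by the median of the previous 4 non-missing Mondays, while a gap
+    #covering a full week or more is skipped here and left for later block handling.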
return _ + + + + +def data_augmentation(df, jitter_pcts_list=[.05,.01], do_low_pass_filter=True, additive_trend=False): + """ + Do some basic data augmentation with a few different options. + Then output Cartesian product of all these variations as the final set. + + Any missing point (NAN) will be left as NAN, but others will be modified in some way + """ + + def jitter__uniform_pcts(df, jitter_pcts_list, N_perturbations): + """ + On each observed value (non-NAN), add +/- jitter up to some + percent of the observed value. Either positive or negative. + If the count is small, then just leave it, otherwise perturb + (always leaving counts positive). + + Just do at most 1 or 2 percent jitter to not corrupt to much, + ~ magnitude of measurement noise. + """ + page = df['Page'].values[0] + cols = df.columns + x = df.drop(columns=['Page']).values[0] + dflist = [] + for uniform_jitter in jitter_pcts_list: + ids = [str(page) + '__unijit{}_'.format(str(uniform_jitter)) + str(kk+1) for kk in range(N_perturbations)] + _ = np.zeros((N_perturbations,x.size)) + f = lambda i: np.random.uniform(-uniform_jitter*i,uniform_jitter*i) if not np.isnan(i) else np.nan + for kk in range(N_perturbations): + _[kk] = [i + f(i) for i in x]# ).reshape(1,x.size) + d = {cols[i]:_[:,i] for i in range(x.size)} + df = pd.DataFrame(data=d) + df['Page'] = ids + dflist += [df] + df = pd.concat(dflist,axis=0) + df.reset_index(drop=True,inplace=True) + return df + + + def add_trend(df, slopes_list): + """ + On each observed value (non-NAN), add +/- X_t, where X_t is from a + linear trend with given slope, applied across whole series. + + Could change the character of the time series a lot so maybe not so good? + """ + return df + + def low_pass_filter(df, filter_type, kernel_size): + """ + Low-pass filter the data with some kind of kernel, with some kernel size. + + Is going to smooth out the data a lot, not sure if this will change the + time series too much to be good?? + """ + page = df['Page'].values[0] + cols = df.columns + x = df.drop(columns=['Page']).values[0] + ids = [str(page) + '__{0}{1}'.format(filter_type.func_name,kernel_size)] + y = filter_type(x,kernel_size=kernel_size) + _ = np.where(np.invert(np.isnan(x)),y,np.nan) + d = {cols[i]:_[i] for i in range(x.size)} + df = pd.DataFrame(data=d,index=[0]) + df['Page'] = ids + return df + + + #For each method, do 5x random + N_perturbations = 5 + dflist = [df] + if jitter_pcts_list: + dflist += [jitter__uniform_pcts(df, jitter_pcts_list, N_perturbations)] + if do_low_pass_filter: + filter_type = medfilt + kernel_size = 7 + dflist += [low_pass_filter(df, filter_type, kernel_size)] + if additive_trend: + slopes_list = [-1.1, 1.1] + dflist += [add_trend(df, slopes_list)] +# if autoencoder: +# #Run through autoencoder, do VAE, get resulting series + + df = pd.concat(dflist,axis=0) + return df + + + +def format_like_Kaggle(df, mode, myDataDir, imputation_method, sampling_period, do_augmentation, validation_method, train_test_split_date, start_date=None, test_end_date=None, chunk_name=None): + """ + Take my data and format it exactly as needed to use for the Kaggle seq2seq + model [requires making train_1.csv, train_2.csv, key_1.csv, key_2.csv] + [??? or does the seq2seq cTUlly OPEN THE .ZIPS DIRECTLY????????] 
+ """ + + + def make_train_csv(df, mode, save_path, imputation_method, sampling_period, start_date, test_end_date): + """ + Make the train_2.csv + """ + + def aggregate_to_weekly(df, aggregation_type): + """ + Aggregate the data (average it) to downsample + to desired sample period, e.g. daily measurements -> weekly or monthly. + Should smooth out some noise, and help w seasonality. + + **ASSUMES WE HAVE DAILY DATA TO START. + """ + dfc = deepcopy(df) + dfc['month-day'] = dfc['date'].apply(lambda x: str(x)[5:]) + + #Differentiate by year + years = pd.DatetimeIndex(dfc['date']).year + #years -= years.min() + dfc['year'] = years + + #Manually define as below, as generated by pd.date_range('2015-01-01','2015-12-24',freq='W-THU') + fixed_start_dates = ['01-01','01-08','01-15','01-22', + '01-29','02-05','02-12','02-19', + '02-26','03-05','03-12','03-19', + '03-26','04-02','04-09','04-16', + '04-23','04-30','05-07','05-14', + '05-21','05-28','06-04','06-11', + '06-18','06-25','07-02','07-09', + '07-16','07-23','07-30','08-06', + '08-13','08-20','08-27','09-03', + '09-10','09-17','09-24','10-01', + '10-08','10-15','10-22','10-29', + '11-05','11-12','11-19','11-26', + '12-03','12-10','12-17','12-24']#This combines last ~10 days of year together + + + _ = [np.searchsorted(fixed_start_dates,str(x),side='right') - 1 for x in dfc['month-day'].values] + _ = np.clip(_,0,51).astype(int) #clip 52 to 51. This means lumping last few days of year into 2nd last week of year starting 12/24. + _ = [fixed_start_dates[i] for i in _] + #Overwrite the actual date with the predefined week start date: + dfc['week_start_date'] = dfc['year'].map(str) + '-' + _ + + #For each page-year-week, aggregte over the N<=7 days of that week to get the aggregted value: +# _ = dfc.groupby(['Page','year','week_start_date']).agg({'y': [aggregation_type,'size'], 'year':'min', 'date':'min', 'Page':'min', 'week_start_date':'min'}) + _ = dfc.groupby(['Page','week_start_date']).agg({'y': [aggregation_type,'size'], 'date':'min', 'Page':'min', 'week_start_date':'min'}) + new_df = pd.DataFrame({'Page': _['Page']['min'].values, + 'date': _['date']['min'].values, + 'y': _['y'][aggregation_type].values, #This is no longer necessarily an int + 'week_start_date': _['week_start_date']['min'].values + }) + + #After above process, can still have missing blocks for a given time series, so will deal with them later. 
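+            #Worked example of the bucketing above (hypothetical values): with aggregation_type='median',
+            #daily rows for a series dated 2017-03-12 through 2017-03-18 all get week_start_date
+            #'2017-03-12' (the largest fixed_start_date <= each date) and collapse to one row whose y
+            #is the median of those <=7 daily values; days absent from the raw data simply leave
+            #fewer values in that median.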
+ + #now that done, delete uneeded columns + new_df.drop(columns=['date'],inplace=True) + new_df.rename(columns={'week_start_date':'date'},inplace=True) + + return new_df + + + def remove_downsample_columns(df, out_of_range_fill_value): + """ + When doing any kind of daily --> weekly or monthly aggregation, + will have many days that are now empty (all data aggregated to single + date marking 1st date of week / month) + + So remove those obsolete columns + """ + bad_cols = [i for i in df.columns if np.alltrue(df[i].values==out_of_range_fill_value)] + df.drop(columns=bad_cols,inplace=True) + return df + + def make_index_col_left(df): + """ + Make sure order as expected by putting page col left + """ + id_col_name = 'Page' + cols = df.columns.tolist() + cols.remove(id_col_name) + + df = df[ [id_col_name] + cols] + return df + + + #Rename columns to be as in Kaggle data: + df.rename(columns={'id':'Page'},inplace=True) + + #Get earliest and latest date across all series to align times [pad start/end] + earliest, latest = get_earliest_latest_dates(df) + + #Excerpt only the relevant time interval, if manually specified + #Earliest start date is applied to both training and testing + #(testing is a superset of training) + if start_date: + earliest = max(earliest,start_date) + + #In training mode, set th end date by clipoing the data so it does not + #contain the most recent data that is used for TEST set: + if mode=='TRAIN': + latest = min(latest, train_test_split_date) + + if mode=='TEST': + #In TEST mode, to have a COMPLETELY distinct test set, start from day after last day of taining set: + #(this means in TEST phase, not even the known history will overlap with the training set, + #which arguably wiould be ok as long as the horizon is completely outside the training data, + #but to be extra conservative, do this): + assert (earliest < train_test_split_date), 'TRAIN end date (/TEST start date) must be after start of data' + + #For backtest chunking vs. disjoint train-test split, test set date range is defined differently: + if validation_method=='disjoint': + next_day = pd.to_datetime(train_test_split_date) + pd.Timedelta(1,unit='D') + next_day_string = next_day.strftime('%Y-%m-%d') + earliest = max(earliest,next_day_string) + #for backtesting in chunks, always have the same start date, which is same as the train start date: + #Just resuse the already defined earliest date from training set + #elif validation_method=='backtest_chunks': + # earliest = max(earliest,next_day_string) + + + + if test_end_date: + #In TEST mode, if there is a manually defined end date, clip there: + latest = min(latest,test_end_date) + assert (latest > train_test_split_date), 'TEST end date must be after TRAIN end date' + + + idx = pd.date_range(earliest,latest) #!!!!!! fro now doing daily. When doing weekly also keep with default freq='D' . If change to 'W' alignment gets messed up. Just do daily 'D', then later can correct easily. + OUT_OF_RANGE_FILL_VALUE = np.NaN #0 #-1 + + + #Do aggregation from DAILY --> WEEKLY before doing any kind of imputation + if sampling_period=='weekly': + AGGREGATION_TYPE = 'median' + df = aggregate_to_weekly(df, AGGREGATION_TYPE) + + + #Some id's [15,16 in training; multiple in testing] have their missing values recorded as "-1" + #vs. 
later id's have their missing values simply missing from the original csv + #So for those id's that actually have -1, convert to NAN first: + df.replace(-1.,np.nan,inplace=True) + + + #Reorganize data for each id (->"Page") + unique_ids = pd.unique(df['Page']) + df_list = [] + for i, u in enumerate(unique_ids): + d = df.loc[df['Page']==u] + #Nan / zero pad start and end date range if needed {end missing} + dates = pd.Series(d['y'].values,index=d['date']) + dates.index = pd.DatetimeIndex(dates.index) + dates = dates.reindex(idx, fill_value=OUT_OF_RANGE_FILL_VALUE) + dates.index = pd.to_datetime(dates.index).strftime('%Y-%m-%d') + dd = pd.DataFrame(dates).T + dd['Page'] = u + + print(i,u, 'of {}'.format(unique_ids[-1])) + #Only do imputation on training data, NOT on test data. + if mode=='TRAIN': + if imputation_method=='lagKmedian': + if sampling_period=='daily': + N_seasons = 4 + seasonality = 7 + elif sampling_period=='weekly': + N_seasons = 4 + seasonality = 1 + dd = imputation_lagKmedian_single_series(dd,seasonality,N_seasons,OUT_OF_RANGE_FILL_VALUE) + + #Data augmentation + if do_augmentation: + dd = data_augmentation(dd) + + df_list.append(dd) + + df = pd.concat(df_list,axis=0) + #cols = df.columns.tolist() + #df = df[cols[-1:]+cols[:-1]] + df.reset_index(drop=True,inplace=True) + + + #If we did aggregation, then above reogranization will have many of the columns Nan / -1, + #since e.g. went from daily to weekly, then 6 days of the week will look empty. So remove them. + if sampling_period=='weekly': + AGGREGATION_TYPE = 'median' + df = remove_downsample_columns(df, OUT_OF_RANGE_FILL_VALUE) + + + + + # ============================================================================= + # Just for analysis: look at kinds of gaps in series, for DAILY data + # ============================================================================= + #VERBOSE = False + #if VERBOSE: + # __missing_vals_distribution(df,OUT_OF_RANGE_FILL_VALUE) + + + +# #No longer use this: imputation done per series at creation +# #Imputation, dealing with missing seasonality blocks / out of phase +# if imputation_method=='median' or imputation_method=='mean': +# df = do_imputation(df,imputation_method) +# #Could do impoutation then downsampling, vs. downsampling then imputation ... unclear which is better here in general. +# #for now assume we do ipmutation THEN aggregation: +# #df = aggregate(df,sampling_period) + + + #Reorder some things just in case + df = make_index_col_left(df) + print(df) + + #SHould end up with a csv that is rows are series (each id), cols are dates + #:eftmost col should be "Pages" to be same as Kaggle format + df.to_csv(save_path,index=False) + return df + + + + + def make_key_csv(df): + """ + Make the key_1.csv, key_2.csv + May actually not need this??? + """ + #save out + return + + + #Make the train csv [for now just do 1, ignore the train 2 part ???] + #save_path = os.path.join(os.path.split(myDataDir)[0],f"train_2[ours_{sampling_period}].csv") + suffix = 'TEST' if mode=='TEST' else 'TRAIN' + ind = chunk_name if chunk_name else '' +# augmented = 'augmented' if do_augmentation else '' +# save_path = os.path.join(os.path.split(myDataDir)[0],"ours_{}{}__{}{}.csv".format(sampling_period,augmented,suffix,ind)) + save_path = os.path.join(os.path.split(myDataDir)[0],"ours_{}_{}{}.csv".format(sampling_period,suffix,ind)) + df = make_train_csv(df, mode, save_path, imputation_method, sampling_period, start_date, test_end_date) + + #For the prediction phase, need the key ???? 
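+    #(The key files appear to be needed only for building the Kaggle competition submission in
+    #PREDICT.py, via key_2.csv; for our own data they are never used, so the call below stays disabled.)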
+# make_key_csv(df) + + + + + return df + + + + + + + + + + + +if __name__ == '__main__': + + # ============================================================================= + # PARAMETERS + # ============================================================================= + RANDOM_SEED = 123456 + #Seed random number generator [holdout ID's, data augmentation] + np.random.seed(RANDOM_SEED) + + + # TOTAL COMPLETED TRIPS: + #myDataDir_TRAIN = r"/Users/kocher/Desktop/forecasting/exData/totalCTDaily" + myDataDir_TRAIN = r"/Users/kocher/Desktop/forecasting/exData/totalCTDaily___2018"#Since the test data is just the same data, but a superset, just use it for consistency + myDataDir_TEST = r"/Users/kocher/Desktop/forecasting/exData/totalCTDaily___2018" + IMPUTATION_METHOD = 'lagKmedian' #'median' #'STL' #'lagKmedian' #None + REMOVE_ID_LIST = []#[3,4]#id's for locations that are no longer useful + HOLDOUT_ID_LIST = list(np.random.choice(1800,30,replace=False)) #As a holdout set, exclude a random ~30 ID's (approximate, since not all ID's 1 to N are present, so will be fewer than Nchose) + SAMPLING_PERIOD = 'daily' #'daily', 'weekly', 'monthly' + DO_AUGMENTATION = True #False #True + + #Regardless of validation method, just use this + START_DATE = '2015-01-01' #None + + #Whether to do a single disjoint train-test split, vs. backtest in chunks that partially overlap + VALIDATION_METHOD = 'backtest_chunks' #'disjoint' + + + + + + # ============================================================================= + # MAIN + # ============================================================================= + + + print('DO_AUGMENTATION',DO_AUGMENTATION) + print('RANDOM_SEED',RANDOM_SEED) + print('REMOVE_ID_LIST',REMOVE_ID_LIST) + print('HOLDOUT_ID_LIST',HOLDOUT_ID_LIST) + print('IMPUTATION_METHOD',IMPUTATION_METHOD) + print('myDataDir_TRAIN',myDataDir_TRAIN) + print('myDataDir_TEST',myDataDir_TEST) + print('SAMPLING_PERIOD',SAMPLING_PERIOD) + print('VALIDATION_METHOD',VALIDATION_METHOD) + + + + + + + + #If doing a single test set, completely disjoint from training set: + if VALIDATION_METHOD == 'disjoint': + + TEST_END_DATE = '2018-07-05' #None #'2018-07-05' just trim off 2 rightmost days since many cities NAN on 7/7/18 + TRAIN_TEST_SPLIT_DATE = '2017-04-30' #The last day date to include in training set [and 1st NEW day of test set will be the next day.] 
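+        #Concretely (assuming the raw csvs span this full range): the TRAIN csv then covers
+        #2015-01-01 through 2017-04-30, and the TEST csv starts the next day (2017-05-01) and
+        #runs through 2018-07-05, so the two sets share no dates at all.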
+ print('START_DATE',START_DATE) + print('TRAIN_TEST_SPLIT_DATE',TRAIN_TEST_SPLIT_DATE) + print('TEST_END_DATE',TEST_END_DATE) + + #For TRAIN and TEST data + modes = ['TRAIN','TEST'] + for i, myDataDir in enumerate([myDataDir_TRAIN,myDataDir_TEST]): + mode=modes[i] + print(mode) + #Don't do augmentation for test phase [test only on real] + #Actually, just leave it same as however things were trained, but then separate performance statistics later so can still see how this affects things + #if i==1: + # DO_AUGMENTATION=False + + #Load + df = load_my_data(myDataDir) + + #Remove any bad/irrelevant cities + df = remove_cities(df,REMOVE_ID_LIST) + + #Remove random holdout IDs [for TRAIN only: still want to test on them]: + if mode=='TRAIN': + df = remove_cities(df,HOLDOUT_ID_LIST) + + #Put into same format as used by Kaggle, save out csv's + df = format_like_Kaggle(df, mode, myDataDir, IMPUTATION_METHOD, SAMPLING_PERIOD, DO_AUGMENTATION, VALIDATION_METHOD, TRAIN_TEST_SPLIT_DATE, start_date=START_DATE, test_end_date=TEST_END_DATE, chunk_name=None) + + print('Finished with ', mode) + + + + #If doing a single test set, completely disjoint from training set: + elif VALIDATION_METHOD == 'backtest_chunks': + + chunks_dict = {'set1':(START_DATE,'2017-07-05','2017-10-05'), + 'set2':(START_DATE,'2017-10-05','2018-01-05'), + 'set3':(START_DATE,'2018-01-05','2018-04-05'), + 'set4':(START_DATE,'2018-04-05','2018-07-05'), + } + + print(chunks_dict) + for k,v in chunks_dict.items(): + + chunk_name=k + start = v[0] #train and test both use same start date in this backtest mode + + train_last_day = v[1] + test_end = v[2] + + print('chunk name : ', chunk_name) + print('START_DATE',start) + print('TRAIN_TEST_SPLIT_DATE',train_last_day) + print('TEST_END_DATE',test_end) + + #For TRAIN and TEST data + modes = ['TRAIN','TEST'] + for i, myDataDir in enumerate([myDataDir_TRAIN,myDataDir_TEST]): + mode=modes[i] + print(mode) + #Actually, just leave it same as however things were trained, but then separate performance statistics later so can still see how this affects things + #if i==1: + # DO_AUGMENTATION=False + + #Load + df = load_my_data(myDataDir) + + #Remove any bad/irrelevant cities + df = remove_cities(df,REMOVE_ID_LIST) + + #Remove random holdout IDs [for TRAIN only: still want to test on them]: + if mode=='TRAIN': + df = remove_cities(df,HOLDOUT_ID_LIST) + + #Put into same format as used by Kaggle, save out csv's + df = format_like_Kaggle(df, mode, myDataDir, IMPUTATION_METHOD, SAMPLING_PERIOD, DO_AUGMENTATION, VALIDATION_METHOD, train_last_day, start_date=start, test_end_date=test_end, chunk_name=chunk_name) + + print('Finished with ', mode) \ No newline at end of file diff --git a/QUICKLOOK.py b/QUICKLOOK.py new file mode 100755 index 0000000..8f22dbe --- /dev/null +++ b/QUICKLOOK.py @@ -0,0 +1,34 @@ +# -*- coding: utf-8 -*- +""" +Created on Mon Jun 18 14:03:35 2018 + +@author: gk +""" + +#For the KAGGLE data, looks like most series (~2/3) are dense [no sparsity] +#important because in Arturius's script there is threshold on #0's allowed, default he seems to use is not allow any 0's +#so then he is using ~2/3 of the time series ??? 
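+#(train_1.csv is in the Kaggle layout used throughout this repo: one row per series, a leftmost
+#'Page' identifier column, and one column per daily date.)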
+import numpy as np
+import matplotlib.pyplot as plt
+import pandas as pd
+
+
+filepath = r"/......./kaggle-web-traffic-master/data/train_1.csv"
+
+df = pd.read_csv(filepath)
+
+rows = df.values
+
+x = [(i>0.).sum() - 1 for i in rows]
+ndays = max(x)
+x = [float(i) / float(ndays) for i in x]
+
+x.sort()
+
+#Sorted plot of percent dense [so about 2/3 of the 145K Kaggle series are dense]
+plt.figure()
+plt.plot(x)
+plt.show()
\ No newline at end of file
diff --git a/RUN_ALL_PREDICTIONS.py b/RUN_ALL_PREDICTIONS.py
new file mode 100755
index 0000000..4a699eb
--- /dev/null
+++ b/RUN_ALL_PREDICTIONS.py
@@ -0,0 +1,482 @@
+import tensorflow as tf
+
+#import matplotlib
+#matplotlib.use('Agg')
+#import matplotlib.pyplot as plt
+
+import os
+import pandas as pd
+import numpy as np
+from trainer import predict
+from hparams import build_hparams
+import hparams
+
+from make_features import read_all
+
+import pickle
+import time
+from pandas import ExcelWriter
+
+from collections import defaultdict
+
+
+# =============================================================================
+# PARAMETERS
+# =============================================================================
+#For histories, we care most about shorter series, so sample lower numbers more densely
+HISTORY_SIZES=[7,8,10,12,15,20,30,50,100]#,200,360]
+HORIZON_SIZES=[7,8,10,12,15,20,30,60] #If doing the Amazon decoder, only do the single fixed length that was used for K [e.g. 60 days]
+EVAL_STEP_SIZE=4 #Step size for evaluation. 1 means use every single day as a FCT to evaluate on. E.g. 3 means step forward 3 timesteps between each FCT to evaluate on.
+PREDICT_MODE = 'backtest' #'disjoint'
+NAMES = ['TESTset1', 'TESTset2', 'TESTset3', 'TESTset4']
+
+FEATURES_SET = 'full' # 'arturius' 'simple' 'full'
+SAMPLING_PERIOD = 'daily'
+DATA_TYPE = 'ours' #'kaggle' #'ours'
+Nmodels = 3
+PARAM_SETTING = 'encdec' #Which of the parameter settings to use [s32 is the default Kaggle one, with a few things modified as I want]
+PARAM_SETTING_FULL_NAME = hparams.params_encdec #The parameter set corresponding to PARAM_SETTING. The mapping is defined at the end of hparams.py in "sets = {'s32':params_s32,..."
+OUTPUT_DIR = 'output'
+
+#If doing quantiles (and no SMAPE-optimized point estimate), then choose one of the quantiles to use as the point estimate
+#E.g. 40, 45 or 47 since positively biased [otherwise use 50]
+Q_pnt_est = 0.45
+
+SAVE_PLOTS = False
+
+
+
+
+# =============================================================================
+# MAIN
+# =============================================================================
+
+
+# =============================================================================
+# Performance Metrics
+# =============================================================================
+def smape(true, pred):
+    summ = np.abs(true) + np.abs(pred)
+    smape = np.where(summ == 0, 0, np.abs(true - pred) / summ)
+    #return np.mean(kaggle_smape) * 200
+    return smape * 200
+
+def mean_smape(true, pred):
+    raw_smape = smape(true, pred)
+    masked_smape = np.ma.array(raw_smape, mask=np.isnan(raw_smape))
+    return masked_smape.mean()
+
+def bias(true, pred):
+    """
+    Check if the forecasts are biased up or down
+
+    All of the predictions have already been clipped to a minimum of 0.
+    Actual is always nonnegative (and 0 means missing, so it can be masked),
+    so if pred+true is 0 it means missing and those entries can be ignored.
+    """
+    summ = pred + true
+    bias = np.where(summ == 0, 0, (pred - true) / summ)
+    return 100.
* bias + +def mean_bias(true, pred): + raw_bias = bias(true, pred) + masked_bias = np.ma.array(raw_bias, mask=np.isnan(raw_bias)) + return raw_bias.mean() + + + +def do_predictions_one_setting(history,horizon,backoffset,TEST_dir,save_plots,n_series,chunk): + + # ============================================================================= + # + # ============================================================================= + #read_all funcion loads the (hardcoded) file "data/all.pkl", or otherwise train2.csv + print('loading data...') + + df_all = read_all(DATA_TYPE,SAMPLING_PERIOD,f'TEST{chunk}') + print('df_all.columns') + print(df_all.columns) +# filename = f'train_2_{data_type}_{sampling_period}' +# df = read_file(filename) + + + batchsize = n_series #For simplicity, just do all series at once if not too many for memory + print('batchsize',batchsize) + # ============================================================================= + # + # ============================================================================= + prev = df_all#.loc[:,:'2017-07-08'] + paths = [p for p in tf.train.get_checkpoint_state(f'data/cpt/TRAIN{chunk}').all_model_checkpoint_paths] + #tf.reset_default_graph() + #preds = predict(paths, default_hparams(), back_offset=0, + # n_models=3, target_model=0, seed=2, batch_size=2048, asgd=True) + t_preds = defaultdict(lambda:[]) if hparams.DO_QUANTILES else [] + for tm in range(Nmodels): + tf.reset_default_graph() + _ = predict(FEATURES_SET, SAMPLING_PERIOD, paths, TEST_dir, build_hparams(PARAM_SETTING_FULL_NAME), history, horizon, back_offset=backoffset, return_x=False, + n_models=Nmodels, target_model=tm, seed=2, batch_size=batchsize, asgd=True) + if hparams.DO_QUANTILES: + for kk, vv in _.items(): + t_preds[kk].append(vv) + else: + t_preds.append(_) + + + # ============================================================================= + # average the N models predictions + # ============================================================================= + if hparams.DO_QUANTILES: + preds = {} + for kk, vv in t_preds.items(): + preds[kk] = sum(vv)/float(Nmodels) + else: + preds = sum(t_preds)/float(Nmodels) + preds = {'point_est':preds} + + + + #Do for every quantile + final_preds = {} + for kk, vv in preds.items(): + + # ============================================================================= + # look at missing + # ============================================================================= + if DATA_TYPE=='kaggle': + missing_pages = prev.index.difference(preds.index) + print('missing_pages',missing_pages) + # Use zeros for missing pages + rmdf = pd.DataFrame(index=missing_pages, + data=np.tile(0, (len(preds.columns),len(missing_pages))).T, columns=preds.columns) + f_preds = preds.append(rmdf).sort_index() + elif DATA_TYPE=='ours': +# f_preds = preds + f_preds = vv + # Use zero for negative predictions + f_preds[f_preds < 0.5] = 0 + # Rouns predictions to nearest int + f_preds = np.round(f_preds).astype(np.int64) + + final_preds[kk] = f_preds + +# print('final_preds',final_preds) + + + + + # ============================================================================= + # save out all predictions all days (for our stuff will be relevant, for his Kaggle maybe just needed one day) + # ============================================================================= + #firstK = 1000 #for size issues, for now while dev, just a few to look at + #ggg = f_preds.iloc[:firstK] + #ggg.to_csv('data/all_days_submission.csv.gz', compression='gzip', index=False, header=True) + 
#Instead of saving indivual, just wait and append and look at finals. +# f_preds.to_csv(f'{OUTPUT_DIR}/all_predictions_ours.csv.gz', compression='gzip', index=False, header=True) + + + + + # ============================================================================= + # visualize to do wuick check + # ============================================================================= + if save_plots: + randomK = 1000 + print('Saving figs of {} time series as checks'.format(randomK)) + pagenames = list(f_preds.index) + pages = np.random.choice(pagenames, size=min(randomK,len(pagenames)), replace=False) + N = pages.size + for jj, page in enumerate(pages): + print(f"{jj} of {N}") + plt.figure() + if DATA_TYPE=='kaggle': + prev.loc[page].fillna(0).plot()#logy=True) + f_preds.loc[page].fillna(0).plot(logy=True) + elif DATA_TYPE=='ours': + prev.loc[int(page)].plot() + f_preds.loc[page].plot() + plt.title(page) + if not os.path.exists(OUTPUT_DIR): + os.mkdir(OUTPUT_DIR) + pathname = 'ddddddddddd'#os.path.join(OUTPUT_DIR, 'fig_{}.png'.format(jj)) + plt.savefig(pathname) + plt.close() + + + + #Cannot view on the AWS so move to local: + #zip -r output.zip output + #cp output.zip /home/...../sync + return final_preds + + + +def get_data_timesteps_Nseries(df_path): + """ + Get the data_timesteps value from the TEST set data. + Because every day will be used, it is just the number of days in the df. + + And get number of time series to prdict on [number of rows], to use as batchsize + """ + df = pd.read_csv(df_path) + columns = list(df.columns) + columns.remove('Page') + return len(columns), len(df) + + + +def get_data_timesteps_Nseries__backtest(test_path,train_path): + """ + For backtest chunk mode only. + Get the number of data timesteps (which potentially varies per testset1,2,3,4), + in order to determine backoffset range. + + Because in this mode the TEST set also includes the TRAIN ste in it, cannot + just use length of TEST set alone to get datatimesteps. + """ + test = pd.read_csv(test_path) +# test_day = test.columns[-1] + train = pd.read_csv(train_path) +# train_day = train.columns[-1] + + #Assuming consecutive days, just get diff of number columns: + data_timesteps = len(test.columns) - len(train.columns) + + #Depending if did holdout id's, then TEST would have extra id's not in TRAIN + #For batchsize, using N_series as number of rows of TEST set. + #Since metrics are later made from dicts, if an ID is predicted on more than once, + #is ok, since would have same key in dict and only be there once anyway. 
+ N_series = len(test) + + return data_timesteps, N_series + + + + + + +def SaveMultisheetXLS(list_dfs, list_sheetnames, xls_path): + """ + xls_path - must be .xls or else will not save sheets properly + """ + writer = ExcelWriter(xls_path) + for s in range(len(list_dfs)): + list_dfs[s].to_excel(writer,list_sheetnames[s],index=False) + writer.save() + + + + + +if __name__ == '__main__': + + hparams = build_hparams(PARAM_SETTING_FULL_NAME) + + + #For the 4 chunk backtesting performance assessment + for name in NAMES: + + print('name: ',name) + chunk = name.replace('TEST','') + TEST_DF_PATH = f"data/ours_daily_{name}.csv" + TEST_dir = f"data/{name}" + TRAIN_DF_PATH = TEST_DF_PATH.replace('TEST','TRAIN') + print('TEST_DF_PATH',TEST_DF_PATH) + print('TEST_dir',TEST_dir) + print('TRAIN_DF_PATH',TRAIN_DF_PATH) + + groundtruth = pd.read_csv(TEST_DF_PATH) + groundtruth.sort_values(['Page']) + print('groundtruth',groundtruth) + + if not os.path.exists(OUTPUT_DIR): + os.makedirs(OUTPUT_DIR) + + if PREDICT_MODE=='disjoint': + data_timesteps, N_series = get_data_timesteps_Nseries(TEST_DF_PATH) + elif PREDICT_MODE=='backtest': + data_timesteps, N_series = get_data_timesteps_Nseries__backtest(TEST_DF_PATH,TRAIN_DF_PATH) + +# #For the direct decoder, only need to do single prediction (all smaller horizons contained in the max horizon) +# if hparams.MLP_DIRECT_DECODER: +# #HORIZON_SIZES = max(HORIZON_SIZES) +# HORIZON_SIZES = [hparams.horizon_window_size_minmax[1]] + + + hist_horiz__all = {} + t0 = time.clock() + for history in HISTORY_SIZES: + for horizon_size in HORIZON_SIZES: + print('HISTORY ',history, 'of ', HISTORY_SIZES) + print('HORIZON ',horizon_size, 'of ', HORIZON_SIZES) + + + + #For the direct decoder, need to always use the same K horizon, + #but then once the predictions are made, if want to look at horizon < K, then just excerpt leftmost horizon points. + #Technically, for this direct decoder, only need to do single prediction (all smaller horizons contained in the max horizon) + #which would speed things up a lot. However, would require reorganizing code a bit, so for easiness, just use current code with duplication. + if hparams.MLP_DIRECT_DECODER: + horizon = hparams.horizon_window_size_minmax[1] + print('horizon used for direct decoder: {} instead of {}'.format(horizon,horizon_size)) + else: + horizon = horizon_size + + + #For the disjoint mode, the test set does not overlap art all w the train set. + #The history + horizon window must completely fit in the test set alone. + #vs. + #in backtest chunk mode, test set include full train set, but + #horizon window always starts after the train set (so horizon + #is fully inside test set). SO for backtest chunk mode, irrelevant + #what history + horizon is, only matters that the horizon is fully inside TEST set. + if (PREDICT_MODE=='disjoint') and (history+horizon >= data_timesteps): + print(f'history+horizon ({history+horizon}) >= data set size ({data_timesteps})') + continue + if (PREDICT_MODE=='backtest') and (horizon > data_timesteps): + print(f'horizon ({horizon}) > test region size ({data_timesteps})') + continue + + #Get the range of values that will step through for + if (PREDICT_MODE=='disjoint'): + offs = [i for i in range(horizon, data_timesteps - history +1, EVAL_STEP_SIZE)] + if (PREDICT_MODE=='backtest'): + offs = [i for i in range(horizon, data_timesteps+1, EVAL_STEP_SIZE)] + + + + + + dflists = [] #defaultdict(lambda:[])# if hparams.DO_QUANTILES else [] #!!!!!!! 
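            # The backoffset grid above, restated as a tiny standalone helper
            # (a sketch, not used by the pipeline) to make the two modes explicit:
            # 'disjoint' requires the whole history+horizon window to fit inside the
            # TEST file, while 'backtest' only requires the horizon to fit inside the
            # TEST-only region, so history drops out of the upper bound.
            #
            #   def backoffset_grid(mode, data_timesteps, history, horizon, step):
            #       upper = data_timesteps - history + 1 if mode == 'disjoint' else data_timesteps + 1
            #       return list(range(horizon, upper, step))
            #
            #   # e.g. (hypothetical numbers) backtest mode, 95-day TEST-only region,
            #   # horizon=60, EVAL_STEP_SIZE=10 -> [60, 70, 80, 90]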
+ for gg, backoffset in enumerate(offs): + print('backoffset ',backoffset, 'of ', offs) + f_preds = do_predictions_one_setting(history,horizon,backoffset,TEST_dir,SAVE_PLOTS,N_series,chunk) +# print('f_preds',f_preds) + + #If doing the direct decoder, even though the model is built to predict horizon K () + if hparams.MLP_DIRECT_DECODER: + for kk,vv in f_preds.items(): + K_cols = f_preds[kk].columns[:horizon_size] + f_preds[kk] = vv[K_cols] + + + #Save out some example quantile predictions to make sure they look ok + #!!!!! This is not all of the quantiles: it is only the first backoffset. But can use to plot later + if gg==0: + for kk,vv in f_preds.items(): + __out = os.path.join(OUTPUT_DIR, f"{kk}__{history}_{horizon_size}_{chunk}.csv") + vv.to_csv(__out) + + #Just focus on point estimates for now: using q45 + if hparams.DO_QUANTILES: + f_preds = f_preds[Q_pnt_est] + else: + f_preds = f_preds['point_est'] + + + + + #COlumns are same for all quantiles/point estimates, so just use point_est to get dates: + #cols = f_preds[Q_pnt_est].columns if hparams.DO_QUANTILES else f_preds['point_est'].columns + #cols = f_preds.values()[0].columns + cols = f_preds.columns + dates = [i.strftime('%Y-%m-%d') for i in cols] +# print(dates) + + + + + #For each series + for jj in range(len(f_preds)): + series = f_preds.iloc[jj] + _id = series.name + true = groundtruth[groundtruth['Page'].astype(str) ==_id] + + + first_pred_day = dates[0] + d1 = pd.date_range(first_pred_day,first_pred_day)[0] - pd.Timedelta(history,unit='D') + history_dates = pd.date_range(start=d1, end=first_pred_day, freq='D')[:-1] #!!!!!! asuming daily sampling... + history_dates = [i.strftime('%Y-%m-%d') for i in history_dates] + history_missing_count = np.isnan(true[history_dates].values[0]).sum() + # print('history_missing_count',history_missing_count) + # print('true',true) + true = true[dates].values[0] + horizon_missing_count = np.isnan(true).sum() + # print('horizon_missing_count',horizon_missing_count) + + #Get smape, mae, bias over this prediction + smp = mean_smape(true, series.values) + # mae = asdasdasd + bi = mean_bias(true, series.values) + # print(smape,bias) + metr_dct = {'SMAPE':smp, + 'bias':bi, + #'MAE':mae, + 'predict_start_date':dates[0], + 'predict_end_date':dates[-1], + 'history_missing_count':history_missing_count, + 'horizon_missing_count':horizon_missing_count + } + #When doing quantiles, also see what fraction is in that quantile +# if hparams.DO_QUANTILES: +# metr_dct[asd] = asdasd +# metr_dct['sharpness'] = 0. +# #MSIS ... + + hist_horiz__all[(history,horizon_size,backoffset,_id)] = metr_dct + # print(hist_horiz__all) + + + + + #For saving out predictions: + dates = [i.strftime('%m/%d/%Y') for i in cols] + d = {cols[i]:dates[i] for i in range(len(cols))} + f_preds.rename(columns=d,inplace=True) + f_preds['Page'] = f_preds.index.values + #Depending on missing data in the test set in the history window for this backoffset, + #it oculd be that that particular id did not pass the train completeness threshold. + #Then it will not be included, but the batchsize will still be len(df), so to fill that missing + #id, it will repeat id's that already had predictions. 
THey will be identical, + #so just take the 1st occurrence for those repeated id's: + df = [] + u_ids = np.unique(f_preds['Page'].values) + for u in u_ids: + s = f_preds[f_preds['Page']==u] + if len(s)>1: + s = s.head(1) + df.append(s) + f_preds = pd.concat(df,axis=0) + cols = list(f_preds.columns) + cols.remove('Page') + cols = ['Page'] + cols + f_preds = f_preds[cols] +# print(f_preds) + + dflists.append(f_preds) + #Care about the metrics within different partitions: + #Beside just history and horizon size, also consider: + #real vs. synthetic augmented series + #training ID vs. new ID only in TEST set + #series contains holiday vs. only non-holidays + #day of week + + savename = f"{str(history)}_{str(horizon_size)}_{name}.xls" + savename = os.path.join(OUTPUT_DIR,savename) + sheetnames = [str(i) for i in offs] + SaveMultisheetXLS(dflists, sheetnames, savename) + #each sheet is for a single backoffset, so each sheet contains all ~1800 id's + + +# print(hist_horiz__all) + t1 = time.clock() + print('elapsed time: ',t1-t0) + #Now that all metrics stored in dict, save dict, and analyze further + #pickle ... hist_horiz__all + # print(hist_horiz__all) + dict_savename = os.path.join(OUTPUT_DIR,f"hist_horiz__{name}.pickle") + with open(dict_savename, "wb") as outp: + pickle.dump(hist_horiz__all, outp)#, protocol=2) \ No newline at end of file diff --git a/Readme.md b/Readme.md old mode 100644 new mode 100755 index f7ae3e9..4d7e117 --- a/Readme.md +++ b/Readme.md @@ -1,37 +1,66 @@ -# Kaggle Web Traffic Time Series Forecasting -1st place solution +# RNN-based Encoder-Decoder for Time Series Forecasting w/ Quantiles + +Based on Arturus' +Kaggle Web Traffic Time Series Forecasting +1st place solution +https://github.com/Arturus/kaggle-web-traffic ![predictions](images/predictions.png) +See also [detailed model description](how_it_works.md) -Main files: - * `make_features.py` - builds features from source data - * `input_pipe.py` - TF data preprocessing pipeline (assembles features - into training/evaluation tensors, performs some sampling and normalisation) - * `model.py` - the model - * `trainer.py` - trains the model(s) - * `hparams.py` - hyperpatameter sets. - * `submission-final.ipynb` - generates predictions for submission - -How to reproduce competition results: -1. Download input files from https://www.kaggle.com/c/web-traffic-time-series-forecasting/data : -`key_2.csv.zip`, `train_2.csv.zip`, put them into `data` directory. -2. Run `python make_features.py data/vars --add_days=63`. It will -extract data and features from the input files and put them into -`data/vars` as Tensorflow checkpoint. -3. Run trainer: -`python trainer.py --name s32 --hparam_set=s32 --n_models=3 --name s32 --no_eval --no_forward_split - --asgd_decay=0.99 --max_steps=11500 --save_from_step=10500`. This command - will simultaneously train 3 models on different seeds (on a single TF graph) - and save 10 checkpoints from step 10500 to step 11500 to `data/cpt`. - __Note:__ training requires GPU, because of cuDNN usage. CPU training will not work. - If you have 3 or more GPUs, add `--multi_gpu` flag to speed up the training. One can also try different -hyperparameter sets (described in `hparams.py`): `--hparam_set=definc`, -`--hparam_set=inst81`, etc. -Don't be afraid of displayed NaN losses during training. This is normal, -because we do the training in a blind mode, without any evaluation of model performance. -4. Run `submission-final.ipynb` in a standard jupyter notebook environment, -execute all cells. 
Prediction will take some time, because it have to -load and evaluate 30 different model weights. At the end, -you'll get `submission.csv.gz` file in `data` directory. +----------------------------------- -See also [detailed model description](how_it_works.md) +GK modifications for own forecasting application: + +1) Architecture improvements: + - Recursive feedforward postprocessor: after getting sequence of predictions from RNN-based decoder, refine predictions in 2 layer MLP using nearby timesteps predictions + features + context. + - give encoded representation vector as context to every decoder timestep + - K step lookback: ideally the RNN would learn a hidden state representation that ~completely describes state of the system. In realiy, that may be too much to expect. In addition to previous timestep prediction y_i-1, also feed in y_i-2,...,y_i-K for K-step lookback. [~same as using lagged features] +2) Performance Analysis: + - performance analysis of test set SMAPE as function of history/horizon window sizes [randomized uniformly in training over all min-max range of history/horizon window sizes] + - +2) More features, relevant to my data. More focus on seasonalities, and "spiral encoding" for holidays. Automated data augmentation. +3) Dealing with holes/sparsity as in my data. + + + +The complete pipeline is: + +1. $source activate gktf. #previously set up a conda environment w/ Python 3.6, tensorflow 1.4.0, to match same versions as Kaggle solution +2. $cd ..../kaggle-web-traffic +3. $python PREPROCESS.py #Maximize reuse of existing architecture: just put my data in exact same format as Kaggle competition csv's +4. $./MAKEFEATURES_TRAIN_ALL.sh #For backtestign in chunks method [4 partially overlapping train-test set pairs] +5. $python RUN_ALL_PREDICTIONS.py #Run predictions for every ID over triplets of (history, horizon, start point) +6. $python PERFORMANCE_HEATMAPS.py #Analyze the prediction metrics across different dimensions +7. $python quantile_plots.py #FOr a subet of the predictions, get an idea what the quantiles look like + + + + + +--------------------------------------- +#Just in case making new features +cd data +rm -R vars* +rm -R cpt/ +rm -R cpt_tmp/ +rm -R logs/ +rm *.pkl +cd .. +ll data/ + +python3 make_features.py data/TRAINset1 ours daily full --add_days=0 +python3 make_features.py data/TESTset1 ours daily full --add_days=0 + +#For backtesting in 4 chunks, no longer do this. Run the script MAKEFEATURES_TRAIN_ALL.py to automate feature making and training all 4 chunks. +python3 trainer.py full daily --name=TRAINset1 --hparam_set=encdec --n_models=3 --asgd_decay=0.99 --max_steps=11500 --save_from_step=3 --patience=5 --max_epoch=50 --save_epochs_performance + + +---------------------------------------------------------------------------------------------------------------------------------------------------------- +To do: +2. for weekly. monthly inputs, need to change few places in tensorflow code +3. Prediction intervals +4. Architecture improvements: bi enc, dilated; randomly dilated; randomly dilated with bounds per layer +4. MLP direct multihorizon +5. custom attention [e.g. position specific] +6. 
VAE aug \ No newline at end of file diff --git a/SGDN_HD_optimizer.py b/SGDN_HD_optimizer.py new file mode 100755 index 0000000..549e42b --- /dev/null +++ b/SGDN_HD_optimizer.py @@ -0,0 +1,76 @@ +#Copy paste from https://github.com/zadaianchuk/HyperGradientDescent/blob/master/SGDN_HD_optimizer.py +#Hypergradient Descent Optimizer + + +from __future__ import division + +import tensorflow as tf + +class MomentumSGDHDOptimizer(tf.train.GradientDescentOptimizer): + + def __init__(self, alpha_0, beta =10**(-7), name="HGD", mu=0.95, type_of_learning_rate ="global"): + super(MomentumSGDHDOptimizer, self).__init__(beta, name=name) + self._mu = mu + self._alpha_0 = alpha_0 + self._beta = beta + self._type = type_of_learning_rate + + + def minimize(self, loss, global_step): + + # Algo params as constant tensors + mu = tf.convert_to_tensor(self._mu, dtype=tf.float32) + alpha_0=tf.convert_to_tensor(self._alpha_0, dtype=tf.float32) + beta=tf.convert_to_tensor(self._beta, dtype=tf.float32) + + var_list = tf.trainable_variables() + + # create and retrieve slot variables for: + # direction of previous step + ds = [self._get_or_make_slot(var, + tf.constant(0.0, tf.float32, var.get_shape()), "direction", "direction") + for var in var_list] + # current learning_rate alpha + if self._type == "global": + alpha = self._get_or_make_slot(alpha_0, alpha_0, "learning_rate", "learning_rate") + else: + alphas = [self._get_or_make_slot(var, + tf.constant(self._alpha_0, tf.float32, var.get_shape()), "learning_rates", "learning_rates") + for var in var_list] + # moving average estimation + ms = [self._get_or_make_slot(var, + tf.constant(0.0, tf.float32, var.get_shape()), "m", "m") + for var in var_list] + + # update moving averages of the stochastic gradient: + grads = tf.gradients(loss, var_list) + m_updates = [m.assign(mu*m + (1.0-mu)*g) for m, g in zip(ms, grads)] + + #update of learning rate alpha, it is the main difference between SGD with Nesterov momentum + #and its hypergradient version + if self._type == "global": + hypergrad = sum([tf.reduce_sum(tf.multiply(d,g)) for d,g in zip(ds, grads)]) + alphas_update = [alpha.assign(alpha-beta*hypergrad)] + else: + hypergrads = [tf.multiply(d,g) for d,g in zip(ds, grads)] + alphas_update = [alpha.assign(alpha-beta*hypergrad) for alpha,hypergrad in zip(alphas,hypergrads)] + + # update step directions + with tf.control_dependencies(m_updates+alphas_update): #we want to be sure that alphas calculated using previous step directions + ds_updates=[d.assign(-(mu*m + (1.0-mu)*g)) for (m,d,g) in zip(ms,ds,grads)] + + # update parameters of the model + with tf.control_dependencies(ds_updates): + if self._type == "global": + alpha_norm = alpha + variable_updates = [v.assign_add(alpha*d) for v, d in zip(var_list, ds)] + else: + alpha_norm = sum([tf.reduce_mean(alpha**2) for alpha in alphas]) + variable_updates = [v.assign_add(alpha*d) for v,d,alpha in zip(var_list, ds,alphas)] + global_step.assign_add(1) + + #add summuries (track alphas changes) + with tf.name_scope("summaries"): + with tf.name_scope("per_iteration"): + alpha_sum=tf.summary.scalar("alpha", alpha_norm, collections=[tf.GraphKeys.SUMMARIES, "per_iteration"]) + return tf.group(*variable_updates) \ No newline at end of file diff --git a/__init__.py b/__init__.py new file mode 100755 index 0000000..e69de29 diff --git a/classification_models.py b/classification_models.py new file mode 100755 index 0000000..e6b633b --- /dev/null +++ b/classification_models.py @@ -0,0 +1,120 @@ +# -*- coding: utf-8 -*- +""" +Dilated LSTM 
Cells in Tensorflow + +From github user "code-terminator" +https://github.com/code-terminator/DilatedRNN/blob/master/models/drnn.py + +based on the paper +Dilated Recurrent Neural Networks +Nov 2017 +Chang et al. +https://arxiv.org/pdf/1710.02224.pdf +""" + +import tensorflow as tf +from drnn import multi_dRNN_with_dilations + +def _contruct_cells(hidden_structs, cell_type): + """ + This function contructs a list of cells. + """ + # error checking + if cell_type not in ["RNN", "LSTM", "GRU"]: + raise ValueError("The cell type is not currently supported.") + + # define cells + cells = [] + for hidden_dims in hidden_structs: + if cell_type == "RNN": + cell = tf.contrib.rnn.BasicRNNCell(hidden_dims) + elif cell_type == "LSTM": + cell = tf.contrib.rnn.BasicLSTMCell(hidden_dims) + elif cell_type == "GRU": + cell = tf.contrib.rnn.GRUCell(hidden_dims) + cells.append(cell) + + return cells + + +def _rnn_reformat(x, input_dims, n_steps): + """ + This function reformat input to the shape that standard RNN can take. + + Inputs: + x -- a tensor of shape (batch_size, n_steps, input_dims). + Outputs: + x_reformat -- a list of 'n_steps' tenosrs, each has shape (batch_size, input_dims). + """ + # permute batch_size and n_steps + x_ = tf.transpose(x, [1, 0, 2]) + # reshape to (n_steps*batch_size, input_dims) + x_ = tf.reshape(x_, [-1, input_dims]) + # split to get a list of 'n_steps' tensors of shape (batch_size, input_dims) + x_reformat = tf.split(x_, n_steps, 0) + + return x_reformat + + +def drnn_classification(x, + hidden_structs, + dilations, + n_steps, + n_classes, + input_dims=1, + cell_type="RNN"): + """ + This function construct a multilayer dilated RNN for classifiction. + Inputs: + x -- a tensor of shape (batch_size, n_steps, input_dims). + hidden_structs -- a list, each element indicates the hidden node dimension of each layer. + dilations -- a list, each element indicates the dilation of each layer. + n_steps -- the length of the sequence. + n_classes -- the number of classes for the classification. + input_dims -- the input dimension. + cell_type -- the type of the RNN cell, should be in ["RNN", "LSTM", "GRU"]. + + Outputs: + pred -- the prediction logits at the last timestamp and the last layer of the RNN. + 'pred' does not pass any output activation functions. 
+ """ + # error checking + assert (len(hidden_structs) == len(dilations)) + + # reshape inputs + x_reformat = _rnn_reformat(x, input_dims, n_steps) + + # construct a list of cells + cells = _contruct_cells(hidden_structs, cell_type) + + # define dRNN structures + layer_outputs = multi_dRNN_with_dilations(cells, x_reformat, dilations) + + if dilations[0] == 1: + # dilation starts at 1, no data dependency lost + # define the output layer + weights = tf.Variable(tf.random_normal(shape=[hidden_structs[-1], + n_classes])) + bias = tf.Variable(tf.random_normal(shape=[n_classes])) + # define prediction + pred = tf.add(tf.matmul(layer_outputs[-1], weights), bias) + else: + # dilation starts not at 1, needs to fuse the output + + # define output layer + weights = tf.Variable(tf.random_normal(shape=[hidden_structs[ + -1] * dilations[0], n_classes])) + bias = tf.Variable(tf.random_normal(shape=[n_classes])) + + # concat hidden_outputs + for idx, i in enumerate(range(-dilations[0], 0, 1)): + if idx == 0: + hidden_outputs_ = layer_outputs[i] + else: + hidden_outputs_ = tf.concat( + [hidden_outputs_, layer_outputs[i]], + axis=1) + + pred = tf.add(tf.matmul(hidden_outputs_, weights), bias) + + return pred \ No newline at end of file diff --git a/cocob.py b/cocob.py old mode 100644 new mode 100755 diff --git a/directdecoder_Run_All.py b/directdecoder_Run_All.py new file mode 100644 index 0000000..e94420c --- /dev/null +++ b/directdecoder_Run_All.py @@ -0,0 +1,470 @@ +import tensorflow as tf + +#import matplotlib +#matplotlib.use('Agg') +#import matplotlib.pyplot as plt + +import os +import pandas as pd +import numpy as np +from trainer import predict +from hparams import build_hparams +import hparams + +from make_features import read_all + +import pickle +import time +from pandas import ExcelWriter + +from collections import defaultdict + + +# ============================================================================= +# PARAMETRS +# ============================================================================= +#For histories, we care most about shorter series, so sample lower numbers more densely +HISTORY_SIZES=[100]#[7,8,10,12,15,20,30,50,100]#,200,360] +HORIZON_SIZES=[60]#[7,8,10,12,15,20,30,60] #If doing the Amazon decoder, only do the single fixed length that was used for K [eg. 60 days] +EVAL_STEP_SIZE=10#step size for evaluation. 1 means use every single day as a FCT to evaluate on. E.g. 3 means step forward 3 timesteps between each FCT to evaluate on. +PREDICT_MODE = 'backtest'#'disjoint' +NAMES = ['TESTset1', 'TESTset2', 'TESTset3', 'TESTset4'] + +FEATURES_SET = 'full'# 'arturius' 'simple' 'full' +SAMPLING_PERIOD = 'daily' +DATA_TYPE = 'ours' #'kaggle' #'ours' +Nmodels = 3 +PARAM_SETTING = 'encdec' #Which of the parameter settings to use [s32 is the default Kaggle one, with a few thigns modified as I want] +PARAM_SETTING_FULL_NAME = hparams.params_encdec #Which of the parameter settings to use corresponding to the PARAM_SETTING. The mapping is defined in hparams.py at the end in "sets = {'s32':params_s32,..." +OUTPUT_DIR = 'output' + +#If doing quantiels (and no SMAPE optimized point estimate), then choose one of the quantiles to use as the point estimate +#E.g. 40 45 or 47 since positively biased [otherwise use 50] +Q_pnt_est = 0.45 + +SAVE_PLOTS = False + + + + +NAMES = ['TESTset3'] #!!!!!! 
+ + + + + + + +# ============================================================================= +# MAIN +# ============================================================================= + + +# ============================================================================= +# Performance Metrics +# ============================================================================= +def smape(true, pred): + summ = np.abs(true) + np.abs(pred) + smape = np.where(summ == 0, 0, np.abs(true - pred) / summ) + #return np.mean(kaggle_smape) * 200 + return smape * 200 + +def mean_smape(true, pred): + raw_smape = smape(true, pred) + masked_smape = np.ma.array(raw_smape, mask=np.isnan(raw_smape)) + return masked_smape.mean() + +def bias(true, pred): + """ + Check if the forecasts are biased up or down + + All of the predictions have already been clipped to 0 min. + Actual is always nonnegative (and 0 means missing so can mask) + So if pred+true is 0, means missing, can ignore those + """ + summ = pred + true + bias = np.where(summ == 0, 0, (pred - true) / summ) + return 100. * bias + +def mean_bias(true, pred): + raw_bias = bias(true, pred) + masked_bias = np.ma.array(raw_bias, mask=np.isnan(raw_bias)) + return raw_bias.mean() + + + +def do_predictions_one_setting(history,horizon,backoffset,TEST_dir,save_plots,n_series,chunk): + + # ============================================================================= + # + # ============================================================================= + #read_all funcion loads the (hardcoded) file "data/all.pkl", or otherwise train2.csv + print('loading data...') + + df_all = read_all(DATA_TYPE,SAMPLING_PERIOD,f'TEST{chunk}') + print('df_all.columns') + print(df_all.columns) +# filename = f'train_2_{data_type}_{sampling_period}' +# df = read_file(filename) + + + batchsize = n_series #For simplicity, just do all series at once if not too many for memory + print('batchsize',batchsize) + # ============================================================================= + # + # ============================================================================= + prev = df_all#.loc[:,:'2017-07-08'] + paths = [p for p in tf.train.get_checkpoint_state(f'data/cpt/TRAIN{chunk}').all_model_checkpoint_paths] + #tf.reset_default_graph() + #preds = predict(paths, default_hparams(), back_offset=0, + # n_models=3, target_model=0, seed=2, batch_size=2048, asgd=True) + t_preds = defaultdict(lambda:[]) if hparams.DO_QUANTILES else [] + for tm in range(Nmodels): + tf.reset_default_graph() + _ = predict(FEATURES_SET, SAMPLING_PERIOD, paths, TEST_dir, build_hparams(PARAM_SETTING_FULL_NAME), history, horizon, back_offset=backoffset, return_x=False, + n_models=Nmodels, target_model=tm, seed=2, batch_size=batchsize, asgd=True) + if hparams.DO_QUANTILES: + for kk, vv in _.items(): + t_preds[kk].append(vv) + else: + t_preds.append(_) + + + # ============================================================================= + # average the N models predictions + # ============================================================================= + if hparams.DO_QUANTILES: + preds = {} + for kk, vv in t_preds.items(): + preds[kk] = sum(vv)/float(Nmodels) + else: + preds = sum(t_preds)/float(Nmodels) + preds = {'point_est':preds} + + + + #Do for every quantile + final_preds = {} + for kk, vv in preds.items(): + + # ============================================================================= + # look at missing + # ============================================================================= + if DATA_TYPE=='kaggle': 
+ missing_pages = prev.index.difference(preds.index) + print('missing_pages',missing_pages) + # Use zeros for missing pages + rmdf = pd.DataFrame(index=missing_pages, + data=np.tile(0, (len(preds.columns),len(missing_pages))).T, columns=preds.columns) + f_preds = preds.append(rmdf).sort_index() + elif DATA_TYPE=='ours': +# f_preds = preds + f_preds = vv + # Use zero for negative predictions + f_preds[f_preds < 0.5] = 0 + # Rouns predictions to nearest int + f_preds = np.round(f_preds).astype(np.int64) + + final_preds[kk] = f_preds + +# print('final_preds',final_preds) + + + + + # ============================================================================= + # save out all predictions all days (for our stuff will be relevant, for his Kaggle maybe just needed one day) + # ============================================================================= + #firstK = 1000 #for size issues, for now while dev, just a few to look at + #ggg = f_preds.iloc[:firstK] + #ggg.to_csv('data/all_days_submission.csv.gz', compression='gzip', index=False, header=True) + #Instead of saving indivual, just wait and append and look at finals. +# f_preds.to_csv(f'{OUTPUT_DIR}/all_predictions_ours.csv.gz', compression='gzip', index=False, header=True) + + + + + # ============================================================================= + # visualize to do wuick check + # ============================================================================= + if save_plots: + randomK = 1000 + print('Saving figs of {} time series as checks'.format(randomK)) + pagenames = list(f_preds.index) + pages = np.random.choice(pagenames, size=min(randomK,len(pagenames)), replace=False) + N = pages.size + for jj, page in enumerate(pages): + print(f"{jj} of {N}") + plt.figure() + if DATA_TYPE=='kaggle': + prev.loc[page].fillna(0).plot()#logy=True) + f_preds.loc[page].fillna(0).plot(logy=True) + elif DATA_TYPE=='ours': + prev.loc[int(page)].plot() + f_preds.loc[page].plot() + plt.title(page) + if not os.path.exists(OUTPUT_DIR): + os.mkdir(OUTPUT_DIR) + pathname = 'ddddddddddd'#os.path.join(OUTPUT_DIR, 'fig_{}.png'.format(jj)) + plt.savefig(pathname) + plt.close() + + + + #Cannot view on the AWS so move to local: + #zip -r output.zip output + #cp output.zip /home/...../sync + return final_preds + + + +def get_data_timesteps_Nseries(df_path): + """ + Get the data_timesteps value from the TEST set data. + Because every day will be used, it is just the number of days in the df. + + And get number of time series to prdict on [number of rows], to use as batchsize + """ + df = pd.read_csv(df_path) + columns = list(df.columns) + columns.remove('Page') + return len(columns), len(df) + + + +def get_data_timesteps_Nseries__backtest(test_path,train_path): + """ + For backtest chunk mode only. + Get the number of data timesteps (which potentially varies per testset1,2,3,4), + in order to determine backoffset range. + + Because in this mode the TEST set also includes the TRAIN ste in it, cannot + just use length of TEST set alone to get datatimesteps. + """ + test = pd.read_csv(test_path) +# test_day = test.columns[-1] + train = pd.read_csv(train_path) +# train_day = train.columns[-1] + + #Assuming consecutive days, just get diff of number columns: + data_timesteps = len(test.columns) - len(train.columns) + + #Depending if did holdout id's, then TEST would have extra id's not in TRAIN + #For batchsize, using N_series as number of rows of TEST set. 
+ #Since metrics are later made from dicts, if an ID is predicted on more than once, + #is ok, since would have same key in dict and only be there once anyway. + N_series = len(test) + + return data_timesteps, N_series + + + + + + +def SaveMultisheetXLS(list_dfs, list_sheetnames, xls_path): + """ + xls_path - must be .xls or else will not save sheets properly + """ + writer = ExcelWriter(xls_path) + for s in range(len(list_dfs)): + list_dfs[s].to_excel(writer,list_sheetnames[s],index=False) + writer.save() + + + + + +if __name__ == '__main__': + + hparams = build_hparams(PARAM_SETTING_FULL_NAME) + + + #For the 4 chunk backtesting performance assessment + for name in NAMES: + + print('name: ',name) + chunk = name.replace('TEST','') + TEST_DF_PATH = f"data/ours_daily_{name}.csv" + TEST_dir = f"data/{name}" + TRAIN_DF_PATH = TEST_DF_PATH.replace('TEST','TRAIN') + print('TEST_DF_PATH',TEST_DF_PATH) + print('TEST_dir',TEST_dir) + print('TRAIN_DF_PATH',TRAIN_DF_PATH) + + groundtruth = pd.read_csv(TEST_DF_PATH) + groundtruth.sort_values(['Page']) + print('groundtruth',groundtruth) + + if not os.path.exists(OUTPUT_DIR): + os.makedirs(OUTPUT_DIR) + + if PREDICT_MODE=='disjoint': + data_timesteps, N_series = get_data_timesteps_Nseries(TEST_DF_PATH) + elif PREDICT_MODE=='backtest': + data_timesteps, N_series = get_data_timesteps_Nseries__backtest(TEST_DF_PATH,TRAIN_DF_PATH) + + + + + hist_horiz__all = {} + t0 = time.clock() + for history in HISTORY_SIZES: +# for horizon in HORIZON_SIZES: + #For the direct decoder, only need to do single prediction (all smaller horizons contained in the max horizon) + if hparams.MLP_DIRECT_DECODER: + #HORIZON_SIZES = max(HORIZON_SIZES) + horizon = hparams.horizon_window_size_minmax[1] + + print('HISTORY ',history, 'of ', HISTORY_SIZES) + print('HORIZON ',horizon, 'of ', HORIZON_SIZES) + + #For the disjoint mode, the test set does not overlap art all w the train set. + #The history + horizon window must completely fit in the test set alone. + #vs. + #in backtest chunk mode, test set include full train set, but + #horizon window always starts after the train set (so horizon + #is fully inside test set). SO for backtest chunk mode, irrelevant + #what history + horizon is, only matters that the horizon is fully inside TEST set. + if (PREDICT_MODE=='disjoint') and (history+horizon >= data_timesteps): + print(f'history+horizon ({history+horizon}) >= data set size ({data_timesteps})') + continue + if (PREDICT_MODE=='backtest') and (horizon > data_timesteps): + print(f'horizon ({horizon}) > test region size ({data_timesteps})') + continue + + #Get the range of values that will step through for + if (PREDICT_MODE=='disjoint'): + offs = [i for i in range(horizon, data_timesteps - history +1, EVAL_STEP_SIZE)] + if (PREDICT_MODE=='backtest'): + offs = [i for i in range(horizon, data_timesteps+1, EVAL_STEP_SIZE)] + + + + + #!!!!!!! 
dflist as dict per quantile + #dflists = defaultdict(lambda:[])# if hparams.DO_QUANTILES else [] + dflists = [] + for gg, backoffset in enumerate(offs): + print('backoffset ',backoffset, 'of ', offs) + f_preds = do_predictions_one_setting(history,horizon,backoffset,TEST_dir,SAVE_PLOTS,N_series,chunk) +# print('f_preds',f_preds) + + + #Save out some example quantile predictions to make sure they look ok + if gg==0: + for kk,vv in f_preds.items(): + vv.to_csv(f"{kk}__{history}_{horizon}_{chunk}.csv") + + #Just focus on point estimates for now: using q45 + if hparams.DO_QUANTILES: + f_preds = f_preds[Q_pnt_est] + else: + f_preds = f_preds['point_est'] + + + + +# for horizon in HORIZON_SIZES: + + + #COlumns are same for all quantiles/point estimates, so just use point_est to get dates: + #cols = f_preds[Q_pnt_est].columns if hparams.DO_QUANTILES else f_preds['point_est'].columns + cols = f_preds.columns + dates = [i.strftime('%Y-%m-%d') for i in cols] +# print(dates) + + + + + #For each series + for jj in range(len(f_preds)): + series = f_preds.iloc[jj] + _id = series.name + true = groundtruth[groundtruth['Page'].astype(str) ==_id] + + + first_pred_day = dates[0] + d1 = pd.date_range(first_pred_day,first_pred_day)[0] - pd.Timedelta(history,unit='D') + history_dates = pd.date_range(start=d1, end=first_pred_day, freq='D')[:-1] #!!!!!! asuming daily sampling... + history_dates = [i.strftime('%Y-%m-%d') for i in history_dates] + history_missing_count = np.isnan(true[history_dates].values[0]).sum() +# print('history_missing_count',history_missing_count) +# print('true',true) + true = true[dates].values[0] + horizon_missing_count = np.isnan(true).sum() +# print('horizon_missing_count',horizon_missing_count) + + #Get smape, mae, bias over this prediction + smp = mean_smape(true, series.values) +# mae = asdasdasd + bi = mean_bias(true, series.values) +# print(smape,bias) + metr_dct = {'SMAPE':smp, + 'bias':bi, + #'MAE':mae, + 'predict_start_date':dates[0], + 'predict_end_date':dates[-1], + 'history_missing_count':history_missing_count, + 'horizon_missing_count':horizon_missing_count + } + #When doing quantiles, also see what fraction is in that quantile +# if hparams.DO_QUANTILES: +# metr_dct[asd] = asdasd +# metr_dct['sharpness'] = 0. +# #MSIS ... + + hist_horiz__all[(history,horizon,backoffset,_id)] = metr_dct +# print(hist_horiz__all) + + + + + #For saving out predictions: + dates = [i.strftime('%m/%d/%Y') for i in cols] + d = {cols[i]:dates[i] for i in range(len(cols))} + f_preds.rename(columns=d,inplace=True) + f_preds['Page'] = f_preds.index.values + #Depending on missing data in the test set in the history window for this backoffset, + #it oculd be that that particular id did not pass the train completeness threshold. + #Then it will not be included, but the batchsize will still be len(df), so to fill that missing + #id, it will repeat id's that already had predictions. THey will be identical, + #so just take the 1st occurrence for those repeated id's: + df = [] + u_ids = np.unique(f_preds['Page'].values) + for u in u_ids: + s = f_preds[f_preds['Page']==u] + if len(s)>1: + s = s.head(1) + df.append(s) + f_preds = pd.concat(df,axis=0) + cols = list(f_preds.columns) + cols.remove('Page') + cols = ['Page'] + cols + f_preds = f_preds[cols] +# print(f_preds) + + dflists.append(f_preds) + #Care about the metrics within different partitions: + #Beside just history and horizon size, also consider: + #real vs. synthetic augmented series + #training ID vs. 
new ID only in TEST set + #series contains holiday vs. only non-holidays + #day of week + + savename = f"{str(history)}_{str(horizon)}_{name}.xls" + savename = os.path.join(OUTPUT_DIR,savename) + sheetnames = [str(i) for i in offs] + SaveMultisheetXLS(dflists, sheetnames, savename) + #each sheet is for a single backoffset, so each sheet contains all ~1800 id's + + +# print(hist_horiz__all) + t1 = time.clock() + print('elapsed time: ',t1-t0) + #Now that all metrics stored in dict, save dict, and analyze further + #pickle ... hist_horiz__all + # print(hist_horiz__all) + dict_savename = os.path.join(OUTPUT_DIR,f"hist_horiz__{name}.pickle") + with open(dict_savename, "wb") as outp: + pickle.dump(hist_horiz__all, outp)#, protocol=2) \ No newline at end of file diff --git a/drnn.py b/drnn.py new file mode 100755 index 0000000..3fb5799 --- /dev/null +++ b/drnn.py @@ -0,0 +1,101 @@ +""" +Dilated LSTM Cells in Tensorflow + +From github user "code-terminator" +https://github.com/code-terminator/DilatedRNN/blob/master/models/drnn.py + +based on the paper +Dilated Recurrent Neural Networks +Nov 2017 +Chang et al. +https://arxiv.org/pdf/1710.02224.pdf +""" + +import copy +import itertools +import numpy as np +import tensorflow as tf + +def dRNN(cell, inputs, rate, scope='default'): + """ + This function constructs a layer of dilated RNN. + Inputs: + cell -- the dilation operations is implemented independent of the RNN cell. + In theory, any valid tensorflow rnn cell should work. + inputs -- the input for the RNN. inputs should be in the form of + a list of 'n_steps' tenosrs. Each has shape (batch_size, input_dims) + rate -- the rate here refers to the 'dilations' in the orginal WaveNet paper. + scope -- variable scope. + Outputs: + outputs -- the outputs from the RNN. + """ + n_steps = len(inputs) + if rate < 0 or rate >= n_steps: + raise ValueError('The \'rate\' variable needs to be adjusted.') + print "Building layer: %s, input length: %d, dilation rate: %d, input dim: %d." % ( + scope, n_steps, rate, inputs[0].get_shape()[1]) + + # make the length of inputs divide 'rate', by using zero-padding + EVEN = (n_steps % rate) == 0 + if not EVEN: + # Create a tensor in shape (batch_size, input_dims), which all elements are zero. + # This is used for zero padding + zero_tensor = tf.zeros_like(inputs[0]) + dialated_n_steps = n_steps // rate + 1 + print "=====> %d time points need to be padded. 
" % ( + dialated_n_steps * rate - n_steps) + print "=====> Input length for sub-RNN: %d" % (dialated_n_steps) + for i_pad in xrange(dialated_n_steps * rate - n_steps): + inputs.append(zero_tensor) + else: + dialated_n_steps = n_steps // rate + print "=====> Input length for sub-RNN: %d" % (dialated_n_steps) + + # now the length of 'inputs' divide rate + # reshape it in the format of a list of tensors + # the length of the list is 'dialated_n_steps' + # the shape of each tensor is [batch_size * rate, input_dims] + # by stacking tensors that "colored" the same + + # Example: + # n_steps is 5, rate is 2, inputs = [x1, x2, x3, x4, x5] + # zero-padding --> [x1, x2, x3, x4, x5, 0] + # we want to have --> [[x1; x2], [x3; x4], [x_5; 0]] + # which the length is the ceiling of n_steps/rate + dilated_inputs = [tf.concat(inputs[i * rate:(i + 1) * rate], + axis=0) for i in range(dialated_n_steps)] + + # building a dialated RNN with reformated (dilated) inputs + dilated_outputs, _ = tf.contrib.rnn.static_rnn( + cell, dilated_inputs, + dtype=tf.float32, scope=scope) + + # reshape output back to the input format as a list of tensors with shape [batch_size, input_dims] + # split each element of the outputs from size [batch_size*rate, input_dims] to + # [[batch_size, input_dims], [batch_size, input_dims], ...] with length = rate + splitted_outputs = [tf.split(output, rate, axis=0) + for output in dilated_outputs] + unrolled_outputs = [output + for sublist in splitted_outputs for output in sublist] + # remove padded zeros + outputs = unrolled_outputs[:n_steps] + + return outputs + + +def multi_dRNN_with_dilations(cells, inputs, dilations): + """ + This function constucts a multi-layer dilated RNN. + Inputs: + cells -- A list of RNN cells. + inputs -- A list of 'n_steps' tensors, each has shape (batch_size, input_dims). + dilations -- A list of integers with the same length of 'cells' indicates the dilations for each layer. + Outputs: + x -- A list of 'n_steps' tensors, as the outputs for the top layer of the multi-dRNN. 
+ """ + assert (len(cells) == len(dilations)) + x = copy.copy(inputs) + for cell, dilation in zip(cells, dilations): + scope_name = "multi_dRNN_dilation_%d" % dilation + x = dRNN(cell, x, dilation, scope=scope_name) + return x \ No newline at end of file diff --git a/ex_figs/quickcheck_0.png b/ex_figs/quickcheck_0.png new file mode 100644 index 0000000..794e34b Binary files /dev/null and b/ex_figs/quickcheck_0.png differ diff --git a/ex_figs/quickcheck_1.png b/ex_figs/quickcheck_1.png new file mode 100644 index 0000000..7e089e8 Binary files /dev/null and b/ex_figs/quickcheck_1.png differ diff --git a/ex_figs/quickcheck_2.png b/ex_figs/quickcheck_2.png new file mode 100644 index 0000000..057b0f1 Binary files /dev/null and b/ex_figs/quickcheck_2.png differ diff --git a/ex_figs/quickcheck_3.png b/ex_figs/quickcheck_3.png new file mode 100644 index 0000000..fcb05d3 Binary files /dev/null and b/ex_figs/quickcheck_3.png differ diff --git a/ex_figs/quickcheck_4.png b/ex_figs/quickcheck_4.png new file mode 100644 index 0000000..9dc4c64 Binary files /dev/null and b/ex_figs/quickcheck_4.png differ diff --git a/extractor.py b/extractor.py old mode 100644 new mode 100755 diff --git a/feeder.py b/feeder.py old mode 100644 new mode 100755 diff --git a/holiday_features.py b/holiday_features.py new file mode 100755 index 0000000..8c245ff --- /dev/null +++ b/holiday_features.py @@ -0,0 +1,170 @@ +#Define few functions to create holiday features from the time series +#For now, these are only intended to ork with DAILY sampled data + +import pandas as pd +import numpy as np + + + + +def encode_all_holidays__daily(dates_range): + """ + Encode all fixed and moving holidays, and corresponding holiday shoulders. + Intended for daily sampled data only. + """ + + + def get_fixed_date_holidays__daily(dates_range, month_day): + """ + Get YYYY-mm-DD holidays, + for holidays that occur yearly on fixed dates. + + For daily sampled data only. + + In USA: + Christmas, New Year, 4th of July, Halloween, Cinco de Mayo + Valentine's Day, Veteran's Day + + other international: + ... + """ +# return ['{}-{:02d}-{:02d}'.format(i.year,i.month,i.day) for i in dates_range if ((i.month==int(month_day[:2])) and (i.day==int(month_day[4:])))] +# print([(i.month, i.day) for i in dates_range]) +# print([i for i in dates_range if ((i.month==int(month_day[:2])) and (i.day==int(month_day[4:])))]) + return [i.strftime('%Y-%m-%d') for i in dates_range if ((i.month==int(month_day[:2])) and (i.day==int(month_day[3:])))] + + # ============================================================================= + # MOVING holidays [variable date] + # ============================================================================= +# def get_thanksgivings__daily(dates_range): +# """ +# Get Thanksgiving holiday dates within the few years time range +# """ +# # 4th Thurs of Novmber... +# # if (month==11) and (dayofweek=='Thurs') and (22<=dayofmonth<=28) +## thanksgiving_dates = [i.strftime('%Y-%m-%d') for i in dates_range if ((i.month==11) and (i.dayofweek==3) and (i.dayofmonth...))] +# #... +# return thanksgiving_dates + +# def get_Easters__daily(dates_range): +# """ +# Get Easter holiday dates within the few years time range +# """ +# easter_dates = [] +# #... +# return easter_dates + +# def encode_custom_dates__daily(dates_range,dates_list): +# """ +# Encode custom days and optionally shoulder days. +# For daily sampled data only. +# +# E.g. 
Superbowl Sunday +# suberbowl_dates = ['2014-02-02','2015-02-01','2016-02-07','2017-02-05','2018-02-04','2019-02-03'] +# shoulders = [...] +# """ +# return dates_range + + def spiral_encoding(dates_range, holiday_date, shoulder): + """ + Encode holiday and shoulders as a spiral: + Rotation over 2pi, with radius goes from 0 to 1 [on holiday] back to 0 + + Right now assuming symmetric beofre / after shoulders. + """ + N_real_days = len(dates_range) + real_min = min(dates_range) + real_max = max(dates_range) + dates_range_padded = pd.date_range(real_min-shoulder-2, real_max+shoulder+2, freq='D') +# print(dates_range) +# print(dates_range_padded) + + df = pd.DataFrame() + df['date'] = dates_range_padded.values + Ndays = len(df) + +# print(holiday_date) + _ = df.loc[df['date']==holiday_date] + if len(_)>0: + ind = _.index.values[0] + #If this holiday is completely out of bounds of the time series input, + #ignore it [assumed additive holiday effects, so just add 0's] + else: + return np.zeros((N_real_days,2)) + + #For radius: triangle kernel centered on holiday + r = np.zeros(Ndays) + r[ind-shoulder-1:ind+1] = np.linspace(0.,1.,shoulder+2) + r[ind:ind+shoulder+2] = np.linspace(1.,0.,shoulder+2) + + #For anlge: go from phase [0,pi], with holiday at pi/2 + theta = np.zeros(Ndays) + theta[ind-shoulder-1:ind+shoulder+2] = np.linspace(0., np.pi, 2*shoulder+3) + #Convert to Cartesian: + df['r'] = r + df['theta'] = theta + df['x'] = df['r']*np.cos(df['theta']) + df['y'] = df['r']*np.sin(df['theta']) + v = df[((df['date']>=real_min) & (df['date']<=real_max))] + v = v[['x','y']].values +# print(v, v.sum(axis=0), v.sum(axis=1)) + return v + + + + Ndays = len(dates_range) + + #Fixed Holidays [add other international ones as needed]: + xmas_dates = get_fixed_date_holidays__daily(dates_range, '12-25') + new_years_dates = get_fixed_date_holidays__daily(dates_range, '01-01') + st_patricks_dates = get_fixed_date_holidays__daily(dates_range, '03-17') + july4_dates = get_fixed_date_holidays__daily(dates_range, '07-04') + halloween_dates = get_fixed_date_holidays__daily(dates_range, '10-31') + cincodemayo_dates = get_fixed_date_holidays__daily(dates_range, '05-05') + valentines_dates = get_fixed_date_holidays__daily(dates_range, '02-14') + veterans_dates = get_fixed_date_holidays__daily(dates_range, '11-11') + #taxday_dates = get_fixed_date_holidays__daily(dates_range, '04-15') + + + #Rule Based Moving Holidays +# thanksgiving_dates = get_thanksgivings__daily(dates_range) + thanksgiving_dates = ['2014-11-27','2015-11-26','2016-11-24','2017-11-23','2018-11-22','2019-11-28','2020-11-26'] #just m,anually define for now +# easter_dates = get_Easters__daily(dates_range) #too complicated : moon stuff, just set custom dates + easter_dates = ['2014-04-20','2015-04-05','2016-03-27','2017-04-16','2018-04-01','2019-04-21','2020-04-20'] + #... 
Labor Day, Memorial Day, President's Day, MLK Day, Columbus Day, Tax Day + #Custom / Single Event moving Holidays + suberbowl_dates = ['2014-02-02','2015-02-01','2016-02-07','2017-02-05','2018-02-04','2019-02-03'] + + + #Dict of holiday dates: shoulder halfwidth [-S, -S+1, ..., holiday, holiday+1, ..., holiday+S] + #for now just use 3 as the shoulder width for all "major" holidays, 0 or 1 for "minor" holidays + #Use ODD numbers for shoulder sizes + holidays = {'xmas_dates':(xmas_dates,3), + 'new_years_dates':(new_years_dates,3), + 'st_patricks_dates':(st_patricks_dates,1), + 'july4_dates':(july4_dates,1), + 'halloween_dates':(halloween_dates,1), + 'cincodemayo_dates':(cincodemayo_dates,1), + 'valentines_dates':(valentines_dates,1), + 'veterans_dates':(veterans_dates,1), + 'thanksgiving_dates':(thanksgiving_dates,3), + 'easter_dates':(easter_dates,1), + 'suberbowl_dates':(suberbowl_dates,1), + } +# print(holidays) + + + #Assume additive holiday effects: (which should almost never matter anyway + #for small shoulders unless there is overlap beteen some holidays. E.g. with shoulder=3, + #Christmas and New Year's do NOT overlap.) +# encoded_holidays = pd.DataFrame() +# encoded_holidays['date'] = dates_range.values + _ = np.zeros((Ndays,2)) + #Iterate through each holiday, accumulating the effect: + for mmm in holidays.values(): + shoulder = mmm[1] + #Since date series is potentially over few years, could have e.g. several Christmas furing that time range + for hd in mmm[0]: + _ += spiral_encoding(dates_range, hd, shoulder) +# print(_) + return _ \ No newline at end of file diff --git a/how_it_works.md b/how_it_works.md old mode 100644 new mode 100755 diff --git a/hparams.py b/hparams.py old mode 100644 new mode 100755 index ca39cbf..164e8d6 --- a/hparams.py +++ b/hparams.py @@ -1,11 +1,12 @@ import tensorflow.contrib.training as training -import re +#import re # Manually selected params -params_s32 = dict( - batch_size=256, +params_encdec = dict( + batch_size=128,#256, #train_window=380, - train_window=283, +# train_window=283,#now make this a bash input to do train-validation window size performance heatmaps + #train_window=30,#try 65 w our data to see if allows more samples through filter train_skip_first=0, rnn_depth=267, use_attn=False, @@ -13,7 +14,7 @@ attention_heads=1, encoder_readout_dropout=0.4768781146510798, - encoder_rnn_layers=1, + encoder_rnn_layers=3, decoder_rnn_layers=1, # decoder_state_dropout_type=['outside','outside'], @@ -39,152 +40,87 @@ encoder_activation_loss=1e-06, # max 0.001 decoder_stability_loss=0.0, # max 100 decoder_activation_loss=5e-06, # max 0.001 + + + + # ============================================================================= + # RANDOMIZING OVER WINDOW SIZES (in training only) + # ============================================================================= + #Instead of fixed size windows, do training phase over range of window sizes + #drawn uniformly from [a,b]. 
Another form of randomization/regularization, + #but more importantly this way model can generalize to different lengths so + #we can more fairly assess performance over range of history/horizon windows: + history_window_size_minmax=[7,365], + horizon_window_size_minmax=[7,60], + + + + # ============================================================================= + # DECODER OPTIONS + # ============================================================================= + + # CONTEXT + #Kaggle model architecture is more like a basic many-to-many RNN, not really a + #usual encoder-decoder architecture since computational graph does not have + #connections from encoded representation to each decoder time step (only to 1st + #decoder timestep). Set below to True to use encoder-decoder; set False to use + #Kaggle architecture not really true encoder-decoder + RECURSIVE_W_ENCODER_CONTEXT=True, + + # LAGGED FEATURES / LOOKBACK + #Lookback K steps: [without specifying, default previous Kaggle setting is K=1]: + #for predicting y_i, insteda of just feeding in previous K=1 prediction (y_i-1), + #feed in all previous K predictions: y_ + LOOKBACK_K = 3, #!!!!Can NOT set this to be bigger than min history size (history_window_size_minmax[0]) + #since then depending on random draw would possibly need to look back further than history size. + + + + # ============================================================================= + # COMPLETELY DIFFERENT DECODERS + # ============================================================================= + # Alternative decoders. Can only do one of these (cannot have both True) + + # MLP POSTPROCESSOR (ADJUST PREDICTIONS IN LOCAL WINDOWS, AND CAN DO QUANTILES) + #True or False to use MLP module postprocessor to locally adjust estimates + DO_MLP_POSTPROCESS=False,#True,#False + MLP_POSTPROCESS__KERNEL_SIZE=15, + MLP_POSTPROCESS__KERNEL_OFFSET=7, + + + # DIRECT MLP DECODER (REPLACE RNN CELLS IN DECODER WITH MLP MODULES, AND DO QUANTILES) + #Do a direct, quantile forecast by using an MLP as decoder module instead of RNN/LSTM/GRU cells: + MLP_DIRECT_DECODER=True, + LOCAL_CONTEXT_SIZE=8, + GLOBAL_CONTEXT_SIZE=64, + + + + # QUANTILE REGRESSION + # For whatever kind of decoder, whether or not to use quantiles + DO_QUANTILES=True, + #If doing quantile regression in addition to point estimates trained to minimize SMAPE. + #Also, since SMAPE point estimates are biased positive, can use alternative + #point estimator trainde by pinball loss on quantiles < 50 [e.g. 45,38, etc., see what has bias ~0]. + #So if doing quantiles, no longer optimizing SMAPE, but report it anyway to see. So, use the 0th element of QUANTILES list is used as the point estimate for SMAPE + #(but SMAPE will not be used in loss function: instead will use the average quantile loss (ave over all quantiles)) + #If not using quantile regression, list is ignored + QUANTILES = [.45, .05, .25, .40, .50, .75, .95] + + + #Losses summed together using lembda weighting. + #Vs. if False, just directly optimize quantile loss and ignore SMAPE [but still look at it for TEST sets] + #LAMBDA=.01 #Scale factor for relative weight of quantile loss for the point estimate SMAPE loss. 
Only relevant if SMAPE_AND_QUANTILE=True ) -# Default incumbent on last smac3 search -params_definc = dict( - batch_size=256, - train_window=100, - train_skip_first=0, - rnn_depth=128, - use_attn=True, - attention_depth=64, - attention_heads=1, - encoder_readout_dropout=0.4768781146510798, - - encoder_rnn_layers=1, - decoder_rnn_layers=1, - decoder_input_dropout=[1.0, 1.0, 1.0], - decoder_output_dropout=[1.0, 1.0, 1.0], - decoder_state_dropout=[0.995, 0.995, 0.995], - decoder_variational_dropout=[False, False, False], - decoder_candidate_l2=0.0, - decoder_gates_l2=0.0, - fingerprint_fc_dropout=0.8232342370695286, - gate_dropout=0.8961710392091516, - gate_activation='none', - encoder_dropout=0.030490422531402273, - encoder_stability_loss=0.0, - encoder_activation_loss=1e-05, - decoder_stability_loss=0.0, - decoder_activation_loss=5e-05, -) -# Found incumbent 0.35503610596060753 -#"decoder_activation_loss='1e-05'", "decoder_output_dropout:0='1.0'", "decoder_rnn_layers='1'", "decoder_state_dropout:0='0.995'", "encoder_activation_loss='1e-05'", "encoder_rnn_layers='1'", "gate_dropout='0.7934826952854418'", "rnn_depth='243'", "train_window='135'", "use_attn='1'", "attention_depth='17'", "attention_heads='2'", "encoder_readout_dropout='0.7711751356092252'", "fingerprint_fc_dropout='0.9693950737901414'" -params_foundinc = dict( - batch_size=256, - train_window=135, - train_skip_first=0, - rnn_depth=243, - use_attn=True, - attention_depth=17, - attention_heads=2, - encoder_readout_dropout=0.7711751356092252, - encoder_rnn_layers=1, - decoder_rnn_layers=1, - decoder_input_dropout=[1.0, 1.0, 1.0], - decoder_output_dropout=[1.0, 1.0, 1.0], - decoder_state_dropout=[0.995, 0.995, 0.995], - decoder_variational_dropout=[False, False, False], - decoder_candidate_l2=0.0, - decoder_gates_l2=0.0, - fingerprint_fc_dropout=0.9693950737901414, - gate_dropout=0.7934826952854418, - gate_activation='none', - encoder_dropout=0.0, - encoder_stability_loss=0.0, - encoder_activation_loss=1e-05, - decoder_stability_loss=0.0, - decoder_activation_loss=1e-05, -) - -# 81 on smac_run0 (0.3552077534247418 x 7) -#{'decoder_activation_loss': 0.0, 'decoder_output_dropout:0': 0.85, 'decoder_rnn_layers': 2, 'decoder_state_dropout:0': 0.995, -# 'encoder_activation_loss': 0.0, 'encoder_rnn_layers': 2, 'gate_dropout': 0.7665920904244501, 'rnn_depth': 201, -# 'train_window': 143, 'use_attn': 1, 'attention_depth': 17, 'attention_heads': 2, 'decoder_output_dropout:1': 0.975, -# 'decoder_state_dropout:1': 0.99, 'encoder_dropout': 0.0304904225, 'encoder_readout_dropout': 0.4444295965935664, 'fingerprint_fc_dropout': 0.26412480387331017} -params_inst81 = dict( - batch_size=256, - train_window=143, - train_skip_first=0, - rnn_depth=201, - use_attn=True, - attention_depth=17, - attention_heads=2, - encoder_readout_dropout=0.4444295965935664, - - encoder_rnn_layers=2, - decoder_rnn_layers=2, - - decoder_input_dropout=[1.0, 1.0, 1.0], - decoder_output_dropout=[0.85, 0.975, 1.0], - decoder_state_dropout=[0.995, 0.99, 0.995], - decoder_variational_dropout=[False, False, False], - decoder_candidate_l2=0.0, - decoder_gates_l2=0.0, - fingerprint_fc_dropout=0.26412480387331017, - gate_dropout=0.7665920904244501, - gate_activation='none', - encoder_dropout=0.0304904225, - encoder_stability_loss=0.0, - encoder_activation_loss=0.0, - decoder_stability_loss=0.0, - decoder_activation_loss=0.0, -) -# 121 on smac_run0 (0.3548671560628074 x 3) -# {'decoder_activation_loss': 1e-05, 'decoder_output_dropout:0': 0.975, 'decoder_rnn_layers': 2, 
'decoder_state_dropout:0': 1.0, -# 'encoder_activation_loss': 1e-05, 'encoder_rnn_layers': 1, 'gate_dropout': 0.8631496699358483, 'rnn_depth': 122, -# 'train_window': 269, 'use_attn': 1, 'attention_depth': 29, 'attention_heads': 4, 'decoder_output_dropout:1': 0.975, -# 'decoder_state_dropout:1': 0.975, 'encoder_readout_dropout': 0.9835390239895767, 'fingerprint_fc_dropout': 0.7452161827064421} - -# 83 on smac_run1 (0.355050330259362 x 7) -# {'decoder_activation_loss': 1e-06, 'decoder_output_dropout:0': 0.925, 'decoder_rnn_layers': 2, 'decoder_state_dropout:0': 0.98, -# 'encoder_activation_loss': 1e-06, 'encoder_rnn_layers': 1, 'gate_dropout': 0.9275441207192259, 'rnn_depth': 138, -# 'train_window': 84, 'use_attn': 1, 'attention_depth': 52, 'attention_heads': 2, 'decoder_output_dropout:1': 0.925, -# 'decoder_state_dropout:1': 0.98, 'encoder_readout_dropout': 0.6415488109353416, 'fingerprint_fc_dropout': 0.2581296623398802} - - -params_inst83 = dict( - batch_size=256, - train_window=84, - train_skip_first=0, - rnn_depth=138, - use_attn=True, - attention_depth=52, - attention_heads=2, - encoder_readout_dropout=0.6415488109353416, - - encoder_rnn_layers=1, - decoder_rnn_layers=2, - - decoder_input_dropout=[1.0, 1.0, 1.0], - decoder_output_dropout=[0.925, 0.925, 1.0], - decoder_state_dropout=[0.98, 0.98, 0.995], - decoder_variational_dropout=[False, False, False], - decoder_candidate_l2=0.0, - decoder_gates_l2=0.0, - fingerprint_fc_dropout=0.2581296623398802, - gate_dropout=0.9275441207192259, - gate_activation='none', - encoder_dropout=0.0, - encoder_stability_loss=0.0, - encoder_activation_loss=1e-06, - decoder_stability_loss=0.0, - decoder_activation_loss=1e-06, -) - -def_params = params_s32 +def_params = params_encdec sets = { - 's32':params_s32, - 'definc':params_definc, - 'foundinc':params_foundinc, - 'inst81':params_inst81, - 'inst83':params_inst83, + 'encdec':params_encdec, } @@ -196,3 +132,4 @@ def build_from_set(set_name): return build_hparams(sets[set_name]) + diff --git a/input_pipe.py b/input_pipe.py old mode 100644 new mode 100755 index 7627344..624c87d --- a/input_pipe.py +++ b/input_pipe.py @@ -22,6 +22,12 @@ def __init__(self, test_set: List[tf.Tensor], train_set: List[tf.Tensor], test_s class Splitter: + """ + This is the splitter used when side_split + (vs. 
FakeSplitter when not side_split [when forward_split]) + + Is typical train-test split + """ def cluster_pages(self, cluster_idx: tf.Tensor): """ Shuffles pages so all user_agents of each unique pages stays together in a shuffled list @@ -42,7 +48,7 @@ def __init__(self, tensors: List[tf.Tensor], cluster_indexes: tf.Tensor, n_split self.seed = seed clustered_index = self.cluster_pages(cluster_indexes) index_len = tf.shape(clustered_index)[0] - assert_op = tf.assert_equal(index_len, size, message='n_pages is not equals to size of clustered index') + assert_op = tf.assert_equal(index_len, size, message='N_time_series is not equals to size of clustered index') with tf.control_dependencies([assert_op]): split_nitems = int(round(size / n_splits)) split_size = [split_nitems] * n_splits @@ -62,179 +68,384 @@ def prepare_split(i): train_sampled_size = int(round(train_size * train_sampling)) test_idx = splits[i][:test_sampled_size] train_idx = complements[i][:train_sampled_size] + +# print(test_size) +# print(train_size) +# print(test_sampled_size) +# print(train_sampled_size) +# print(test_idx) +# print(train_idx) + #When doing --side_split validation option, was getting a type error + #when creating test_set, tran_set list comprehensions: change dtype here for idx + test_idx = tf.cast(test_idx, tf.int32) + train_idx = tf.cast(train_idx, tf.int32) + + test_idx = tf.Print(test_idx, ['test_idx',tf.shape(test_idx),test_idx]) + train_idx = tf.Print(train_idx, ['train_idx',tf.shape(train_idx),train_idx]) + """48354 + 96709 + 48354 + 96709 + Tensor("strided_slice_1:0", shape=(48354,), dtype=float32, device=/device:CPU:0) + Tensor("strided_slice_2:0", shape=(96709,), dtype=float32, device=/device:CPU:0)""" test_set = [tf.gather(tensor, test_idx, name=mk_name('test', tensor)) for tensor in tensors] tran_set = [tf.gather(tensor, train_idx, name=mk_name('train', tensor)) for tensor in tensors] +# print(test_set) +# print(tran_set) return Split(test_set, tran_set, test_sampled_size, train_sampled_size) self.splits = [prepare_split(i) for i in range(n_splits)] class FakeSplitter: + """ + This is the splitter used when forward_split + (vs. 
Splitter when not forward_split [when side_split]) + + Is typical train-test split + """ def __init__(self, tensors: List[tf.Tensor], n_splits, seed, test_sampling=1.0): - total_pages = tensors[0].shape[0].value - n_pages = int(round(total_pages * test_sampling)) + total_series = tensors[0].shape[0].value + N_time_series = int(round(total_series * test_sampling)) def mk_name(prefix, tensor): return prefix + '_' + tensor.name[:-2] def prepare_split(i): - idx = tf.random_shuffle(tf.range(0, n_pages, dtype=tf.int32), seed + i) + idx = tf.random_shuffle(tf.range(0, N_time_series, dtype=tf.int32), seed + i) train_tensors = [tf.gather(tensor, idx, name=mk_name('shfl', tensor)) for tensor in tensors] - if test_sampling < 1.0: - sampled_idx = idx[:n_pages] + if test_sampling < 1.0: #Only use subset of time series = test_sampling + sampled_idx = idx[:N_time_series] test_tensors = [tf.gather(tensor, sampled_idx, name=mk_name('shfl_test', tensor)) for tensor in tensors] else: test_tensors = train_tensors - return Split(test_tensors, train_tensors, n_pages, total_pages) + return Split(test_tensors, train_tensors, N_time_series, total_series) self.splits = [prepare_split(i) for i in range(n_splits)] class InputPipe: - def cut(self, hits, start, end): +# def randomize_window_sizes(self, *args): +# self.horizon_window_size = tf.random_uniform((), 7, 60, dtype=tf.int32) +# self.history_window_size = tf.random_uniform((), 7, 366, dtype=tf.int32) +# self.attn_window = self.history_window_size - self.horizon_window_size + 1 +# self.max_train_empty = tf.cast(tf.round(tf.multiply(tf.cast(self.history_window_size,tf.float32),(1 - self.train_completeness_threshold))),tf.int32) +# self.max_predict_empty = tf.cast(tf.round(tf.multiply(tf.cast(self.horizon_window_size,tf.float32),(1 - self.predict_completeness_threshold))),tf.int32) +# return args + + def cut(self, counts, start, end): """ Cuts [start:end] diapason from input data - :param hits: hits timeseries + :param counts: counts timeseries :param start: start index :param end: end index - :return: tuple (train_hits, test_hits, dow, lagged_hits) + :return: tuple (train_counts, test_counts, lagged_counts, [subset of: dow,woy,moy,doy,year,holidays]) """ - # Pad hits to ensure we have enough array length for prediction - hits = tf.concat([hits, tf.fill([self.predict_window], np.NaN)], axis=0) - cropped_hit = hits[start:end] - - # cut day of week - cropped_dow = self.inp.dow[start:end] - - # Cut lagged hits - # gather() accepts only int32 indexes - cropped_lags = tf.cast(self.inp.lagged_ix[start:end], tf.int32) - # Mask for -1 (no data) lag indexes - lag_mask = cropped_lags < 0 - # Convert -1 to 0 for gather(), it don't accept anything exotic - cropped_lags = tf.maximum(cropped_lags, 0) - # Translate lag indexes to hit values - lagged_hit = tf.gather(hits, cropped_lags) - # Convert masked (see above) or NaN lagged hits to zeros - lag_zeros = tf.zeros_like(lagged_hit) - lagged_hit = tf.where(lag_mask | tf.is_nan(lagged_hit), lag_zeros, lagged_hit) - + + # Pad counts to ensure we have enough array length for prediction + counts = tf.concat([counts, tf.fill([self.horizon_window_size], np.NaN)], axis=0) + cropped_count = counts[start:end] +# cropped_count = tf.Print(cropped_count,['INPUT PIPE > CUT > cropped_count',tf.shape(cropped_count), 'start', start, 'end', end]) +# cropped_count = tf.Print(cropped_count,['self.history_window_size', self.history_window_size, 'self.horizon_window_size', self.horizon_window_size]) + + # 
============================================================================= + # Ordinal periodic variables + # which features are here depends on what the sampling period is for the data + # ============================================================================= + if self.sampling_period=='daily': + cropped_dow = self.inp.dow[start:end] + cropped_woy = self.inp.woy[start:end] + cropped_doy = self.inp.doy[start:end] + cropped_holidays = self.inp.holidays[start:end] +# cropped_moy = 0*cropped_dow #Month information is alreayd contained in week information. COuld incude anyway to be explicit, but for now do not use as a feature + elif self.sampling_period=='weekly': + cropped_woy = self.inp.woy[start:end] +# cropped_dow = 0*cropped_woy +# cropped_moy = 0*cropped_woy + elif self.sampling_period=='monthly': + cropped_moy = self.inp.moy[start:end] +# cropped_dow = 0*cropped_moy +# cropped_woy = 0*cropped_moy + + #ANd use year as a feature to get long term trend + cropped_year = self.inp.year[start:end] + + + # ============================================================================= + # Other features that are also time-varying + # that can be used, which depend on the choice of feature_set + # self.features_set = features_set + # ============================================================================= + + #If used Arturius' original feature set then will include the lagged data: +# if self.features_set == 'arturius': + if self.features_set=='arturius': + # Cut lagged counts + # gather() accepts only int32 indexes + cropped_lags = tf.cast(self.inp.lagged_ix[start:end], tf.int32) + # Mask for -1 (no data) lag indexes + lag_mask = cropped_lags < 0 + # Convert -1 to 0 for gather(), it don't accept anything exotic + cropped_lags = tf.maximum(cropped_lags, 0) + # Translate lag indexes to count values + lagged_count = tf.gather(counts, cropped_lags) + # Convert masked (see above) or NaN lagged counts to zeros + lag_zeros = tf.zeros_like(lagged_count) + lagged_count = tf.where(lag_mask | tf.is_nan(lagged_count), lag_zeros, lagged_count) + + + + #Will always have the count series (the series we predict on): # Split for train and test - x_hits, y_hits = tf.split(cropped_hit, [self.train_window, self.predict_window], axis=0) + x_counts, y_counts = tf.split(cropped_count, [self.history_window_size, self.horizon_window_size], axis=0) # Convert NaN to zero in for train data - x_hits = tf.where(tf.is_nan(x_hits), tf.zeros_like(x_hits), x_hits) - return x_hits, y_hits, cropped_dow, lagged_hit - - def cut_train(self, hits, *args): + x_counts = tf.where(tf.is_nan(x_counts), tf.zeros_like(x_counts), x_counts) + + + + if self.features_set=='arturius': + if self.sampling_period=='daily': + return x_counts, y_counts, lagged_count, cropped_dow, cropped_woy, cropped_doy, cropped_year, cropped_holidays + if self.sampling_period=='weekly': + return x_counts, y_counts, lagged_count, cropped_woy, cropped_year + if self.sampling_period=='monthly': + return x_counts, y_counts, lagged_count, cropped_moy, cropped_year + + + + elif self.features_set=='full': + if self.sampling_period=='daily': + return x_counts, y_counts, cropped_dow, cropped_woy, cropped_doy, cropped_year, cropped_holidays + if self.sampling_period=='weekly': + return x_counts, y_counts, cropped_woy, cropped_year, cropped_holidays + if self.sampling_period=='monthly': + return x_counts, y_counts, cropped_moy, cropped_year, cropped_holidays + else: + print(self.features_set) + raise Exception('problem with features_set') + + + def 
cut_train(self, counts, *args): """ Cuts a segment of time series for training. Randomly chooses starting point. - :param hits: hits timeseries + :param counts: counts timeseries :param args: pass-through data, will be appended to result :return: result of cut() + args """ - n_days = self.predict_window + self.train_window +# def randomize_window_sizes():#, *args): +# self.horizon_window_size = tf.Variable(tf.random_uniform((), 7, 60, dtype=tf.int32)) +# self.history_window_size = tf.Variable(tf.random_uniform((), 7, 366, dtype=tf.int32)) +# self.attn_window = self.history_window_size - self.horizon_window_size + 1 +# self.max_train_empty = tf.cast(tf.floor(tf.multiply(tf.cast(self.history_window_size,tf.float32),(1 - self.train_completeness_threshold))),tf.int32) +# self.max_predict_empty = tf.cast(tf.floor(tf.multiply(tf.cast(self.horizon_window_size,tf.float32),(1 - self.predict_completeness_threshold))),tf.int32) +# #return args +# randomize_window_sizes() + + n_timesteps = self.horizon_window_size + self.history_window_size # How much free space we have to choose starting day - free_space = self.inp.data_days - n_days - self.back_offset - self.start_offset + free_space = self.inp.data_timesteps - n_timesteps - self.back_offset - self.start_offset if self.verbose: + #!!!!!! doesn't really matter since this is just printout, but would need to change for WEEKLY / MONTHLY lower_train_start = self.inp.data_start + pd.Timedelta(self.start_offset, 'D') - lower_test_end = lower_train_start + pd.Timedelta(n_days, 'D') - lower_test_start = lower_test_end - pd.Timedelta(self.predict_window, 'D') + lower_test_end = lower_train_start + pd.Timedelta(n_timesteps, 'D') + lower_test_start = lower_test_end - pd.Timedelta(self.horizon_window_size, 'D') upper_train_start = self.inp.data_start + pd.Timedelta(free_space - 1, 'D') - upper_test_end = upper_train_start + pd.Timedelta(n_days, 'D') - upper_test_start = upper_test_end - pd.Timedelta(self.predict_window, 'D') + upper_test_end = upper_train_start + pd.Timedelta(n_timesteps, 'D') + upper_test_start = upper_test_end - pd.Timedelta(self.horizon_window_size, 'D') print(f"Free space for training: {free_space} days.") print(f" Lower train {lower_train_start}, prediction {lower_test_start}..{lower_test_end}") print(f" Upper train {upper_train_start}, prediction {upper_test_start}..{upper_test_end}") # Random starting point offset = tf.random_uniform((), self.start_offset, free_space, dtype=tf.int32, seed=self.rand_seed) - end = offset + n_days +# offset = tf.Print(offset,['offset',tf.shape(offset),offset]) + end = offset + n_timesteps # Cut all the things - return self.cut(hits, offset, end) + args + return self.cut(counts, offset, end) + args - def cut_eval(self, hits, *args): + def cut_eval(self, counts, *args): """ Cuts segment of time series for evaluation. 
- Always cuts train_window + predict_window length segment beginning at start_offset point - :param hits: hits timeseries + Always cuts history_window_size + horizon_window_size length segment beginning at start_offset point + :param counts: counts timeseries :param args: pass-through data, will be appended to result :return: result of cut() + args """ - end = self.start_offset + self.train_window + self.predict_window - return self.cut(hits, self.start_offset, end) + args + end = self.start_offset + self.history_window_size + self.horizon_window_size + return self.cut(counts, self.start_offset, end) + args - def reject_filter(self, x_hits, y_hits, *args): + def reject_filter(self, x_counts, y_counts, *args): """ Rejects timeseries having too many zero datapoints (more than self.max_train_empty) + [by this point, NANs would have already been converted to 0's, this is is NAN's U 0's] """ if self.verbose: print("max empty %d train %d predict" % (self.max_train_empty, self.max_predict_empty)) - zeros_x = tf.reduce_sum(tf.to_int32(tf.equal(x_hits, 0.0))) + zeros_x = tf.reduce_sum(tf.to_int32(tf.equal(x_counts, 0.0))) keep = zeros_x <= self.max_train_empty return keep - def make_features(self, x_hits, y_hits, dow, lagged_hits, pf_agent, pf_country, pf_site, page_ix, - page_popularity, year_autocorr, quarter_autocorr): + + + + + def make_features(self, *args): """ Main method. Assembles input data into final tensors """ - # Split day of week to train and test - x_dow, y_dow = tf.split(dow, [self.train_window, self.predict_window], axis=0) - - # Normalize hits - mean = tf.reduce_mean(x_hits) - std = tf.sqrt(tf.reduce_mean(tf.squared_difference(x_hits, mean))) - norm_x_hits = (x_hits - mean) / std - norm_y_hits = (y_hits - mean) / std - norm_lagged_hits = (lagged_hits - mean) / std - - # Split lagged hits to train and test - x_lagged, y_lagged = tf.split(norm_lagged_hits, [self.train_window, self.predict_window], axis=0) - - # Combine all page features into single tensor - stacked_features = tf.stack([page_popularity, quarter_autocorr, year_autocorr]) - flat_page_features = tf.concat([pf_agent, pf_country, pf_site, stacked_features], axis=0) - page_features = tf.expand_dims(flat_page_features, 0) - - # Train features - x_features = tf.concat([ - # [n_days] -> [n_days, 1] - tf.expand_dims(norm_x_hits, -1), - x_dow, - x_lagged, - # Stretch page_features to all training days - # [1, features] -> [n_days, features] - tf.tile(page_features, [self.train_window, 1]) - ], axis=1) - + # ============================================================================= + # Unpack the vars depending on which features_set - sampling_period + # The order needs to match the output of the cut method. + # cut_train and cut_eval return args + cut_output + # the args are things like pf_agent, p + # the cut_output is the same order as the return of the cut method. 
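A quick standalone sketch of the ordering contract assumed here: the cutter returns its own outputs first and then the pass-through args, and make_features() must unpack them in exactly that order. Names and lengths below are toy stand-ins, not the real feature set.

    import numpy as np

    def toy_cut(counts, start, end, history):
        # mirrors the idea of InputPipe.cut(): slice a window, split into history/horizon parts
        window = counts[start:end]
        return window[:history], window[history:]

    def toy_make_features(x_counts, y_counts, series_id):
        # unpack in the same order: cut() outputs first, pass-through args last
        return {'x': x_counts, 'y': y_counts, 'id': series_id}

    counts = np.arange(10.0)
    out = toy_cut(counts, start=0, end=7, history=5) + ('series_42',)
    features = toy_make_features(*out)   # an ordering mismatch would silently mis-assign features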
+ # ============================================================================= + print(args) + if self.features_set == 'arturius': + if self.sampling_period == 'daily': + x_counts, y_counts, lagged_counts, dow, woy, doy, year, holidays, pf_agent, pf_country, pf_site, page_ix, count_median, year_autocorr, quarter_autocorr, count_pctl_100 = args + elif self.sampling_period == 'weekly': + x_counts, y_counts, lagged_counts, woy, year, holidays, pf_agent, pf_country, pf_site, page_ix, count_median, year_autocorr, quarter_autocorr, count_pctl_100 = args + elif self.sampling_period == 'monthly': + x_counts, y_counts, lagged_counts, moy, year, holidays, pf_agent, pf_country, pf_site, page_ix, count_median, year_autocorr, quarter_autocorr, count_pctl_100 = args + #For now just use the same ... +# count_pctl_0, count_pctl_5, count_pctl_25, count_pctl_75, count_pctl_95, count_pctl_100, count_variance) + elif self.features_set == 'full': + if self.sampling_period == 'daily': + x_counts, y_counts, dow, woy, doy, year, holidays, page_ix, count_median,\ + count_pctl_0, count_pctl_5, count_pctl_25, count_pctl_75, count_pctl_95, count_pctl_100, count_variance = args + elif self.sampling_period == 'weekly': + x_counts, y_counts, woy, year, holidays, page_ix, count_median,\ + count_pctl_0, count_pctl_5, count_pctl_25, count_pctl_75, count_pctl_95, count_pctl_100, count_variance = args + elif self.sampling_period == 'monthly': + x_counts, y_counts, moy, year, holidays, page_ix, count_median,\ + count_pctl_0, count_pctl_5, count_pctl_25, count_pctl_75, count_pctl_95, count_pctl_100, count_variance = args + + # ============================================================================= + # Do train - predict splits + # ============================================================================= + if self.sampling_period == 'daily': + x_dow, y_dow = tf.split(dow, [self.history_window_size, self.horizon_window_size], axis=0) + x_woy, y_woy = tf.split(woy, [self.history_window_size, self.horizon_window_size], axis=0) + x_doy, y_doy = tf.split(doy, [self.history_window_size, self.horizon_window_size], axis=0) + x_holidays, y_holidays = tf.split(holidays, [self.history_window_size, self.horizon_window_size], axis=0) + elif self.sampling_period == 'weekly': + x_woy, y_woy = tf.split(woy, [self.history_window_size, self.horizon_window_size], axis=0) + elif self.sampling_period == 'monthly': + x_moy, y_moy = tf.split(moy, [self.history_window_size, self.horizon_window_size], axis=0) + + #Already did a manual kind of scaling for year in make_features.py so don't need to normalize here... 
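The normalisation block below computes its statistics from the history window only, and the same mean/std pair is later used to invert the predictions (decode_predictions in model.py). A minimal NumPy sketch of that idea, with made-up numbers standing in for one already log1p-transformed series:

    import numpy as np

    x_counts = np.array([3.0, 0.0, 5.0, 4.0, 2.0])   # history window
    y_counts = np.array([4.0, 6.0])                   # horizon window (targets)

    mean, std = x_counts.mean(), x_counts.std()       # statistics from the history window only
    norm_x = (x_counts - mean) / std
    norm_y = (y_counts - mean) / std                  # targets scaled with the same statistics

    decoded = norm_y * std + mean                     # the inverse transform applied to model output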
+ x_year, y_year = tf.split(year, [self.history_window_size, self.horizon_window_size], axis=0) + x_year = tf.expand_dims(x_year,axis=1) + y_year = tf.expand_dims(y_year,axis=1) + + # Normalize counts + mean = tf.reduce_mean(x_counts) + std = tf.sqrt(tf.reduce_mean(tf.squared_difference(x_counts, mean))) + norm_x_counts = (x_counts - mean) / std + norm_y_counts = (y_counts - mean) / std + + if self.features_set == 'arturius': + norm_lagged_counts = (lagged_counts - mean) / std + # Split lagged counts to train and test + x_lagged, y_lagged = tf.split(norm_lagged_counts, [self.history_window_size, self.horizon_window_size], axis=0) + scalar_features = tf.stack([count_median, quarter_autocorr, year_autocorr, count_pctl_100]) + flat_features = tf.concat([pf_agent, pf_country, pf_site, scalar_features], axis=0) + series_features = tf.expand_dims(flat_features, 0) + elif self.features_set == 'full': + scalar_features = tf.stack([count_median, count_pctl_0, count_pctl_5, count_pctl_25, count_pctl_75, count_pctl_95, count_pctl_100, count_variance]) + #flat_features = tf.concat([pf_agent, pf_country, pf_site, scalar_features], axis=0) + flat_features = tf.concat([scalar_features], axis=0) + series_features = tf.expand_dims(flat_features, 0) + + + + +# print(scalar_features) #4 +# print(flat_features) #18 +# print(series_features) +# print([pf_agent, pf_country, pf_site]) #4, 7, 3 #the one hot encoded features + + + #Any time dependent feature need to be split into x [train] and y [test] + #the time INdependent features [constant per fixture] will just be tiled same way either way except diff lengths + + # Train features, depending on measurement frequency + x_features = tf.expand_dims(norm_x_counts, -1) # [n_timesteps] -> [n_timesteps, 1] + if self.sampling_period == 'daily': + x_features = tf.concat([x_features, x_dow, x_woy, tf.expand_dims(x_doy,-1), x_year, x_holidays], axis=1) + elif self.sampling_period == 'weekly': + x_features = tf.concat([x_features, x_woy, x_year, x_holidays], axis=1) + elif self.sampling_period == 'monthly': + x_features = tf.concat([x_features, x_moy, x_year, x_holidays], axis=1) + + #Regardess of period/frequency will have below features: + if self.features_set == 'arturius': + x_features = tf.concat([x_features, x_lagged, + # Stretch series_features to all training days + # [1, features] -> [n_timesteps, features] + tf.tile(series_features, [self.history_window_size, 1])], axis=1) + elif self.features_set == 'full': + x_features = tf.concat([x_features, + # Stretch series_features to all training days + # [1, features] -> [n_timesteps, features] + tf.tile(series_features, [self.history_window_size, 1])], axis=1) + + # Test features - y_features = tf.concat([ - # [n_days] -> [n_days, 1] - y_dow, - y_lagged, - # Stretch page_features to all testing days - # [1, features] -> [n_days, features] - tf.tile(page_features, [self.predict_window, 1]) - ], axis=1) - - return x_hits, x_features, norm_x_hits, x_lagged, y_hits, y_features, norm_y_hits, mean, std, flat_page_features, page_ix - - def __init__(self, inp: VarFeeder, features: Iterable[tf.Tensor], n_pages: int, mode: ModelMode, n_epoch=None, - batch_size=127, runs_in_burst=1, verbose=True, predict_window=60, train_window=500, + if self.sampling_period == 'daily': + y_features = tf.concat([y_dow, y_woy, tf.expand_dims(y_doy,-1), y_year, y_holidays], axis=1) + elif self.sampling_period == 'weekly': + y_features = tf.concat([y_woy, y_year, y_holidays], axis=1) + elif self.sampling_period == 'monthly': + y_features = 
tf.concat([y_moy, y_year, y_holidays], axis=1) + + if self.features_set == 'arturius': + y_features = tf.concat([y_features, y_lagged, + # Stretch series_features to all testing days + # [1, features] -> [n_timesteps, features] + tf.tile(series_features, [self.horizon_window_size, 1]) + ], axis=1) + elif self.features_set == 'full': + y_features = tf.concat([y_features, + # Stretch series_features to all testing days + # [1, features] -> [n_timesteps, features] + tf.tile(series_features, [self.horizon_window_size, 1]) + ], axis=1) + +# print(x_features) + + #!!!!! why no lagged_y alnoe, only in y_features??? + #!!!! why no norm_y_counts ????? + + print('x_features') + print(x_features) + print(x_features.shape) + if self.features_set == 'arturius': + return x_counts, x_features, norm_x_counts, x_lagged, y_counts, y_features, norm_y_counts, mean, std, flat_features, page_ix + if self.features_set == 'full': + return x_counts, x_features, norm_x_counts, y_counts, y_features, norm_y_counts, mean, std, flat_features, page_ix + #Must match up with setting self.XYZ = it_tensors below in __init__. + + + def __init__(self, features_set, sampling_period, inp: VarFeeder, features: Iterable[tf.Tensor], N_time_series: int, mode: ModelMode, n_epoch=None, + batch_size=127, runs_in_burst=1, verbose=True, horizon_window_size=60, history_window_size=200, train_completeness_threshold=1, predict_completeness_threshold=1, back_offset=0, train_skip_first=0, rand_seed=None): """ Create data preprocessing pipeline + features_set - arturius, simple, full, full_w_context + sampling_period - daily, weekly, monthly :param inp: Raw input data :param features: Features tensors (subset of data in inp) - :param n_pages: Total number of pages + :param N_time_series: Total number of pages :param mode: Train/Predict/Eval mode selector :param n_epoch: Number of epochs. Generates endless data stream if None :param batch_size: :param runs_in_burst: How many batches can be consumed at short time interval (burst). Multiplicator for prefetch() :param verbose: Print additional information during graph construction - :param predict_window: Number of days to predict - :param train_window: Use train_window days for traning + :param horizon_window_size: Number of days to predict + :param history_window_size: Use history_window_size days for traning :param train_completeness_threshold: Percent of zero datapoints allowed in train timeseries. :param predict_completeness_threshold: Percent of zero datapoints allowed in test/predict timeseries. 
:param back_offset: Don't use back_offset days at the end of timeseries @@ -242,61 +453,169 @@ def __init__(self, inp: VarFeeder, features: Iterable[tf.Tensor], n_pages: int, :param rand_seed: """ - self.n_pages = n_pages + + self.features_set = features_set + self.sampling_period = sampling_period + + self.N_time_series = N_time_series self.inp = inp self.batch_size = batch_size self.rand_seed = rand_seed self.back_offset = back_offset if verbose: print("Mode:%s, data days:%d, Data start:%s, data end:%s, features end:%s " % ( - mode, inp.data_days, inp.data_start, inp.data_end, inp.features_end)) + mode, inp.data_timesteps, inp.data_start, inp.data_end, inp.features_end)) if mode == ModelMode.TRAIN: - # reserve predict_window at the end for validation - assert inp.data_days - predict_window > predict_window + train_window, \ + # reserve horizon_window_size at the end for validation + assert inp.data_timesteps - horizon_window_size > horizon_window_size + history_window_size, \ "Predict+train window length (+predict window for validation) is larger than total number of days in dataset" self.start_offset = train_skip_first elif mode == ModelMode.EVAL or mode == ModelMode.PREDICT: - self.start_offset = inp.data_days - train_window - back_offset + print('inp.data_timesteps',inp.data_timesteps) + print('history_window_size',history_window_size) + print('back_offset',back_offset) + self.start_offset = inp.data_timesteps - history_window_size - back_offset #!!!!! if verbose: train_start = inp.data_start + pd.Timedelta(self.start_offset, 'D') - eval_start = train_start + pd.Timedelta(train_window, 'D') - end = eval_start + pd.Timedelta(predict_window - 1, 'D') + eval_start = train_start + pd.Timedelta(history_window_size, 'D') + end = eval_start + pd.Timedelta(horizon_window_size - 1, 'D') print("Train start %s, predict start %s, end %s" % (train_start, eval_start, end)) assert self.start_offset >= 0 - self.train_window = train_window - self.predict_window = predict_window - self.attn_window = train_window - predict_window + 1 - self.max_train_empty = int(round(train_window * (1 - train_completeness_threshold))) - self.max_predict_empty = int(round(predict_window * (1 - predict_completeness_threshold))) + + #For random unfirom draws: leave these as placeholders + self.history_window_size = history_window_size #!!!!!!!!!!!random resize + self.horizon_window_size = horizon_window_size#!!!!!!!!!!!random resize + self.attn_window = history_window_size - horizon_window_size + 1#!!!!!!!!!!!random resize + #For train empty, if max_train_empty=history, then on data with many missing values, + #you can get all NAN series, which then causes NANs in inp.time_x and + #destroys the encoded_state and kills everything. 
You need to have at + #least 1 valid value in the history window, so do min(history-1, xxxxx) + self.max_train_empty = min(history_window_size-1, int(np.floor(history_window_size * (1 - train_completeness_threshold)))) + self.max_predict_empty = int(np.floor(horizon_window_size * (1 - predict_completeness_threshold))) + +# self.history_window_size = tf.placeholder(tf.int32, shape=[], name='history_window_size') +# self.horizon_window_size = tf.placeholder(tf.int32, shape=[], name='horizon_window_size') +# self.attn_window = tf.placeholder(tf.int32, shape=[], name='attn_window') +# self.max_train_empty = tf.placeholder(tf.int32, shape=[], name='max_train_empty') +# self.max_predict_empty = tf.placeholder(tf.int32, shape=[], name='max_predict_empty') + +# self.history_window_size = tf.constant(222,tf.int32, shape=[], name='history_window_size') +# self.horizon_window_size = tf.constant(33,tf.int32, shape=[], name='horizon_window_size') +# self.attn_window = 111#history_window_size - horizon_window_size + 1 +# self.max_train_empty = 111#tf.placeholder(tf.int32, shape=[], name='max_train_empty') +# self.max_predict_empty = 1111#tf.placeholder(tf.int32, shape=[], name='max_predict_empty') + + self.mode = mode self.verbose = verbose - + + self.train_completeness_threshold = train_completeness_threshold + self.predict_completeness_threshold = predict_completeness_threshold + + + print('max_train_empty',self.max_train_empty) + print('max_predict_empty',self.max_predict_empty) + print('history_window_size',self.history_window_size) + print('horizon_window_size',self.horizon_window_size) + print('attn_window',self.attn_window) + + +# def random_draw_new_window_sizes(): +# history = np.random.randint(low=7,high=120+1) +# horizon = np.random.randint(low=7,high=60+1) +# self.history_window_size = history +# self.horizon_window_size = horizon +# self.attn_window = history - horizon + 1 +# self.max_train_empty = min(history_window_size-1, int(np.floor(history_window_size * (1 - train_completeness_threshold)))) +# self.max_predict_empty = int(np.floor(horizon * (1 - self.predict_completeness_threshold))) + + + # Reserve more processing threads for eval/predict because of larger batches num_threads = 3 if mode == ModelMode.TRAIN else 6 # Choose right cutter function for current ModelMode cutter = {ModelMode.TRAIN: self.cut_train, ModelMode.EVAL: self.cut_eval, ModelMode.PREDICT: self.cut_eval} # Create dataset, transform features and assemble batches + #features is a list of tensors (one tensor per feature: counts, page_ix, ..., count_variance) + print('features',features) + + +# for _ in range(10):#max(n_epoch,20)): +# print('\n'*5) +# random_draw_new_window_sizes() +# print('max_train_empty',self.max_train_empty) +# print('max_predict_empty',self.max_predict_empty) +# print('history_window_size',self.history_window_size) +# print('horizon_window_size',self.horizon_window_size) +# print('attn_window',self.attn_window) +# +# root_ds = tf.data.Dataset.from_tensor_slices(tuple(features)).repeat(n_epoch) +# # print(root_ds.output_classes, root_ds.output_shapes, root_ds.output_types,) +# print('root_ds.output_shapes',root_ds.output_shapes) +# print('root_ds.output_types',root_ds.output_types) +# # batch = (root_ds +# # .map(cutter[mode]) +# # .filter(self.reject_filter) +# # .map(self.make_features, num_parallel_calls=num_threads) +# # .batch(batch_size) +# # .prefetch(runs_in_burst * 2) +# # ) +# +# #TEST:change horisoron jiostory +# batch = 
root_ds.map(cutter[mode]).filter(self.reject_filter).map(self.make_features, num_parallel_calls=num_threads) +# print('batch MFM', batch) +# +# batch = batch.batch(batch_size) +# print('batch B', batch) +# +# batch = batch.prefetch(runs_in_burst * 2) +# print('batch P', batch) +# batch = (batch) + root_ds = tf.data.Dataset.from_tensor_slices(tuple(features)).repeat(n_epoch) batch = (root_ds +# .map(self.randomize_window_sizes) .map(cutter[mode]) .filter(self.reject_filter) .map(self.make_features, num_parallel_calls=num_threads) .batch(batch_size) .prefetch(runs_in_burst * 2) - ) - + ) + + print('---------------- Done batching ----------------') + print(batch) self.iterator = batch.make_initializable_iterator() it_tensors = self.iterator.get_next() +# print('self.iterator',self.iterator) +# print('it_tensors',it_tensors) # Assign all tensors to class variables - self.true_x, self.time_x, self.norm_x, self.lagged_x, self.true_y, self.time_y, self.norm_y, self.norm_mean, \ - self.norm_std, self.page_features, self.page_ix = it_tensors + #self.time_x is the tensor of features, regardless of which feature set, so this can stay same. + if self.features_set=='arturius': + self.true_x, self.time_x, self.norm_x, self.lagged_x, self.true_y, self.time_y, self.norm_y, self.norm_mean, \ + self.norm_std, self.series_features, self.page_ix = it_tensors + elif self.features_set=='full': + self.true_x, self.time_x, self.norm_x, self.true_y, self.time_y, self.norm_y, self.norm_mean, \ + self.norm_std, self.series_features, self.page_ix = it_tensors + print('self.true_x', self.true_x) +# self.true_x = tf.Print(self.true_x,['self.true_x',self.true_x]) + print('self.time_x', self.time_x) +# print('self.time_y', self.time_y) +# self.time_y = tf.Print(self.time_y,['self.time_y',self.time_y]) + """if self.features_set=='simple': + pass +# if self.features_set=='full': +# pass + if self.features_set=='full_w_context': + pass""" self.encoder_features_depth = self.time_x.shape[2].value - + print('self.encoder_features_depth',self.encoder_features_depth) + print('self.time_x.shape',self.time_x.shape) + def load_vars(self, session): self.inp.restore(session) @@ -304,6 +623,38 @@ def init_iterator(self, session): session.run(self.iterator.initializer) -def page_features(inp: VarFeeder): - return (inp.hits, inp.pf_agent, inp.pf_country, inp.pf_site, - inp.page_ix, inp.page_popularity, inp.year_autocorr, inp.quarter_autocorr) +def page_features(inp: VarFeeder, features_set): + """ + Other than inp.counts, these features are the static features. + So do not need to pass in here the time-varying ones like day of week, + month of year, lagged, etc. 
+ + DO NOT return dow, woy, moy, year, doy, holidays + """ + + if features_set=='arturius': + d = (inp.counts, inp.pf_agent, inp.pf_country, inp.pf_site, + inp.page_ix, inp.count_median, inp.year_autocorr, inp.quarter_autocorr, + inp.count_pctl_100 + ) + +# elif features_set=='simple': +# raise Exception('not ready yet') + + elif features_set=='full': + d = (inp.counts, + inp.page_ix, + inp.count_median, +# inp.year_autocorr, inp.quarter_autocorr, + inp.count_pctl_0, + inp.count_pctl_5, + inp.count_pctl_25, + inp.count_pctl_75, + inp.count_pctl_95, + inp.count_pctl_100, + inp.count_variance) + +# elif features_set=='full_w_context': +# raise Exception('not ready yet') + + return d \ No newline at end of file diff --git a/make_features.py b/make_features.py old mode 100644 new mode 100755 index 23d547f..2a15801 --- a/make_features.py +++ b/make_features.py @@ -9,6 +9,8 @@ import numba from typing import Tuple, Dict, Collection, List +from holiday_features import encode_all_holidays__daily + def read_cached(name) -> pd.DataFrame: """ @@ -16,8 +18,8 @@ def read_cached(name) -> pd.DataFrame: :param name: file name without extension :return: file content """ - cached = 'data/%s.pkl' % name - sources = ['data/%s.csv' % name, 'data/%s.csv.zip' % name] + cached = '%s.pkl' % name + sources = ['%s.csv' % name, '%s.csv.zip' % name] if os.path.exists(cached): return pd.read_pickle(cached) else: @@ -28,55 +30,64 @@ def read_cached(name) -> pd.DataFrame: return df -def read_all() -> pd.DataFrame: +def read_all(data_type,sampling_period,mode_chunk_name) -> pd.DataFrame: """ Reads source data for training/prediction """ def read_file(file): - df = read_cached(file).set_index('Page') + try: + df = read_cached(file).set_index('Page') + except AttributeError: + raise Exception('File not exist, did you specify correct sampling_period?') df.columns = df.columns.astype('M8[D]') return df # Path to cached data - path = os.path.join('data', 'all.pkl') + path = os.path.join('data', 'all_{}.pkl'.format(mode_chunk_name)) + if os.path.exists(path): df = pd.read_pickle(path) else: - # Official data - df = read_file('train_2') - # Scraped data - scraped = read_file('2017-08-15_2017-09-11') - # Update last two days by scraped data - df[pd.Timestamp('2017-09-10')] = scraped['2017-09-10'] - df[pd.Timestamp('2017-09-11')] = scraped['2017-09-11'] + filename = f'data/{data_type}_{sampling_period}_{mode_chunk_name}' + df = read_file(filename) + + if data_type=='kaggle': + # Scraped data + scraped = read_file(r'data/2017-08-15_2017-09-11') + # Update last two days by scraped data + df[pd.Timestamp('2017-09-10')] = scraped['2017-09-10'] + df[pd.Timestamp('2017-09-11')] = scraped['2017-09-11'] df = df.sort_index() # Cache result df.to_pickle(path) return df -# todo:remove -def make_holidays(tagged, start, end) -> pd.DataFrame: - def read_df(lang): - result = pd.read_pickle('data/holidays/%s.pkl' % lang) - return result[~result.dw].resample('D').size().rename(lang) - holidays = pd.DataFrame([read_df(lang) for lang in ['de', 'en', 'es', 'fr', 'ja', 'ru', 'zh']]) - holidays = holidays.loc[:, start:end].fillna(0) - result =tagged[['country']].join(holidays, on='country').drop('country', axis=1).fillna(0).astype(np.int8) - result.columns = pd.DatetimeIndex(result.columns.values) - return result -def read_x(start, end) -> pd.DataFrame: +## todo:remove +#def make_holidays(tagged, start, end) -> pd.DataFrame: +# def read_df(lang): +# result = pd.read_pickle('data/holidays/%s.pkl' % lang) +# return 
result[~result.dw].resample('D').size().rename(lang) +# +# holidays = pd.DataFrame([read_df(lang) for lang in ['en']])#['de', 'en', 'es', 'fr', 'ja', 'ru', 'zh']]) #!!!!!!!!!!! can play around with this: english only +# holidays = holidays.loc[:, start:end].fillna(0) +# result =tagged[['country']].join(holidays, on='country').drop('country', axis=1).fillna(0).astype(np.int8) +# result.columns = pd.DatetimeIndex(result.columns.values) +# return result + + +def read_x(start, end, data_type, sampling_period, mode_chunk_name) -> pd.DataFrame: """ Gets source data from start to end date. Any date can be None """ - df = read_all() + df = read_all(data_type,sampling_period,mode_chunk_name) # User GoogleAnalitycsRoman has really bad data with huge traffic spikes in all incarnations. # Wikipedia banned him, we'll ban it too - bad_roman = df.index.str.startswith("User:GoogleAnalitycsRoman") - df = df[~bad_roman] +# bad_roman = df.index.str.startswith("User:GoogleAnalitycsRoman") +# df = df[~bad_roman] if start and end: return df.loc[:, start:end] elif end: @@ -107,7 +118,7 @@ def single_autocorr(series, lag): def batch_autocorr(data, lag, starts, ends, threshold, backoffset=0): """ Calculate autocorrelation for batch (many time series at once) - :param data: Time series, shape [n_pages, n_days] + :param data: Time series, shape [N_time_series, n_timesteps] :param lag: Autocorrelation lag :param starts: Start index for each series :param ends: End index for each series @@ -117,8 +128,8 @@ def batch_autocorr(data, lag, starts, ends, threshold, backoffset=0): autocorrelation value is NaN """ n_series = data.shape[0] - n_days = data.shape[1] - max_end = n_days - backoffset + n_timesteps = data.shape[1] + max_end = n_timesteps - backoffset corr = np.empty(n_series, dtype=np.float64) support = np.empty(n_series, dtype=np.float64) for i in range(n_series): @@ -143,28 +154,28 @@ def find_start_end(data: np.ndarray): """ Calculates start and end of real traffic data. 
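For intuition, the same first/last valid index computation can be sketched in vectorised NumPy; the file itself keeps the explicit per-series scan loop, and this illustration ignores the all-empty case, where the loop version returns -1.

    import numpy as np

    data = np.array([[np.nan, 0., 2., 5., 0., 3., np.nan],
                     [np.nan, np.nan, 0., 0., 1., 0., 0.]])
    valid = np.nan_to_num(data) > 0                              # non-NaN and non-zero
    start_idx = valid.argmax(axis=1)                             # first valid column per series
    end_idx = data.shape[1] - 1 - valid[:, ::-1].argmax(axis=1)  # last valid column per series
    # start_idx -> [2, 4], end_idx -> [5, 4]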
Start is an index of first non-zero, non-NaN value, end is index of last non-zero, non-NaN value - :param data: Time series, shape [n_pages, n_days] + :param data: Time series, shape [N_time_series, n_timesteps] :return: """ - n_pages = data.shape[0] - n_days = data.shape[1] - start_idx = np.full(n_pages, -1, dtype=np.int32) - end_idx = np.full(n_pages, -1, dtype=np.int32) - for page in range(n_pages): + N_time_series = data.shape[0] + n_timesteps = data.shape[1] + start_idx = np.full(N_time_series, -1, dtype=np.int32) + end_idx = np.full(N_time_series, -1, dtype=np.int32) + for page in range(N_time_series): # scan from start to the end - for day in range(n_days): + for day in range(n_timesteps): if not np.isnan(data[page, day]) and data[page, day] > 0: start_idx[page] = day break # reverse scan, from end to start - for day in range(n_days - 1, -1, -1): + for day in range(n_timesteps - 1, -1, -1): if not np.isnan(data[page, day]) and data[page, day] > 0: end_idx[page] = day break return start_idx, end_idx -def prepare_data(start, end, valid_threshold) -> Tuple[pd.DataFrame, pd.DataFrame, np.ndarray, np.ndarray]: +def prepare_data(start, end, valid_threshold, data_type, sampling_period, mode_chunk_name) -> Tuple[pd.DataFrame, pd.DataFrame, np.ndarray, np.ndarray]: """ Reads source data, calculates start and end of each series, drops bad series, calculates log1p(series) :param start: start date of effective time interval, can be None to start from beginning @@ -173,7 +184,7 @@ def prepare_data(start, end, valid_threshold) -> Tuple[pd.DataFrame, pd.DataFram ratio is less than threshold :return: tuple(log1p(series), nans, series start, series end) """ - df = read_x(start, end) + df = read_x(start, end, data_type, sampling_period, mode_chunk_name) starts, ends = find_start_end(df.values) # boolean mask for bad (too short) series page_mask = (ends - starts) / df.shape[1] < valid_threshold @@ -245,7 +256,7 @@ def encode_page_features(df) -> Dict[str, pd.DataFrame]: """ Applies one-hot encoding to page features and normalises result :param df: page features DataFrame (one column per feature) - :return: dictionary feature_name:encoded_values. Encoded values is [n_pages,n_values] array + :return: dictionary feature_name:encoded_values. Encoded values is [N_time_series,n_values] array """ def encode(column) -> pd.DataFrame: one_hot = pd.get_dummies(df[column], drop_first=False) @@ -259,29 +270,58 @@ def normalize(values: np.ndarray): return (values - values.mean()) / np.std(values) + + + + + def run(): parser = argparse.ArgumentParser(description='Prepare data') - parser.add_argument('data_dir') + parser.add_argument('data_dir', help="Directory of pickles, etc., e.g. 'data/TRAINset4' or 'data/TESTset4'") +# parser.add_argument('mode', help="Which mode running in, determines some directories: {'train','test'}") + + parser.add_argument('data_type', help="Which data set to use: {'kaggle','ours'}") + parser.add_argument('sampling_period', help="Sampling period for our data: {'daily','weekly','monthly'}") + parser.add_argument('features_set', help="Which set of features to use. His default for Kaggle vs. 
one of my custom sets: {'arturius','simple','full','full_w_context'}") + parser.add_argument('--valid_threshold', default=0.0, type=float, help="Series minimal length threshold (pct of data length)") - parser.add_argument('--add_days', default=64, type=int, help="Add N days in a future for prediction") + parser.add_argument('--add_days', default=0, type=int, help="Add N days in a future for prediction") parser.add_argument('--start', help="Effective start date. Data before the start is dropped") parser.add_argument('--end', help="Effective end date. Data past the end is dropped") parser.add_argument('--corr_backoffset', default=0, type=int, help='Offset for correlation calculation') args = parser.parse_args() + +# if args.mode=='train': +# data_dir = r"data/vars_TRAIN" +# elif args.mode=='test': +# data_dir = r"data/vars_TEST" + + mode_chunk_name = args.data_dir.split('/')[-1] + + print(args.data_dir, args.data_type, args.features_set) + # Get the data - df, nans, starts, ends = prepare_data(args.start, args.end, args.valid_threshold) + df, nans, starts, ends = prepare_data(args.start, args.end, args.valid_threshold, args.data_type, args.sampling_period, mode_chunk_name) + + # ============================================================================= + # STATIC FEATURES + # ============================================================================= # Our working date range data_start, data_end = df.columns[0], df.columns[-1] # We have to project some date-dependent features (day of week, etc) to the future dates for prediction - features_end = data_end + pd.Timedelta(args.add_days, unit='D') + features_end = data_end + pd.Timedelta(args.add_days, unit='D') #!!!!!!!!!!! will need to change for WEEKLY MONTHLY sampled print(f"start: {data_start}, end:{data_end}, features_end:{features_end}") # Group unique pages by agents assert df.index.is_monotonic_increasing - page_map = uniq_page_map(df.index.values) + #Only do this wikipedia web scraping if doing the kaggle comp. 
Not for ours + if args.data_type=='kaggle': + page_map = uniq_page_map(df.index.values) + + # Yearly(annual) autocorrelation raw_year_autocorr = batch_autocorr(df.values, 365, starts, ends, 1.5, args.corr_backoffset) @@ -298,49 +338,239 @@ def run(): quarter_autocorr = normalize(np.nan_to_num(raw_quarter_autocorr)) # Calculate and encode page features - page_features = make_page_features(df.index.values) - encoded_page_features = encode_page_features(page_features) - - # Make time-dependent features - features_days = pd.date_range(data_start, features_end) - #dow = normalize(features_days.dayofweek.values) - week_period = 7 / (2 * np.pi) - dow_norm = features_days.dayofweek.values / week_period - dow = np.stack([np.cos(dow_norm), np.sin(dow_norm)], axis=-1) - + if args.data_type=='kaggle': + page_features = make_page_features(df.index.values) + encoded_page_features = encode_page_features(page_features) + + +# #To get idea of overall scale of a time series, to compare between time series, which would be lost if just used standard scaled values: +# count_median = df.median(axis=1) +# count_median = normalize(count_median) +# #Play around w a few other basic summary stats +# percentiles = [] +# for pctl in [0,5,25,75,95,100]: +# percentiles.append(normalize(np.percentile(df.values,pctl,axis=1))) +# count_variance = normalize(np.var(df.values,axis=1)) +# #entropy = normalize(entropy(df.values,axis=1)) +# #filled_len = df.values.shape[1] - np.count_nonzero(np.isnan(df.values),axis=1) #non-nans +# #series_length = (df.values>0).sum(axis=1) #actually it has already been log transofmred so this is not correct +# + #!!!! Seems like the above way Kaggle does it is kind of against best practices [seems to use ~TEST set information to create features, + #in terms of how it does a whole series median, e.g., but at any given prediction, that would contribute to the median...] + #Better: use e.g. 1st 7 observations to get median, etc. features, since 7 is min history size. + #This is a lot more fair. Technixally, even this is ~cheating because depending on train_complete_threshold the model may have + #to make a prediction after seeing as few as 1 observation, so using first 7 is potentially unfair as well, depending on TCT. + #But is OK, and especially since most series esp. those we care about, will not have such bad missing. + MIN_HIST = 7 + count_median = [] + percs = [0,5,25,75,95,100] +# percentiles = np.zeros((len(df),len(percs))) + percentiles = [] + count_variance = [] + for rr in range(len(df)): + _ = df.iloc[rr].values + gg = np.where(_>0.)[0] + if gg.size>MIN_HIST-1: + #last_ind = gg[max(MIN_HIST-1,gg.size)] + last_ind = gg[MIN_HIST-1] + v = _[:last_ind][(_[:last_ind])>0.] + count_median.extend([np.nanmedian(v)]) + p = [] + for pctl in percs: + p.extend([np.percentile(v,pctl)]) + percentiles.append(p) + count_variance.extend([np.nanvar(v)]) + else: + #If the entire time series has fewer than MIN_HIST-1 valid values, just put in 0, won't matter because this series wouldn't be included anyway since so few values it shouldn't pass filter. 
+ count_median.extend([0.]) + percentiles.append([0.]*len(percs)) + count_variance.extend([0.]) + count_median = normalize(np.array(count_median)) + percentiles = [normalize(np.array(percentiles)[:,rr]) for rr in range(len(percs))] + count_variance = normalize(np.array(count_variance)) + + # ============================================================================= + # TIME-VARYING FEATURES + # ============================================================================= + #Could determine week of year number in several ways: 1) as in Pandas as starting on a particular day of week, + # 2. just use day of year / 365 + WEEK_NUMBER_METHOD = 'floor7'#'pandas' #'floor7' + WEEK_NUMBER_MAX = 53. #52. + + REFERENCE_FIRST_YEAR = 2010 #Use the year number as a feature, calculated as (year-REF_1)/(REF_2 - REF_1) to put on smaller scale + #(must be careful about normalizing on the fly within window, where depending on window size, most observations will have same year number, and 0 variance) + #so do this manual scaling instead of standard mean-var scaling + REFERENCE_LAST_YEAR = 2020 + + + features_times = pd.date_range(data_start, features_end, freq='D') + + if args.sampling_period=='daily': + #dow = normalize(features_times.dayofweek.values) + week_period = 7 / (2 * np.pi) + dow_norm = features_times.dayofweek.values / week_period #S.dayofweek gives day of the week with Monday=0, Sunday=6 + dow = np.stack([np.cos(dow_norm), np.sin(dow_norm)], axis=-1) + + #index of week number, when sampling at DAILY level + if WEEK_NUMBER_METHOD=='pandas': + week = features_times.weekofyear.values + elif WEEK_NUMBER_METHOD=='floor7': + week = np.floor((features_times.dayofyear.values - 1.) /7.) + year_period = WEEK_NUMBER_MAX / (2 * np.pi) #!!!! need to be carefuly non-uniform weeks [52 has 10 days ???] ---> actually in pandas numbering goes to 53, depending on start day of week for that year + woy_norm = week / year_period #not sure if by default this starts on Monday vs Sunday + woy = np.stack([np.cos(woy_norm), np.sin(woy_norm)], axis=-1) + #Also day of year number. Do not do same circle encoding, just let it be usual ordinal. + #Also, careful w leapyear. After February, year's w it would be out of phase vs. years w/o leap year + #Instead, could leave a gap for leapyear. If that particular year has it, fill it in with that ordinal, + #otherwise the model just does not have that index. + doy = features_times.dayofyear.values + #If not doing the circle encoding, then normalize: + doy = normalize(doy) + + + if args.sampling_period=='weekly': + #index of week number, when sampling at WEEKLY level (this is different than above) +# features_times = pd.date_range(data_start, features_end, freq='W') + #!!!!!!!!!!!!! still need to worry about alignment ... + if WEEK_NUMBER_METHOD=='pandas': + week = features_times.weekofyear.values + elif WEEK_NUMBER_METHOD=='floor7': + week = np.floor((features_times.dayofyear.values - 1.) /7.) + year_period = WEEK_NUMBER_MAX / (2 * np.pi) #!!!! need to be carefuly non-uniform weeks [52 has 10 days ???] ---> actually in pandas numbering goes to 53, depending on start day of week for that year + woy_norm = week / year_period #not sure if by default this starts on Monday vs Sunday + woy = np.stack([np.cos(woy_norm), np.sin(woy_norm)], axis=-1) + + if args.sampling_period=='monthly': + #month index (only used if sampling monthly) +# features_times = pd.date_range(data_start, features_end, freq='M') #!!!!! need to think about alignment of starting month on particular dates .... 
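The cos/sin ("circle") encoding used for dow and woy above, and for moy just below, maps a periodic index onto the unit circle so that the last value of a cycle sits next to the first one. A small standalone sketch of the same convention, using toy dates and plain NumPy/pandas:

    import numpy as np
    import pandas as pd

    times = pd.date_range('2017-01-01', periods=6, freq='D')
    week_period = 7 / (2 * np.pi)                    # same convention as the dow feature
    phase = times.dayofweek.values / week_period     # Monday=0 ... Sunday=6 mapped onto [0, 2*pi)
    dow = np.stack([np.cos(phase), np.sin(phase)], axis=-1)   # shape [n_timesteps, 2]
    # Sunday and Monday end up adjacent on the circle instead of 6 "units" apart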
+ period = 12. / (2 * np.pi) #!!!! need to be carefuly non-uniform weeks [52 has 10 days ???] ---> actually in pandas numbering goes to 53, depending on start day of week for that year + moy_norm = features_times.month.values / period #not sure if by default this starts on Monday vs Sunday + moy = np.stack([np.cos(moy_norm), np.sin(moy_norm)], axis=-1) + + #To catch longer term trending data, can also include year number. [depending on size of train / prediction windows and random sampling boundaries could be same value over whole series] + year = (features_times.year - REFERENCE_FIRST_YEAR)/float(REFERENCE_LAST_YEAR-REFERENCE_FIRST_YEAR) + + + #Holidays: try my "spiral encoding": + #Right now only doing for daily sampled data: + if args.sampling_period=='daily': + holidays = encode_all_holidays__daily(features_times) + + + + # Assemble indices for quarterly lagged data lagged_ix = np.stack(lag_indexes(data_start, features_end), axis=-1) - page_popularity = df.median(axis=1) - page_popularity = (page_popularity - page_popularity.mean()) / page_popularity.std() - # Put NaNs back df[nans] = np.NaN - # Assemble final output - tensors = dict( - hits=df, - lagged_ix=lagged_ix, - page_map=page_map, - page_ix=df.index.values, - pf_agent=encoded_page_features['agent'], - pf_country=encoded_page_features['country'], - pf_site=encoded_page_features['site'], - page_popularity=page_popularity, - year_autocorr=year_autocorr, - quarter_autocorr=quarter_autocorr, - dow=dow, - ) + #Compile the features + print(f'Using {args.features_set} set of features') + + if args.features_set == 'arturius': + if args.data_type != 'kaggle': + raise Exception('arturius features can only work with data_type "kaggle" since scrapes wikipedia pages') + tensors = dict( + counts=df, + lagged_ix=lagged_ix, + page_map=page_map, + page_ix=df.index.values, + pf_agent=encoded_page_features['agent'],#ll-access_all-agents all-access_spider desktop_all-agents mobile-web_all-agents + pf_country=encoded_page_features['country'],#de en es fr ja ru zh + pf_site=encoded_page_features['site'], #commons.wikimedia.org wikipedia.org www.mediawiki.org + count_median=count_median, + year_autocorr=year_autocorr, + quarter_autocorr=quarter_autocorr, + #dow=dow,#N x 2 array since encoded week periodicity as complex number + count_pctl_100=percentiles[5],#max #!!!!!!!!!!!!!!!! just to see what happens: apend one of my features. + ) + +# elif args.features_set == 'simple': +# tensors = dict( +# counts=df, +# count_median=count_median,#this is just the median feature, can put in others too +# #dow=dow, +# ) + + elif (args.features_set == 'full') or (args.features_set == 'full_w_context'): + tensors = dict( + counts=df, +# lagged_ix=lagged_ix, + page_map=np.zeros(len(df)),#just set to a dummy all 0's #could also use this place to code 0,1 for real vs. synthetic augmented data... #page_map is only used in "Splitter", which is only for side_split validation option. + page_ix=df.index.values,#!!!!!! 
+ +# year_autocorr=year_autocorr, +# quarter_autocorr=quarter_autocorr, + count_median=count_median,#this is just the median feature, can put in others too + count_variance=count_variance,#variance + #entropy + + #percentiles + count_pctl_0=percentiles[0],#min + count_pctl_5=percentiles[1],#5th percentile + count_pctl_25=percentiles[2],#25th percentile + count_pctl_75=percentiles[3],#75th percentile + count_pctl_95=percentiles[4],#95th percentile + count_pctl_100=percentiles[5],#max +# series_length=series_length,#length of series [number of samples] to get idea of how much history a series has #number nonzero + + #Other time-frequency/scale features + #tsfresh features + #... + ) + + else: + raise Exception(f'features_set must be specified\nOne of ["arturius","simple","full","full_w_context"]') + + + if args.sampling_period=='daily': + tensors['dow']=dow + tensors['woy']=woy #and want want week number too, aggregating last ~10 days into week 52 + tensors['doy']=doy + tensors['holidays']=holidays + elif args.sampling_period=='weekly': + tensors['woy']=woy + elif args.sampling_period=='monthly': + tensors['moy']=moy + else: + raise Exception('Must specify correct sampling period') + + #Also use year number as feature + tensors['year']=year + + + """#If provide other info based on e.g. new location (any features that are not derived purely from the time series) + if args.features_set == 'full_w_context': + tensors['country'] = asdasdasd + tensors['region'] = asdasdasd + tensors['city_population'] = asdasdasd + raise Exception('not implemented yet') + #... can write scraper function to get these ...""" + + + + + + + + plain = dict( - features_days=len(features_days), - data_days=len(df.columns), - n_pages=len(df), + features_times=len(features_times), + data_timesteps=len(df.columns), + N_time_series=len(df), data_start=data_start, data_end=data_end, features_end=features_end ) + + print(tensors) + print(plain) + print(tensors.keys()) + print(plain.keys()) + # Store data to the disk VarFeeder(args.data_dir, tensors, plain) diff --git a/model.py b/model.py index 4d658d8..fe40f8f 100644 --- a/model.py +++ b/model.py @@ -7,14 +7,30 @@ from tensorflow.python.util import nest from cocob import COCOB +from Adam_HD_optimizer import AdamHDOptimizer +from SGDN_HD_optimizer import MomentumSGDHDOptimizer from input_pipe import InputPipe, ModelMode + GRAD_CLIP_THRESHOLD = 10 RNN = cudnn_rnn.CudnnGRU -# RNN = tf.contrib.cudnn_rnn.CudnnLSTM +#RNN = tf.contrib.cudnn_rnn.CudnnLSTM # RNN = tf.contrib.cudnn_rnn.CudnnRNNRelu + + + +def debug_tensor_print(tensor): + """ + Debugging mode: + Print info about a tensor in realtime + """ + tensor_list = [tensor.name, tf.shape(tensor), tensor] + tensor = tf.Print(tensor, tensor_list) + return tensor + + def default_init(seed): # replica of tf.glorot_uniform_initializer(seed=seed) return layers.variance_scaling_initializer(factor=1.0, @@ -62,16 +78,24 @@ def make_encoder(time_inputs, encoder_features_depth, is_train, hparams, seed, t :param transpose_output: Transform RNN output to batch-first shape :return: """ - + DIRECTION = 'unidirectional'#'bidirectional',#'unidirectional', #Let's try bidirectional as well, or ,ay as well try keeping unidirectional but with order reversed, just see what happens def build_rnn(): return RNN(num_layers=hparams.encoder_rnn_layers, num_units=hparams.rnn_depth, input_size=encoder_features_depth, - direction='unidirectional', + direction=DIRECTION, + #assume merge mode default is concat?? + #need to fix dimensions error. 
If could change merge mode to sum or mean or something then at least output dimension is same so might be easiest way to avoid error ? dropout=hparams.encoder_dropout if is_train else 0, seed=seed) static_p_size = cuda_params_size(build_rnn) +# static_p_size = tf.Print(static_p_size,['static_p_size',static_p_size]) +# print('static_p_size',static_p_size) cuda_model = build_rnn() +# time_inputs = tf.check_numerics(time_inputs,'time_inputs') + params_size_t = cuda_model.params_size() +# params_size_t = tf.Print(static_p_size,['params_size_t',params_size_t]) +# print('params_size_t',params_size_t) with tf.control_dependencies([tf.assert_equal(params_size_t, [static_p_size])]): cuda_params = tf.get_variable("cuda_rnn_params", initializer=tf.random_uniform([static_p_size], minval=-0.05, maxval=0.05, @@ -79,14 +103,24 @@ def build_rnn(): ) def build_init_state(): - batch_len = tf.shape(time_inputs)[0] + batch_len = tf.shape(time_inputs)[0] #!!!!!!!! for random history/horizon size, may need to adjust return tf.zeros([hparams.encoder_rnn_layers, batch_len, hparams.rnn_depth], dtype=tf.float32) +# def build_init_state(): +# batch_len = tf.shape(time_inputs)[0] #!!!!!!!! for random history/horizon size, may need to adjust +# if DIRECTION == 'unidirectional': +# return tf.zeros([1, hparams.encoder_rnn_layers, batch_len, hparams.rnn_depth], dtype=tf.float32) +# if DIRECTION == 'bidirectional': +# return tf.zeros([2, hparams.encoder_rnn_layers, batch_len, hparams.rnn_depth], dtype=tf.float32) input_h = build_init_state() # [batch, time, features] -> [time, batch, features] time_first = tf.transpose(time_inputs, [1, 0, 2]) rnn_time_input = time_first + + +# cuda_params = tf.Print(cuda_params,['cuda_params',tf.shape(cuda_params),cuda_params]) #???? shape is [233892] + model = partial(cuda_model, input_data=rnn_time_input, input_h=input_h, params=cuda_params) if RNN == tf.contrib.cudnn_rnn.CudnnLSTM: rnn_out, rnn_state, c_state = model(input_c=build_init_state()) @@ -95,6 +129,16 @@ def build_init_state(): c_state = None if transpose_output: rnn_out = tf.transpose(rnn_out, [1, 0, 2]) + + + #Need to check for NANs that are sometimes happening + rnn_out = tf.check_numerics(rnn_out,'rnn_out') + rnn_state = tf.check_numerics(rnn_state,'rnn_state') + + +# rnn_out = tf.Print(rnn_out,['rnn_out',rnn_out]) +# rnn_state = tf.Print(rnn_state,['rnn_state',rnn_state,'encoder_features_depth',encoder_features_depth]) +# encoder_features_depth = tf.Print(encoder_features_depth,['encoder_features_depth',encoder_features_depth]) return rnn_out, rnn_state, c_state @@ -151,6 +195,7 @@ def make_fingerprint(x, is_train, fc_dropout, seed): return out_encoder + def attn_readout_v3(readout, attn_window, attn_heads, page_features, seed): # input: [n_days, batch, readout_depth] # [n_days, batch, readout_depth] -> [batch(readout_depth), width=n_days, channels=batch] @@ -158,7 +203,7 @@ def attn_readout_v3(readout, attn_window, attn_heads, page_features, seed): # [batch(readout_depth), width, channels] -> [batch, height=1, width, channels] inp = readout[:, tf.newaxis, :, :] - # attn_window = train_window - predict_window + 1 + # attn_window = history_window_size - horizon_window_size + 1 # [batch, attn_window * n_heads] filter_logits = tf.layers.dense(page_features, attn_window * attn_heads, name="attn_focus", kernel_initializer=default_init(seed) @@ -176,15 +221,15 @@ def attn_readout_v3(readout, attn_window, attn_heads, page_features, seed): # [width(attn_window), channels(batch), n_heads] -> [height(1), width(attn_window), 
channels(batch), multiplier(n_heads)] attn_filter = attns_max[tf.newaxis, :, :, :] - # [batch(readout_depth), height=1, width=n_days, channels=batch] -> [batch(readout_depth), height=1, width=predict_window, channels=batch*n_heads] + # [batch(readout_depth), height=1, width=n_days, channels=batch] -> [batch(readout_depth), height=1, width=horizon_window_size, channels=batch*n_heads] averaged = tf.nn.depthwise_conv2d_native(inp, attn_filter, [1, 1, 1, 1], 'VALID') - # [batch, height=1, width=predict_window, channels=readout_depth*n_neads] -> [batch(depth), predict_window, batch*n_heads] + # [batch, height=1, width=horizon_window_size, channels=readout_depth*n_neads] -> [batch(depth), horizon_window_size, batch*n_heads] attn_features = tf.squeeze(averaged, 1) - # [batch(depth), predict_window, batch*n_heads] -> [batch*n_heads, predict_window, depth] + # [batch(depth), horizon_window_size, batch*n_heads] -> [batch*n_heads, horizon_window_size, depth] attn_features = tf.transpose(attn_features, [2, 1, 0]) - # [batch * n_heads, predict_window, depth] -> n_heads * [batch, predict_window, depth] + # [batch * n_heads, horizon_window_size, depth] -> n_heads * [batch, horizon_window_size, depth] heads = [attn_features[head_no::attn_heads] for head_no in range(attn_heads)] - # n_heads * [batch, predict_window, depth] -> [batch, predict_window, depth*n_heads] + # n_heads * [batch, horizon_window_size, depth] -> [batch, horizon_window_size, depth*n_heads] result = tf.concat(heads, axis=-1) # attn_diag = tf.unstack(attns_max, axis=-1) return result, None @@ -200,11 +245,14 @@ def calc_smape_rounded(true, predicted, weights): """ n_valid = tf.reduce_sum(weights) true_o = tf.round(tf.expm1(true)) - pred_o = tf.maximum(tf.round(tf.expm1(predicted)), 0.0) + pred_o = tf.maximum(tf.round(tf.expm1(predicted)), 0.0) #!!!!!!! for us we could even clip at 1, since 0 means measurement was missing summ = tf.abs(true_o) + tf.abs(pred_o) zeros = summ < 0.01 raw_smape = tf.abs(pred_o - true_o) / summ * 2.0 smape = tf.where(zeros, tf.zeros_like(summ, dtype=tf.float32), raw_smape) + #!!!!!!!!!!! since summ is sum of absolute values of 2 rounded things, is only < .01 if is exactly = 0. For our data, this should NEVER happen, would mean unmeasured NAN, so actually this is exactly the SMAPE we want + +# smape = tf.Print(smape, ['pred_o',tf.shape(pred_o),pred_o, 'pred_o not round clip',tf.expm1(predicted), 'true_o',tf.shape(true_o),true_o, 'smape', smape, 'raw_smape', raw_smape]) return tf.reduce_sum(smape * weights) / n_valid @@ -231,14 +279,35 @@ def decode_predictions(decoder_readout, inp: InputPipe): :param inp: Input tensors :return: """ - # [n_days, batch] -> [batch, n_days] + + + # [n_days, batch, quantiles] -> [quantiles, batch, n_days] batch_readout = tf.transpose(decoder_readout) +# print('inp.norm_std',inp.norm_std) batch_std = tf.expand_dims(inp.norm_std, -1) +# print('batch_std',batch_std) batch_mean = tf.expand_dims(inp.norm_mean, -1) - return batch_readout * batch_std + batch_mean + + ret = batch_readout * batch_std + batch_mean +# ret = tf.Print(ret, ['ret:',tf.shape(ret),ret, 'batch_readout:',batch_readout, 'batch_std:',batch_std, 'batch_mean',batch_mean]) + return ret -def calc_loss(predictions, true_y, additional_mask=None): + +#!!!!!!! 
would be good to run on logged (NOT the exp) +def quantile_loss(true, predicted, weights, quantile): + """ + When doing quantile regression, get the pinball loss on each quantile + """ + n_valid = tf.reduce_sum(weights) + true_o = tf.round(tf.expm1(true)) + pred_o = tf.maximum(tf.round(tf.expm1(predicted)), 0.0) + diff = tf.subtract(true, predicted) + pinball = tf.reduce_mean(tf.maximum(quantile*diff, (quantile-1.)*diff)) + return tf.reduce_sum(pinball * weights) / n_valid + + +def calc_loss(hparams, predictions, true_y, additional_mask=None):#, DO_QUANTILES=False, QUANTILES=[]): """ Calculates losses, ignoring NaN true values (assigning zero loss to them) :param predictions: Predicted values @@ -255,13 +324,62 @@ def calc_loss(predictions, true_y, additional_mask=None): if additional_mask is not None: weights = weights * tf.expand_dims(additional_mask, axis=0) - mae_loss = tf.losses.absolute_difference(labels=true_y, predictions=predictions, weights=weights) - return mae_loss, smape_loss(true_y, predictions, weights), calc_smape_rounded(true_y, predictions, - weights), tf.size(true_y) + +# FRACTION = 1#.01 +# if DO_QUANTILES: +# quantile_losses = [] +# for nn, q in enumerate(QUANTILES): +# quantile_losses.extend([quantile_loss(true_y, predictions[nn], weights, q)]) +# #For the total quantile loss, use the mean loss over all quantiles +# ave_quantile_loss = FRACTION*tf.reduce_mean(quantile_losses) +# mae_loss = tf.losses.absolute_difference(labels=true_y, predictions=predictions[0], weights=weights) +# csr = calc_smape_rounded(true_y, predictions[0], weights) +# sl = smape_loss(true_y, predictions[0], weights) +# else: +# ave_quantile_loss = 0. +# quantile_losses = [] +# mae_loss = tf.losses.absolute_difference(labels=true_y, predictions=predictions, weights=weights) +# csr = calc_smape_rounded(true_y, predictions, weights) +# sl = smape_loss(true_y, predictions, weights) + + FRACTION = 1#.01 + if hparams.DO_QUANTILES: + quantile_losses = [] + for nn, q in enumerate(hparams.QUANTILES): + quantile_losses.extend([quantile_loss(true_y, predictions[nn], weights, q)]) + #For the total quantile loss, use the mean loss over all quantiles + ave_quantile_loss = FRACTION*tf.reduce_mean(quantile_losses) + else: + ave_quantile_loss = tf.constant(0.,dtype=tf.float32) + quantile_losses = [] + + if hparams.DO_MLP_POSTPROCESS or hparams.MLP_DIRECT_DECODER: + mae_loss = tf.losses.absolute_difference(labels=true_y, predictions=predictions[0], weights=weights) + csr = calc_smape_rounded(true_y, predictions[0], weights) + sl = smape_loss(true_y, predictions[0], weights) + else: + mae_loss = tf.losses.absolute_difference(labels=true_y, predictions=predictions, weights=weights) + csr = calc_smape_rounded(true_y, predictions, weights) + sl = smape_loss(true_y, predictions, weights) + + return mae_loss, sl, csr, tf.size(true_y), ave_quantile_loss, quantile_losses def make_train_op(loss, ema_decay=None, prefix=None): - optimizer = COCOB() +# OPTIMIZER=#'SGDN-HD',#'COCOB',#'ADAM',#'SGDN-HD',#'ADAM-HD' +# if OPTIMIZER=='COCOB': +# optimizer = COCOB() +# if OPTIMIZER=='ADAM': +# optimizer = tf.train.AdamOptimizer() +# if OPTIMIZER=='SGD': +# optimizer = tf.train.GradientDescentOptimizer(1e-9) +# if OPTIMIZER=='SGDN-HD': +# optimizer = MomentumSGDHDOptimizer() +# if OPTIMIZER=='ADAM-HD': +# optimizer = AdamHDOptimizer() +# optimizer=MomentumSGDHDOptimizer(alpha_0=1e-1)#bad SMAPEs for various orders of magnitude alpha_0 + optimizer = tf.train.AdamOptimizer() + glob_step = tf.train.get_global_step() # Add regularization 
losses @@ -291,7 +409,7 @@ def make_train_op(loss, ema_decay=None, prefix=None): return training_op, glob_norm, ema -def convert_cudnn_state_v2(h_state, hparams, seed, c_state=None, dropout=1.0): +def convert_cudnn_state_v3(h_state, hparams, seed, c_state=None, dropout=1.0): """ Converts RNN state tensor from cuDNN representation to TF RNNCell compatible representation. :param h_state: tensor [num_layers, batch_size, depth] @@ -313,13 +431,28 @@ def wrap_dropout(structure): # encoder_layers > decoder_layers: get outputs of upper encoder layers # encoder_layers < decoder_layers: feed encoder outputs to lower decoder layers, feed zeros to top layers h_layers = tf.unstack(h_state) + +# h_layers = tf.Print(h_layers,['h_layers',h_layers]) + + #Regardless of relative number of layers in encoder vs. decoder, simple approach is + #use topmost encoder layer hidden state as the (fixed) context + encoded_representation = wrap_dropout(h_layers[-1]) + +# encoded_representation = tf.Print(encoded_representation,['encoded_representation',encoded_representation]) + + #above uses a different random dropout for the "encoded representaiton" than the actual top level output. + #This is possibly a good regularization thing since we dont expect the final hidden state to be perfect summar/context vector, + #so a little randomness is probably good here. + #vs. below using topmost level exactly same dropout mask: _[-1] if hparams.encoder_rnn_layers >= hparams.decoder_rnn_layers: - return squeeze(wrap_dropout(h_layers[hparams.encoder_rnn_layers - hparams.decoder_rnn_layers:])) + _ = wrap_dropout(h_layers[hparams.encoder_rnn_layers - hparams.decoder_rnn_layers:]) + return squeeze(_), _[-1] #Use the topmost hidden state of the encoder as the encoded representaiton +# return squeeze(_), encoded_representation #Use the topmost hidden state of the encoder as the encoded representaiton else: lower_inputs = wrap_dropout(h_layers) upper_inputs = [tf.zeros_like(h_layers[0]) for _ in range(hparams.decoder_rnn_layers - hparams.encoder_rnn_layers)] - return squeeze(lower_inputs + upper_inputs) + return squeeze(lower_inputs + upper_inputs), lower_inputs[-1] #Use the topmost hidden state of the encoder as the encoded representaiton def rnn_stability_loss(rnn_output, beta): @@ -359,45 +492,135 @@ def __init__(self, inp: InputPipe, hparams, is_train, seed, graph_prefix=None, a :param seed: :param graph_prefix: Subgraph prefix for multi-model graph :param asgd_decay: Decay for SGD averaging - :param loss_mask: Additional mask for losses calculation (one value for each prediction day), shape=[predict_window] + :param loss_mask: Additional mask for losses calculation (one value for each prediction day), shape=[horizon_window_size] """ self.is_train = is_train self.inp = inp self.hparams = hparams self.seed = seed self.inp = inp + + self.DO_MLP_POSTPROCESS = self.hparams.DO_MLP_POSTPROCESS + self.lookback_K_actual = min(hparams.LOOKBACK_K, hparams.history_window_size_minmax[0]) + print('self.lookback_K_actual',self.lookback_K_actual) + self.MLP_DIRECT_DECODER = self.hparams.MLP_DIRECT_DECODER + self.DO_QUANTILES = self.hparams.DO_QUANTILES +# self.LAMBDA = self.hparams.LAMBDA + assert not (self.MLP_DIRECT_DECODER and self.DO_MLP_POSTPROCESS), 'Cannot do both DO_MLP_POSTPROCESS and MLP_DIRECT_DECODER.\n Choose exactly 1, or neither.' 
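+        #Worked example of the lookback clamp above (numbers illustrative):
+        #with hparams.LOOKBACK_K = 10 and hparams.history_window_size_minmax = (7, 283),
+        #lookback_K_actual = min(10, 7) = 7, i.e. the decoder never feeds back more previous
+        #predictions than the shortest history window that can be sampled, so the
+        #inp.norm_x[:, -lookback_K_actual:] slice used below always stays inside the encoder input.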
+ + + + +# with tf.Graph().as_default(): +# config = tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True)) +# with tf.Session(config=config) as sess: +# result = sess.run([self.inp.history_window_size, self.inp.horizon_window_size]) +# print(result) + print('Model.py history, horizon : ', self.inp.history_window_size, self.inp.horizon_window_size) + + + +# inp.time_x = tf.Print(inp.time_x, ['where NANs in inp.time_x :', tf.where(tf.is_nan(inp.time_x))]) +# inp.time_x = tf.Print(inp.time_x, ['inp.time_x :', tf.shape(inp.time_x),inp.time_x]) + + +# inp.time_x = tf.check_numerics(inp.time_x,'inp.time_x has NANs') +# inp.time_y = tf.Print(inp.time_y, ['inp.time_y :', tf.shape(inp.time_y),inp.time_y]) +# inp.norm_x = tf.Print(inp.norm_x, ['inp.norm_x :', tf.shape(inp.norm_x),inp.norm_x]) +# inp.time_x = tf.Print(inp.time_x, ['inp.time_x[:,:,0] :', tf.shape(inp.time_x[:,:,0]),inp.time_x[:,:,0]]) +# gggg = tf.reduce_mean(tf.cast(tf.equal(inp.norm_x, inp.time_x[:,:,0]), tf.int32)) +# inp.time_x = tf.Print(inp.time_x, ['mean tf.equal(inp.norm_x, inp.time_x[:,:,0]) should be 1 :', gggg]) + encoder_output, h_state, c_state = make_encoder(inp.time_x, inp.encoder_features_depth, is_train, hparams, seed, transpose_output=False) + +# h_state = tf.Print(h_state,['h_state',h_state,'encoder_output',encoder_output,'inp.time_x',inp.time_x]) + + # Encoder activation losses - enc_stab_loss = rnn_stability_loss(encoder_output, hparams.encoder_stability_loss / inp.train_window) - enc_activation_loss = rnn_activation_loss(encoder_output, hparams.encoder_activation_loss / inp.train_window) - +# enc_stab_loss = rnn_stability_loss(encoder_output, hparams.encoder_stability_loss / tf.cast(inp.history_window_size, tf.float32)) #!!!!!!!!when random history-horizons, need to cast dtypes here +# enc_activation_loss = rnn_activation_loss(encoder_output, hparams.encoder_activation_loss / tf.cast(inp.history_window_size, tf.float32)) #!!!!!! + enc_stab_loss = rnn_stability_loss(encoder_output, hparams.encoder_stability_loss / inp.history_window_size) #!!!!!!!!when random history-horizons, need to cast dtypes here + enc_activation_loss = rnn_activation_loss(encoder_output, hparams.encoder_activation_loss / inp.history_window_size) #!!!!!! 
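+        #As the commented lines above note: once history_window_size is a randomly drawn tensor
+        #rather than a plain Python int, it is int32 and has to be cast to float32 before this
+        #division, since TF will not divide a float loss coefficient by an int32 tensor.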
+ + # Convert state from cuDNN representation to TF RNNCell-compatible representation - encoder_state = convert_cudnn_state_v2(h_state, hparams, c_state, + encoder_state, summary_z = convert_cudnn_state_v3(h_state, hparams, c_state, +# encoder_state, summary_z = convert_cudnn_state_v3(h_state, hparams, seed, c_state, dropout=hparams.gate_dropout if is_train else 1.0) - +# encoder_state = tf.Print(encoder_state, ['encoder_state',tf.shape(encoder_state),encoder_state]) +# summary_z = tf.Print(summary_z, ['summary_z',tf.shape(summary_z),summary_z]) + # Attention calculations # Compress encoder outputs enc_readout = compressed_readout(encoder_output, hparams, dropout=hparams.encoder_readout_dropout if is_train else 1.0, seed=seed) + # Calculate fingerprint from input features - fingerprint_inp = tf.concat([inp.lagged_x, tf.expand_dims(inp.norm_x, -1)], axis=-1) - fingerprint = make_fingerprint(fingerprint_inp, is_train, hparams.fingerprint_fc_dropout, seed) - # Calculate attention vector - attn_features, attn_weights = attn_readout_v3(enc_readout, inp.attn_window, hparams.attention_heads, + if hparams.use_attn: + fingerprint_inp = tf.concat([inp.lagged_x, tf.expand_dims(inp.norm_x, -1)], axis=-1) + fingerprint = make_fingerprint(fingerprint_inp, is_train, hparams.fingerprint_fc_dropout, seed) + # Calculate attention vector + attn_features, attn_weights = attn_readout_v3(enc_readout, inp.attn_window, hparams.attention_heads, fingerprint, seed=seed) - # Run decoder - decoder_targets, decoder_outputs = self.decoder(encoder_state, - attn_features if hparams.use_attn else None, - inp.time_y, inp.norm_x[:, -1]) + print('building decoder') + + # Run (regular RNN-based) decoder + #... = decoder(encoder_state, attn_features, prediction_inputs, previous_y) + if not self.MLP_DIRECT_DECODER: + decoder_targets, decoder_outputs = self.decoder(encoder_state, + attn_features if hparams.use_attn else None, + summary_z if hparams.RECURSIVE_W_ENCODER_CONTEXT else None, + inp.time_y, inp.norm_x[:, -self.lookback_K_actual:]) #in decoder function def: inp.time_y = "prediction_inputs"; inp.norm_x[:, -1] = "previous_y" (i.e. the final x normalizd)) + + #Vs. if doing the direct MLP decoder (while loop of MLP's instead of while loop of RNN cells): + elif self.MLP_DIRECT_DECODER: + decoder_targets, decoder_outputs = self.direct_decoder(inp.time_y, # + summary_z, + self.hparams.LOCAL_CONTEXT_SIZE, + self.hparams.GLOBAL_CONTEXT_SIZE, + None) + + #If doing the MLP postprocessing step to adjust predictions: + #Technically, you could do this in addition to MLP_DIRECT_DECODER if direct_decoder was outputting point estimates only. + #Or could change postprocess to also refine quantiles. + #But this is overkill. 
Only allowed to use 0 or 1 of [MLP_DIRECT_DECODER, DO_MLP_POSTPROCESS] +# decoder_targets = tf.Print(decoder_targets,['decoder_targets BEFORE',tf.shape(decoder_targets),decoder_targets,'decoder_outputs',tf.shape(decoder_outputs),decoder_outputs]) + if self.DO_MLP_POSTPROCESS: + #Could 0 pad, but for now instead just ensure kernelsize and offset are such that + #thre are no encoder-side issues with kernel extending beyond history (relevant for very short histories) + #Offset can stay the same (represents number of positions to RIGHT of given position), + #but adjust kernel size to fit: + self.offset = hparams.MLP_POSTPROCESS__KERNEL_OFFSET + leftside = hparams.MLP_POSTPROCESS__KERNEL_SIZE -1 -self.offset + self.actual_kernel_size = min(leftside, hparams.history_window_size_minmax[0]) +1 +self.offset + decoder_targets = self.MLP_postprocess(decoder_targets, inp.time_y, + inp.time_x, + summary_z if hparams.RECURSIVE_W_ENCODER_CONTEXT else None, + self.actual_kernel_size, self.offset, None) +# decoder_targets = tf.Print(decoder_targets,['decoder_targets AFTER postprocess',decoder_targets,'decoder_outputs',decoder_outputs]) +# decoder_targets = tf.Print(decoder_targets,['encoder_state',encoder_state,'inp.time_y',inp.time_y,'inp.norm_x',inp.norm_x]) +# decoder_targets = tf.Print(decoder_targets,['decoder_targets',decoder_targets,'decoder_outputs',decoder_outputs]) + + + # Decoder activation losses - dec_stab_loss = rnn_stability_loss(decoder_outputs, hparams.decoder_stability_loss / inp.predict_window) - dec_activation_loss = rnn_activation_loss(decoder_outputs, hparams.decoder_activation_loss / inp.predict_window) + #If doing the Direct MLP decoder, there is no RNN decoder output, so these losses are not applicable, set to 0. + dec_stab_loss = rnn_stability_loss(decoder_outputs, hparams.decoder_stability_loss / inp.horizon_window_size) if not self.MLP_DIRECT_DECODER else tf.constant(0.) + dec_activation_loss = rnn_activation_loss(decoder_outputs, hparams.decoder_activation_loss / inp.horizon_window_size) if not self.MLP_DIRECT_DECODER else tf.constant(0.) 
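+        #For reference, the pinball loss that quantile_loss() aggregates into ave_quantile_loss
+        #further below can be sketched outside the graph as (plain NumPy, names illustrative,
+        #validity weights omitted for brevity):
+        #    import numpy as np
+        #    def pinball(y_true, y_pred, q):
+        #        diff = y_true - y_pred
+        #        return np.mean(np.maximum(q * diff, (q - 1.0) * diff))
+        #Under-prediction costs q per unit and over-prediction costs (1 - q) per unit,
+        #so e.g. q = 0.9 pulls the prediction up towards the 90th percentile of the target.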
+ print('dec_stab_loss',dec_stab_loss) + print('dec_activation_loss',dec_activation_loss) # Get final denormalized predictions self.predictions = decode_predictions(decoder_targets, inp) +# vv = decode_predictions(decoder_targets, inp) +# vv = tf.Print(vv, ['decode_predictions',vv,tf.shape(vv)]) +# self.predictions = vv +# print('self.predictions (still log1p(counts))') + print('self.predictions',self.predictions) + # Calculate losses and build training op if inp.mode == ModelMode.PREDICT: @@ -412,11 +635,22 @@ def __init__(self, inp: InputPipe, hparams, is_train, seed, graph_prefix=None, a ema_vars = variables self.ema.apply(ema_vars) else: - self.mae, smape_loss, self.smape, self.loss_item_count = calc_loss(self.predictions, inp.true_y, + self.mae, smape_loss, self.smape, self.loss_item_count, self.ave_quantile_loss, self.quantile_losses = calc_loss(hparams, self.predictions, inp.true_y, additional_mask=loss_mask) + #DO_QUANTILES=self.hparams.DO_QUANTILES, + #QUANTILES=self.hparams.QUANTILES) + #from calc_loss: + #mae_loss, smape_loss(true_y, predictions, weights), calc_smape_rounded(true_y, predictions, weights), tf.size(true_y) + if is_train: # Sum all losses - total_loss = smape_loss + enc_stab_loss + dec_stab_loss + enc_activation_loss + dec_activation_loss + total_loss = enc_stab_loss + dec_stab_loss + enc_activation_loss + dec_activation_loss + #For quantile regression: just include a term for sum over all quantile losses [weights every quantile equally] + if self.hparams.DO_QUANTILES: + #total_loss += self.LAMBDA * self.ave_quantile_loss + total_loss += self.ave_quantile_loss + else: + total_loss += smape_loss self.train_op, self.glob_norm, self.ema = make_train_op(total_loss, asgd_decay, prefix=graph_prefix) @@ -424,25 +658,32 @@ def __init__(self, inp: InputPipe, hparams, is_train, seed, graph_prefix=None, a def default_init(self, seed_add=0): return default_init(self.seed + seed_add) - def decoder(self, encoder_state, attn_features, prediction_inputs, previous_y): + def decoder(self, encoder_state, attn_features, summary_z, prediction_inputs, previous_y): """ :param encoder_state: shape [batch_size, encoder_rnn_depth] :param prediction_inputs: features for prediction days, tensor[batch_size, time, input_depth] - :param previous_y: Last day pageviews, shape [batch_size] - :param attn_features: Additional features from attention layer, shape [batch, predict_window, readout_depth*n_heads] + :param previous_y: Last day pageviews, shape [batch_size, self.lookback_K_actual] + :param attn_features: Additional features from attention layer, shape [batch, horizon_window_size, readout_depth*n_heads] :return: decoder rnn output """ hparams = self.hparams def build_cell(idx): with tf.variable_scope('decoder_cell', initializer=self.default_init(idx)): - cell = rnn.GRUBlockCell(self.hparams.rnn_depth) + print('self.hparams.rnn_depth',self.hparams.rnn_depth) + + cell = rnn.GRUBlockCell(self.hparams.rnn_depth) #GRUBlockCellV2 same butnewer version just variables named differently? has_dropout = hparams.decoder_input_dropout[idx] < 1 \ or hparams.decoder_state_dropout[idx] < 1 or hparams.decoder_output_dropout[idx] < 1 + #context size alone may be as big as decoder state size?? Then input-> hidden would be a down projection... + #so maybe do a projection down, on the encoder side first [e.g. encoder output??] then better here... 
if self.is_train and has_dropout: attn_depth = attn_features.shape[-1].value if attn_features is not None else 0 - input_size = attn_depth + prediction_inputs.shape[-1].value + 1 if idx == 0 else self.hparams.rnn_depth + context_depth = summary_z.shape[-1].value if self.hparams.RECURSIVE_W_ENCODER_CONTEXT else 0 #Should just be the encoder RNN depth + print('attn_depth',attn_depth, 'context_depth',context_depth) + input_size = attn_depth + context_depth + prediction_inputs.shape[-1].value + self.lookback_K_actual if idx == 0 else self.hparams.rnn_depth + input_size = tf.Print(input_size, ['attn_depth',tf.shape(attn_depth),attn_depth, 'context_depth',tf.shape(context_depth),context_depth, 'input_size',tf.shape(input_size),input_size])#!!!!!!!!!! cell = rnn.DropoutWrapper(cell, dtype=tf.float32, input_size=input_size, variational_recurrent=hparams.decoder_variational_dropout[idx], input_keep_prob=hparams.decoder_input_dropout[idx], @@ -456,9 +697,27 @@ def build_cell(idx): else: cell = build_cell(0) + + #!!!!!! on our data, when doing side_split, encoder_state is fine [no NANs], + #but when doing walk_forward, some rows (instances) are all NANs (and the others all defined), + #then eventually every instance becomes NANs +# N_nans = tf.reduce_sum(tf.cast(tf.is_nan(encoder_state), tf.float32)) +# tt = tf.cast(tf.is_nan(encoder_state), tf.float32) +# ff = tf.reduce_sum(tt,axis=1) +# ggg = tf.cast(tf.equal(ff, ff*0.+267.), tf.float32) +# N_all_NAN_encoder_states = tf.reduce_sum(ggg) +# total = tf.reduce_prod(tf.shape(encoder_state)) +# encoder_state = tf.Print(encoder_state,['encoder_state', tf.shape(encoder_state), encoder_state, 'N_nans', N_nans, 'total', total, 'N_all_NAN_encoder_states', N_all_NAN_encoder_states]) + + + nest.assert_same_structure(encoder_state, cell.state_size) - predict_days = self.inp.predict_window - assert prediction_inputs.shape[1] == predict_days +# predict_timesteps = 7777#tf.reshape(self.inp.horizon_window_size, []) + predict_timesteps = self.inp.horizon_window_size +# print('predict_timesteps',predict_timesteps,type(predict_timesteps)) + #When random size steps, cannot know dims in advance to do below assert: + assert prediction_inputs.shape[1] == predict_timesteps #!!!!!!!quantiles +# print('predict_timesteps',predict_timesteps,'prediction_inputs.shape[1]',prediction_inputs.shape) # [batch_size, time, input_depth] -> [time, batch_size, input_depth] inputs_by_time = tf.transpose(prediction_inputs, [1, 0, 2]) @@ -467,59 +726,416 @@ def build_cell(idx): return_raw_outputs = self.hparams.decoder_stability_loss > 0.0 or self.hparams.decoder_activation_loss > 0.0 # Stop condition for decoding loop - def cond_fn(time, prev_output, prev_state, array_targets: tf.TensorArray, array_outputs: tf.TensorArray): - return time < predict_days + def cond_fn(timestep, prev_output, prev_state, array_targets: tf.TensorArray, array_outputs: tf.TensorArray): + return timestep < predict_timesteps #If doing k2-step lookahead prediction for k2>1, possibly want to + #adjust condition to do appropriate n steps > predict_timesteps... and then combine predictions for those steps to get single prediction, + #e.g. by exponential weighting backward in time from this step. # FC projecting layer to get single predicted value from RNN output def project_output(tensor): - return tf.layers.dense(tensor, 1, name='decoder_output_proj', kernel_initializer=self.default_init()) + N_pctls=1 #!!!!!!!!!! 
quantiles + return tf.layers.dense(tensor, N_pctls, name='decoder_output_proj', kernel_initializer=self.default_init()) - def loop_fn(time, prev_output, prev_state, array_targets: tf.TensorArray, array_outputs: tf.TensorArray): + def loop_fn(timestep, prev_output, prev_state, array_targets: tf.TensorArray, array_outputs: tf.TensorArray): """ Main decoder loop - :param time: Day number - :param prev_output: Output(prediction) from previous step + :param timestep: timestep number + :param prev_output: Output(prediction) from previous step --> from previous K steps: self.lookback_K_actual :param prev_state: RNN state tensor from previous step :param array_targets: Predictions, each step will append new value to this array :param array_outputs: Raw RNN outputs (for regularization losses) :return: """ # RNN inputs for current step - features = inputs_by_time[time] + features = inputs_by_time[timestep] +# print('features',features) +# print('previous_y',previous_y) - # [batch, predict_window, readout_depth * n_heads] -> [batch, readout_depth * n_heads] + # [batch, horizon_window_size, readout_depth * n_heads] -> [batch, readout_depth * n_heads] if attn_features is not None: # [batch_size, 1] + [batch_size, input_depth] - attn = attn_features[:, time, :] + attn = attn_features[:, timestep, :] # Append previous predicted value + attention vector to input features next_input = tf.concat([prev_output, features, attn], axis=1) + else: # Append previous predicted value to input features next_input = tf.concat([prev_output, features], axis=1) - + #If using more of a typical encoder-decoder, also have encoder context each time: + if self.hparams.RECURSIVE_W_ENCODER_CONTEXT: + next_input = tf.concat([next_input, summary_z], axis=1) #!!!!!!!!summary_z[-1] + + print('next_input',next_input) + print('prev_state',prev_state) # Run RNN cell output, state = cell(next_input, prev_state) # Make prediction from RNN outputs - projected_output = project_output(output) + projected_output = project_output(output) #!!!!!!!!!! quantiles +# projected_output = tf.Print(projected_output, ['timestep',timestep,'projected_output',projected_output,tf.shape(projected_output),'output',output,tf.shape(output),'state',state,tf.shape(state) ,'prev_output',prev_output,tf.shape(prev_output) ,'features',features,tf.shape(features),features[1,:18]]) + # Append step results to the buffer arrays if return_raw_outputs: - array_outputs = array_outputs.write(time, output) - array_targets = array_targets.write(time, projected_output) - # Increment time and return - return time + 1, projected_output, state, array_targets, array_outputs + array_outputs = array_outputs.write(timestep, output) + array_targets = array_targets.write(timestep, projected_output) + + #Update prev_output + #(delete oldest left, append rightmost) + if self.lookback_K_actual > 1: + prev_output = prev_output[:,1:] #All examples in batch, exclude oldest output [leftmost oldest, rightmost most recent] +# print('prev_output',prev_output) +# print('projected_output',projected_output) + updated_outputs = tf.concat([prev_output,projected_output],axis=1) +# print('updated_outputs',updated_outputs) + elif self.lookback_K_actual==1: + updated_outputs = prev_output + + # Increment timestep and return + return timestep + 1, updated_outputs, state, array_targets, array_outputs #!!!!!! 
quantiles: projected_output will be diff dims # Initial values for loop - loop_init = [tf.constant(0, dtype=tf.int32), - tf.expand_dims(previous_y, -1), - encoder_state, - tf.TensorArray(dtype=tf.float32, size=predict_days), - tf.TensorArray(dtype=tf.float32, size=predict_days) if return_raw_outputs else tf.constant(0)] + loop_init = [tf.constant(0, dtype=tf.int32), #timestep +# previous_y if self.lookback_K_actual > 1 else tf.expand_dims(previous_y, -1), #prev_output + previous_y, #prev_output + encoder_state, #prev_state + tf.TensorArray(dtype=tf.float32, size=predict_timesteps), #array_targets + tf.TensorArray(dtype=tf.float32, size=predict_timesteps) if return_raw_outputs else tf.constant(0)] #array_outputs #!!!!!!! size= ... x N_pctls # Run the loop - _, _, _, targets_ta, outputs_ta = tf.while_loop(cond_fn, loop_fn, loop_init) - + _timestep, _projected_output, _state, targets_ta, outputs_ta = tf.while_loop(cond_fn, loop_fn, loop_init) + + #!!!!!!!!if do the decoder as fixed max predict window, then here, when in train mode, slice to get only those really evaluated + + + + print('decoder') +# print('_timestep',_timestep) +# _timestep = debug_tensor_print(_timestep) +# print('_projected_output',_projected_output) +# _projected_output = debug_tensor_print(_projected_output) +# print('_state',_state) +# _state = debug_tensor_print(_state) + + +# targets_ta_tensor = tf.convert_to_tensor(targets_ta) +# targets_ta_tensor = tf.Print(targets_ta_tensor,[targets_ta_tensor]) +# print('targets_ta',targets_ta) +# print('outputs_ta',outputs_ta) + + # Get final tensors from buffer arrays +# targets_ta = tf.Print(targets_ta,['targets_ta',targets_ta,tf.shape(targets_ta)]) +# print('targets_ta',targets_ta) targets = targets_ta.stack() # [time, batch_size, 1] -> [time, batch_size] targets = tf.squeeze(targets, axis=-1) +# print('targets after squeeze',targets) +# targets = tf.Print(targets,['targets after squeeze',targets,tf.shape(targets)]) raw_outputs = outputs_ta.stack() if return_raw_outputs else None + +# print('targets',targets) + #!!!!!!!!!!! why targets becomes NANs ????? +# why targets NANs? +# targets = debug_tensor_print(targets) #63 x 245, except for first 2 prints for each new iteration it is 63 x 64 +# raw_outputs = debug_tensor_print(raw_outputs) #is 63 x 64 x 267 + +# print_list = ['_timestep', _timestep.name, tf.shape(_timestep), _timestep] +# raw_outputs = tf.Print(raw_outputs, print_list) + return targets, raw_outputs + + + + def MLP_postprocess(self, decoder_targets, decoder_features, time_x, summary_z, cropped_kernel_size, offset, seed):#kernel_size, kernel_offset, seed): + """ + As a postprocess decoder step: + + After the decoder has output predictions for each decoder timestep + (in standard mode this is a vector of predictions: one per timestep + vs. if doing quantile regression, is a matrix timesteps x quantiles...) + + Refine those predictions by taking into account the neighboring predictions. + Ideally, the RNN/LSTM/GRU state would contain all necessary information about state + to make good predictions. In reality, might be asking too much, could be useful + to do a second pass over the outputs to improve predictions. + + E.g. if encoding shock events: binary 0,1 with 1 on change day: + if there is no pre-event shoulder, no information on Monday takes advantage + of known change about to ocur on Tuesday. This postprocessor will help with that. 
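+        For example, with the settings shown just below (KERNEL_SIZE=15, KERNEL_OFFSET=7),
+        the refinement of the prediction at decoder timestep t looks at the raw predictions and
+        features at timesteps t-7, ..., t+7: seven positions on each side of t plus t itself.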
+
+
+        hparams.QUANTILES = [None],#[.20, .30, .40, .50, .75, .90]
+        hparams.MLP_POSTPROCESS__KERNEL_SIZE=15,
+        hparams.MLP_POSTPROCESS__KERNEL_OFFSET=7,
+
+
+
+        INPUTS:
+        decoder_targets - [HORIZON x BATCH] (for now ignoring quantiles: just a single value per decoder step)
+        decoder_features - decoder side features for all horizon timesteps [BATCH x HORIZON x FEATURES] (but gets transposed later to make this easier)
+        time_x - encoder side features and ground truth for all history timesteps [BATCH x HISTORY x FEATURES+1] (but gets transposed later to make this easier)
+        summary_z - encoded context vector; a simple way of using the same vector as context for every decoder step [BATCH x RNNSTATEDEPTH]. Will be None if not using the encoder context option
+
+
+        kernel_size - int
+            The size, in #timesteps, of the MLP postprocess kernel.
+            For a given decoder timestep, a window containing that timestep is used to
+            hard-select which nearby timesteps are allowed to contribute to the refinement
+            for that timestep. I.e. a bigger kernel means looking at more nearby RNN outputs.
+
+        kernel_offset - int
+            The offset, in #timesteps, of the MLP postprocess kernel from the given timestep under consideration.
+            An offset of 0 means the kernel has its rightmost index exactly on the given timestep;
+            an offset of 1 means the rightmost timestep included in the postprocess window is
+            the timestep exactly 1 position to the right of the timestep under consideration, ...,
+            ...
+            K-1 is when the kernel window is translated as far as possible to the right,
+            i.e. it has the given timestep as the LEFTmost position and K-1 positions to the right of the timestep in question.
+            In other words, the offset tells you how many positions to the right of the given timestep are included in the kernel window.
+
+
+        For this mini feedforward net, just use 1 hidden layer with a nonlinearity, and for the number of nodes there, just make it half the number of input nodes to this MLP.
+
+        Alternative: you could also choose to simplify things a bit on the left side (early in time),
+        where there is overlap with the encoder side and even potentially with the region before the encoder starts (which needs padding).
+        To simplify, you could choose not to refine those earliest outputs, and only do refinement when the kernel
+        is fully inside the decoder area. The justification would be that refinement of those early predictions is less needed,
+        since they are closer to the forecast creation time, so they should suffer less from the recursion problem and hopefully be OK already.
+        """
+
+
+        #Get dimensions of things based on inputs, features, options:
+        #N1 is the number of neurons in the input layer: (KERNELSIZE x FEATURES) + KERNELSIZE (+ENCODER_DEPTH if encoder_context)
+        context_depth = summary_z.shape[-1].value if self.hparams.RECURSIVE_W_ENCODER_CONTEXT else 0 #Should just be the encoder RNN depth
+        feature_depth = decoder_features.shape[-1].value
+        batchsize = tf.shape(decoder_features)[0]
+        N1 = cropped_kernel_size*(feature_depth+1) + context_depth + 2 #+2 is to also use two timestep index features, since at the beginning/end of the iterations the kernel can overlap the 0-padded region, and those input nodes would otherwise see a very different distribution of nonzero values, so signal this.
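+        #Worked sizing example (numbers illustrative): with KERNEL_SIZE=15, KERNEL_OFFSET=7 and a
+        #minimum history of at least 7 timesteps, actual_kernel_size computed in __init__ stays
+        #min(15-1-7, min_history) + 1 + 7 = 15, so with F decoder features the flattened window
+        #contributes 15*(F+1) inputs to N1, plus the encoder context depth if used, plus the 2
+        #position features added in loop_fn below.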
+ N2 = int(min(max(.5*N1,100),400)) #Use between 100-400 neurons in layer 2, using approximately + print('MLP Post-processing N1, N2:', N1, N2) + #0-pad the right side (for now just make offset/kernel size such that don't need to worry about left side before encoder) + #The amount of padding needed is based on horizon, kernel size, offset: + print('offset',offset) + print('batchsize',batchsize) + #_ = tf.stack([offset, tf.shape(decoder_features)[0], feature_depth]) + _ = tf.stack([offset, batchsize, feature_depth+1]) + right_pad_zeros = tf.fill(_, 0.0) + + + #If doing quantile regression + Nquantiles = len(self.hparams.QUANTILES) if self.hparams.DO_QUANTILES else 0 + N_out = Nquantiles if self.hparams.DO_QUANTILES else 1 + print('Nquantiles',Nquantiles) + print('N_out',N_out) + + + #Quick check there are correct number timesteps: + assert decoder_features.shape[1] == self.inp.horizon_window_size #!!!!!!!quantiles + + #Rearrange tensors to all have time first: + # [batch_size, time, input_depth] -> [time, batch_size, input_depth] + decoder_targets = tf.expand_dims(decoder_targets,axis=-1) + decoder_features = tf.transpose(decoder_features, [1, 0, 2]) + print('decoder_targets',decoder_targets) + print('decoder_features',decoder_features) + + decoder_by_time = tf.concat([decoder_targets,decoder_features],axis=2) + encoder_by_time = tf.transpose(time_x, [1, 0, 2]) #Has target and features stacked already + + print('decoder_by_time',decoder_by_time) + print('encoder_by_time',encoder_by_time) + all_features_targets_by_time = tf.concat([encoder_by_time,decoder_by_time,right_pad_zeros],axis=0) + print('all_features_targets_by_time',all_features_targets_by_time) + + + #Simple 2 layer feedforward + def feedforward(_x): + fc1 = tf.layers.dense(_x, N2, activation=selu, name='fc1', kernel_initializer=self.default_init()) + fc2 = tf.layers.dense(fc1, N_out, name='fc2', kernel_initializer=self.default_init()) + return fc2 + + # Stop condition for decoding loop + def cond_fn(timestep, all_timesteps_input, array_targets: tf.TensorArray): + return timestep < self.inp.horizon_window_size + + def loop_fn(timestep, all_timesteps_input, array_targets: tf.TensorArray): + print(timestep) + #Excerpt time window + start = timestep - cropped_kernel_size + 1 + offset + self.inp.history_window_size + end = timestep + offset + self.inp.history_window_size + 1 + cropped = all_features_targets_by_time[start:end,:,:] + print('timestep,start,end', timestep,start,end) + cropped = tf.transpose(cropped,[1,0,2]) + cropped = tf.contrib.layers.flatten(cropped) + + #If using context vector, also append it + if self.hparams.RECURSIVE_W_ENCODER_CONTEXT: + cropped = tf.concat([cropped,summary_z],axis=1) + + print('cropped',cropped) + + #Include timestep related features: + #Because using fixed dimension MLP with fixed input for input layer neurons, + #but window is sliding over boundaries with different meaning (0 padded vs legit values), + #use 2 additional features that help control issue of boundary effects. 
+ #Use the (normalized) percent through decoder phase, = normalize(timestep/horizon), + #and the (normalized) percent overlap of the kernel with the 0padded region + f1 = timestep/self.inp.horizon_window_size - .5 + f2 = tf.maximum(0, offset - (self.inp.horizon_window_size - timestep)) / offset - .5 + _ = tf.stack([batchsize,1]) + f1 = tf.cast(tf.fill(_, f1),tf.float32) + f2 = tf.cast(tf.fill(_, f2),tf.float32) + print('f1',f1) + print('f2',f2) + print('cropped',cropped) + flattened_input = tf.concat([cropped,f1,f2],axis=-1) + #Before even running this, we know the shape (other than batchsize which may be dynamic) + #So, since dense layer in feed_forward needs last dimension specified, do this: + flattened_input.set_shape((None,N1)) + print('flattened_input',flattened_input) + + #Pass tensor into the simple feedforward network: + projected_output = feedforward(flattened_input) + #Append this timestep results + array_targets = array_targets.write(timestep, projected_output) + + + return timestep + 1, all_timesteps_input, array_targets #!!!!!! quantiles: projected_output will be diff dims + + + #Since we know the dimensions beforehand, jsut use an init with defined dimension so can initialize the feedforward net +# _ = tf.stack([batchsize,N1]) +# init_features_zeros = tf.cast(tf.fill(_, 0.),tf.float32) +# print('init_features_zeros',init_features_zeros) + + # Initial values for loop + loop_init = [tf.constant(0, dtype=tf.int32), #timestep + all_features_targets_by_time, #init_features_zeros,#all_features_targets_by_time, #all_timesteps_input + tf.TensorArray(dtype=tf.float32, size=self.inp.horizon_window_size)] #array_targets + + # Run the loop + _timestep, _, targets_ta = tf.while_loop(cond_fn, loop_fn, loop_init) + + #Get the post-processed predictions + #[time, batch_size, Nptcl] + targets = targets_ta.stack() + + # [time, batch_size, 1] -> [time, batch_size] +# targets = tf.squeeze(targets, axis=-1) + + return targets + + + + + + + + def direct_decoder(self, decoder_features, summary_z, local_context_size, global_context_size, seed): + """ + A different kind of decoder. Using the max of the allowed horizon sizes + as the horizon, do a direct forecast, with quantiles. + Based on "A Multi-Horizon Quantile Recurrent Forecaster" + by Amazon AI Labs: + https://arxiv.org/pdf/1711.11053.pdf + + 1. One time MLP/CNN to get context vectors [1 global, K local] + + 2. Replace the RNN-based decoder with an MLP decoder, called iteratively on every input in the horizon. + + Inputs: + decoder_features - decoder side features for all horizon timesteps [BATCH x HORIZON x FEATURES] + summary_z - encoded context vector, simple way using same vector as context for every decoder step [BATCH x RNNSTATEDEPTH]. 
Will be None if not doing encoder context option + + """ + + #Get dimensions of things based on inputs, features, options: + K = self.hparams.horizon_window_size_minmax[1] + summary_z_depth = summary_z.shape[-1].value if self.hparams.RECURSIVE_W_ENCODER_CONTEXT else 0 #Should just be the encoder RNN depth + feature_depth = decoder_features.shape[-1].value + batchsize = tf.shape(decoder_features)[0] + N1 = K*feature_depth + summary_z_depth + N_1_mid = int(.5*N1) + print('N1',N1) + print('N_1_mid',N_1_mid) + #Output of the global MLP/CNN has context vectors for each timestep in horizon 1,...,K; and also one global context vector + N_1_out = global_context_size + K*local_context_size + print('N_1_out',N_1_out) + print('local_context_size',local_context_size) + print('global_context_size',global_context_size) + + + #If doing quantile regression + N_2_in = global_context_size + local_context_size + feature_depth + N_2_mid = int(.5*N_2_in) + Nquantiles = len(self.hparams.QUANTILES) if self.hparams.DO_QUANTILES else 0 + N_2_out = Nquantiles if self.hparams.DO_QUANTILES else 1 + print('N_2_in',N_2_in) + print('Nquantiles',Nquantiles) + print('N_2_out',N_2_out) + + + + #MLP/CNN 1 : GLOBAL : Get context vectors from [encoded summary_z; all future features] + def net_1(_x): + net_1_fc1 = tf.layers.dense(_x, N_1_mid, activation=selu, name='net_1_fc1', kernel_initializer=self.default_init()) + net_1_fc2 = tf.layers.dense(net_1_fc1, N_1_out, name='net_1_fc2', kernel_initializer=self.default_init()) + return net_1_fc2 + + #MLP/CNN 2 : LCOAL : Get quantiles from [local_features; global_context; local_context] + #??? Could try feeding in summary_z as well to local. But the global context should basically capture summary_z, so should not need, and would increase net_2 size a lot. + def net_2(_x): + net_2_fc1 = tf.layers.dense(_x, N_2_mid, activation=selu, name='net_2_fc1', kernel_initializer=self.default_init()) + net_2_fc2 = tf.layers.dense(net_2_fc1, N_2_out, name='net_2_fc2', kernel_initializer=self.default_init()) + return net_2_fc2 + + + # Stop condition for decoding loop + def cond_fn(timestep, summary_z, decoder_features, local_context_vectors, global_context_vector, array_targets: tf.TensorArray): + return timestep < self.inp.horizon_window_size + + def loop_fn(timestep, summary_z, decoder_features, local_context_vectors, global_context_vector, array_targets: tf.TensorArray): + print(timestep) + + #Concatenate the 3 inputs: [global context; local context; features for that timestep] + net2_input = tf.concat([global_context_vector, local_context_vectors[:,timestep,:], decoder_features[:,timestep,:]],axis=1) + quantiles_output = net_2(net2_input) + array_targets = array_targets.write(timestep, quantiles_output) + + return timestep + 1, summary_z, decoder_features, local_context_vectors, global_context_vector, array_targets #!!!!!! 
quantiles: projected_output will be diff dims + + + #One time pass through the GLOBAL network: + +# ddddd = tf.transpose(tf.expand_dims(summary_z,-1),,perm=[0,2,1]) + flattened_input = tf.concat([tf.contrib.layers.flatten(decoder_features), summary_z],axis=1) +# flattened_input.set_shape((None,N1)) + context_vectors = net_1(flattened_input) + print('context_vectors',context_vectors) + global_context_vector = context_vectors[:,:global_context_size] + print('global_context_vector',global_context_vector) + print('context_vectors[:,global_context_size:]',context_vectors[:,global_context_size:]) + print('[batchsize, self.inp.horizon_window_size, local_context_size]',[batchsize, K, local_context_size]) + local_context_vectors = tf.reshape(context_vectors[:,global_context_size:], [batchsize, K, local_context_size]) + + # Initial values for loop + loop_init = [tf.constant(0, dtype=tf.int32), #timestep + summary_z, + decoder_features, #init_features_zeros,#all_features_targets_by_time, #all_future_input + local_context_vectors, + global_context_vector, + tf.TensorArray(dtype=tf.float32, size=self.inp.horizon_window_size)] #array_targets + + # Run the loop + _timestep, _, _, _, _, targets_ta = tf.while_loop(cond_fn, loop_fn, loop_init) + + #[time, batch_size, Nptcl] + targets = targets_ta.stack() + + # [time, batch_size, 1] -> [time, batch_size] +# targets = tf.squeeze(targets, axis=-1) + + + #Does not use RNN-based decoder, so decoder_outputs are not relevant + #But could do regularization on the feedforward net1 and net2 weights... +# decoder_outputs = tf.constant(0.,shape=[batchsize, self.inp.horizon_window_size, self.hparams.rnn_depth]) + decoder_outputs = None + + return targets, decoder_outputs \ No newline at end of file diff --git a/percent_dense.png b/percent_dense.png new file mode 100644 index 0000000..2ad21ff Binary files /dev/null and b/percent_dense.png differ diff --git a/quantile_plots.py b/quantile_plots.py new file mode 100644 index 0000000..9744010 --- /dev/null +++ b/quantile_plots.py @@ -0,0 +1,83 @@ +#Look at some example quantile predictions. +#Not fully integrated in to the full pipeline yet, this script just +#looks at 1 example test set, and for one forecast creation time. 
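+#Assumed workflow (not enforced here): run the script from the directory holding the per-quantile
+#prediction csvs, with groundtruth_path pointing at the matching test-set csv. After dropping the
+#0.4* file, five quantile csvs are expected (e.g. 5th/25th/50th/75th/95th -- illustrative), which
+#are drawn as two shaded bands around the median line.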
+ + +import numpy as np +import matplotlib.pyplot as plt +import pandas as pd +import os + + +# ============================================================================= +# PARAMETERS +# ============================================================================= + +#Which cities to look at +IDs = [i for i in range(1200)]#[1,2,3,5,6,12,622,555] [1,273,284, 245, 385, 458] + +#The original test set data that has the true values +groundtruth_path = r".../Desktop/forecasting/exData/4splits_aug/ours_daily_augmented__TESTset1.csv" +#corrsponding to r".../Desktop/forecasting/kaggle-web-traffic/output/quantiles/testset1" + + + + + + + + + + + +# ============================================================================= +# MAIN +# ============================================================================= + +#each csv of predictions is named as format: +#"{quantile}__{history}_{horizon}_{chunk}.csv" + +files = os.listdir('.') +files = [i for i in files if i.endswith('.csv')] +files = [i for i in files if not i.startswith('0.4')] +files = files[::-1]#reverse order for plotting so .05th on bottom, .95th on top + +actual_df = pd.read_csv(groundtruth_path) + +for id_ in IDs: + _ = [] + quantiles = [] + for i, f in enumerate(files): + df = pd.read_csv(f) +# print(df) + row = df.loc[df['Unnamed: 0']==id_].head(1) #in case predictions repeated +# print(row) + _.append(row.values[:,1:].flatten()) + quantiles.append(os.path.split(f)[1][:-4]) +# _=np.array(_) +# print(quantiles) + dates = df.columns.tolist()[1:] +# print(dates) + try: + fig=plt.figure() + ax=fig.add_subplot(111) + plt.title('{}'.format(id_)) + colors = ['r','g','b','g','r'] + lines = ['--','--','-','--','--'] + for qq in range(len(_)): + plt.plot(_[qq], color=colors[qq], linestyle=lines[qq], label=quantiles[qq]) + x = np.arange(len(_[0])) + # print(_[0]) + ax.fill_between(x,_[0],_[-1],color='r',alpha=.2) + ax.fill_between(x,_[1],_[-2],color='g',alpha=.2) + # print(_[1]) + groundtruth = actual_df.loc[actual_df['Page']==id_] + groundtruth = groundtruth[dates].T + plt.plot(groundtruth,color='k',label='actual') + K=14 + plt.xticks(np.arange(len(dates))[::K],dates[::K]) + plt.legend(numpoints=1) + # plt.show() + plt.savefig('{}.png'.format(id_)) + except: + continue \ No newline at end of file diff --git a/trainer copy.py b/trainer copy.py new file mode 100755 index 0000000..9b93998 --- /dev/null +++ b/trainer copy.py @@ -0,0 +1,827 @@ +import os.path +import shutil +import sys +import numpy as np +import tensorflow as tf +from tqdm import trange +from typing import List, Tuple +import heapq +import logging +import pandas as pd +from enum import Enum + +from hparams import build_from_set, build_hparams +from feeder import VarFeeder +from input_pipe import InputPipe, ModelMode, Splitter,FakeSplitter, page_features +from model import Model +import argparse + +from collections import defaultdict + + +log = logging.getLogger('trainer') + +class Ema: + def __init__(self, k=0.99): + self.k = k + self.state = None + self.steps = 0 + + def __call__(self, *args, **kwargs): + v = args[0] + self.steps += 1 + if self.state is None: + self.state = v + else: + eff_k = min(1 - 1 / self.steps, self.k) + self.state = eff_k * self.state + (1 - eff_k) * v + return self.state + + +class Metric: + def __init__(self, name: str, op, smoothness: float = None): + self.name = name + self.op = op + self.smoother = Ema(smoothness) if smoothness else None + self.epoch_values = [] + self.best_value = np.Inf + self.best_step = 0 + self.last_epoch = -1 + self.improved 
= False + self._top = [] + + @property + def avg_epoch(self): + return np.mean(self.epoch_values) + + @property + def best_epoch(self): + return np.min(self.epoch_values) + + @property + def last(self): + return self.epoch_values[-1] if self.epoch_values else np.nan + + @property + def top(self): + return -np.mean(self._top) + + + def update(self, value, epoch, step): + if self.smoother: + value = self.smoother(value) + if epoch > self.last_epoch: + self.epoch_values = [] + self.last_epoch = epoch + self.epoch_values.append(value) + if value < self.best_value: + self.best_value = value + self.best_step = step + self.improved = True + else: + self.improved = False + if len(self._top) >= 5: + heapq.heappushpop(self._top, -value) + else: + heapq.heappush(self._top, -value) + + +class AggMetric: + def __init__(self, metrics: List[Metric]): + self.metrics = metrics + + def _mean(self, fun) -> float: + # noinspection PyTypeChecker + return np.mean([fun(metric) for metric in self.metrics]) + + @property + def avg_epoch(self): + return self._mean(lambda m: m.avg_epoch) + + @property + def best_epoch(self): + return self._mean(lambda m: m.best_epoch) + + @property + def last(self): + return self._mean(lambda m: m.last) + + @property + def top(self): + return self._mean(lambda m: m.top) + + @property + def improved(self): + return np.any([metric.improved for metric in self.metrics]) + + +class DummyMetric: + @property + def avg_epoch(self): + return np.nan + + @property + def best_epoch(self): + return np.nan + + @property + def last(self): + return np.nan + + @property + def top(self): + return np.nan + + @property + def improved(self): + return False + + @property + def metrics(self): + return [] + + +class Stage(Enum): + TRAIN = 0 + EVAL_SIDE = 1 + EVAL_FRWD = 2 + EVAL_SIDE_EMA = 3 + EVAL_FRWD_EMA = 4 + + +class ModelTrainerV2: + def __init__(self, train_model: Model, eval: List[Tuple[Stage, Model]], model_no=0, + patience=None, stop_metric=None, summary_writer=None): + self.train_model = train_model + if eval: + self.eval_stages, self.eval_models = zip(*eval) + else: + self.eval_stages, self.eval_models = [], [] + self.stopped = False + self.model_no = model_no + self.patience = patience + self.best_metric = np.inf + self.bad_epochs = 0 + self.stop_metric = stop_metric + self.summary_writer = summary_writer + + def std_metrics(model: Model, smoothness): + return [Metric('SMAPE', model.smape, smoothness), Metric('MAE', model.mae, smoothness), Metric('qntl', model.ave_quantile_loss, smoothness)] + + self._metrics = {Stage.TRAIN: std_metrics(train_model, 0.9) + [Metric('GrNorm', train_model.glob_norm)]} + for stage, model in eval: + self._metrics[stage] = std_metrics(model, None) + self.dict_metrics = {key: {metric.name: metric for metric in metrics} for key, metrics in self._metrics.items()} + + def init(self, sess): + for model in list(self.eval_models) + [self.train_model]: + model.inp.init_iterator(sess) + + @property + def metrics(self): + return self._metrics + + @property + def train_ops(self): + model = self.train_model +# print('model.train_op',model.train_op) + return [model.train_op] # , model.summaries + + def metric_ops(self, key): + return [metric.op for metric in self._metrics[key]] + + def process_metrics(self, key, run_results, epoch, step): + metrics = self._metrics[key] + summaries = [] + for result, metric in zip(run_results, metrics): + metric.update(result, epoch, step) + summaries.append(tf.Summary.Value(tag=f"{key.name}/{metric.name}_0", simple_value=result)) + return 
summaries + + def end_epoch(self): + if self.stop_metric: + best_metric = self.stop_metric(self.dict_metrics)# self.dict_metrics[Stage.EVAL_FRWD]['SMAPE'].avg_epoch + if self.best_metric > best_metric: + self.best_metric = best_metric + self.bad_epochs = 0 + else: + self.bad_epochs += 1 + if self.bad_epochs > self.patience: + self.stopped = True + + +class MultiModelTrainer: + def __init__(self, trainers: List[ModelTrainerV2], inc_step_op, + misc_global_ops=None): + self.trainers = trainers + self.inc_step = inc_step_op + self.global_ops = misc_global_ops or [] + self.eval_stages = trainers[0].eval_stages + + def active(self): + return [trainer for trainer in self.trainers if not trainer.stopped] + + def _metric_step(self, stage, initial_ops, sess: tf.Session, epoch: int, step=None, repeats=1, summary_every=1): + ops = initial_ops + offsets, lengths = [], [] + trainers = self.active() + for trainer in trainers: + offsets.append(len(ops)) + metric_ops = trainer.metric_ops(stage) + lengths.append(len(metric_ops)) + ops.extend(metric_ops) + if repeats > 1: + all_results = np.stack([np.array(sess.run(ops)) for _ in range(repeats)]) + results = np.mean(all_results, axis=0) + else: + results = sess.run(ops) + if step is None: + step = results[0] + + for trainer, offset, length in zip(trainers, offsets, lengths): + chunk = results[offset: offset + length] + summaries = trainer.process_metrics(stage, chunk, epoch, step) + if trainer.summary_writer and step > 200 and (step % summary_every == 0): + summary = tf.Summary(value=summaries) + trainer.summary_writer.add_summary(summary, global_step=step) + return results + + def train_step(self, sess: tf.Session, epoch: int): + ops = [self.inc_step] + self.global_ops + for trainer in self.active(): + ops.extend(trainer.train_ops) +# print('ops', ops) + results = self._metric_step(Stage.TRAIN, ops, sess, epoch, summary_every=20) +# print('results: ', results) + #return results[:len(self.global_ops) + 1] # step, grad_norm + return results[0] + + def eval_step(self, sess: tf.Session, epoch: int, step, n_batches, stages:List[Stage]=None): + target_stages = stages if stages is not None else self.eval_stages + for stage in target_stages: + self._metric_step(stage, [], sess, epoch, step, repeats=n_batches) + + def metric(self, stage, name): + return AggMetric([trainer.dict_metrics[stage][name] for trainer in self.trainers]) + + def end_epoch(self): + for trainer in self.active(): + trainer.end_epoch() + + def has_active(self): + return len(self.active()) + + + + +def train(features_set, sampling_period, name, hparams, multi_gpu=False, n_models=1, train_completeness_threshold=0.01, + seed=None, logdir='data/logs', max_epoch=100, patience=2, train_sampling=1.0, + eval_sampling=1.0, eval_memsize=5, gpu=0, gpu_allow_growth=False, save_best_model=False, + forward_split=False, write_summaries=False, verbose=False, asgd_decay=None, tqdm=True, + side_split=True, max_steps=None, save_from_step=None, do_eval=True, save_epochs_performance=False):#, horizon_window_size=63, history_window_size=283): + + eval_k = int(round(26214 * eval_memsize / n_models)) + eval_batch_size = int( + eval_k / (hparams.rnn_depth * hparams.encoder_rnn_layers)) # 128 -> 1024, 256->512, 512->256 + eval_pct = 0.1 + batch_size = hparams.batch_size +# history_window_size = hparams.history_window_size + tf.reset_default_graph() + if seed: + tf.set_random_seed(seed) + + with tf.device("/cpu:0"): + inp = VarFeeder.read_vars(f"data/{name}") + if side_split: + splitter = Splitter(page_features(inp, 
features_set), inp.page_map, 3, train_sampling=train_sampling, + test_sampling=eval_sampling, seed=seed) + else: + splitter = FakeSplitter(page_features(inp, features_set), 3, seed=seed, test_sampling=eval_sampling) + + real_train_pages = splitter.splits[0].train_size + real_eval_pages = splitter.splits[0].test_size + + items_per_eval = real_eval_pages * eval_pct + eval_batches = int(np.ceil(items_per_eval / eval_batch_size)) + steps_per_epoch = real_train_pages // batch_size + eval_every_step = int(round(steps_per_epoch * eval_pct)) + # eval_every_step = int(round(items_per_eval * train_sampling / batch_size)) + + global_step = tf.train.get_or_create_global_step() + inc_step = tf.assign_add(global_step, 1) + + all_models: List[ModelTrainerV2] = [] + + print('eval_pct', eval_pct) + print('eval_k', eval_k) + print('eval_batch_size', eval_batch_size) + print('real_train_pages', real_train_pages) + print('real_eval_pages', real_eval_pages) + print('batch_size', batch_size) + print('items_per_eval', items_per_eval) + print('eval_batches', eval_batches) + print('steps_per_epoch', steps_per_epoch) + print('eval_every_step', eval_every_step) + + + def random_draw_history_and_horizon_window_sizes(trainer,sess): + """ + Want to not only have random start end, but also variable size chunks for + history and horizon sizes in TRAINING phase. + (in prediction phase, use fixed sizes, and then for different sizes see how performance is.) + """ +# metrics = [] + history = np.random.randint(low=hparams.history_window_size_minmax[0],high=hparams.history_window_size_minmax[1]+1) + horizon = np.random.randint(low=hparams.horizon_window_size_minmax[0],high=hparams.horizon_window_size_minmax[1]+1) +# print('random draw: history, horizon', history, horizon) +# attn_window = history - horizon + 1 +# max_train_empty = min(history-1, int(np.floor(history * (1 - TT.train_model.inp.train_completeness_threshold)))) +# max_predict_empty = int(np.floor(horizon * (1 - TT.train_model.inp.predict_completeness_threshold))) + for TT in trainer.trainers: + TT.train_model.inp.history_window_size = history + TT.train_model.inp.horizon_window_size = horizon + TT.train_model.inp.attn_window = history - horizon + 1 + TT.train_model.inp.max_train_empty = min(history-1, int(np.floor(history * (1 - TT.train_model.inp.train_completeness_threshold)))) + TT.train_model.inp.max_predict_empty = int(np.floor(horizon * (1 - TT.train_model.inp.predict_completeness_threshold))) +# TT.train_model.inp = InputPipeline +# TT.train_model.init(sess) + TT.train_model.inp.inp.restore(sess) + TT.train_model.inp.init_iterator(sess) + + + + #model.pipe = InputPipeline(...) #!!!!can just reinit new pipe each time? 
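+        #Worked example of the bounds above (numbers illustrative): a drawn history of 100 steps with
+        #train_completeness_threshold=0.01 gives max_train_empty = min(99, floor(100*0.99)) = 99,
+        #a drawn horizon of 60 gives attn_window = 100 - 60 + 1 = 41, and max_predict_empty is
+        #floor(60 * (1 - predict_completeness_threshold)).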
+# #In InputPipe __init__: +# def init_iterator(self, session): +# session.run(self.iterator.initializer) + + + + +# metrics.append(TT.dict_metrics) +# MOD_=0 +# STAGE_=1#index +# __ = list(metrics[MOD_].values())[STAGE_]['SMAPE'] +# print(__.name) +# print(__.op) +# print(__.smoother) +# print(__.epoch_values) +# print(__.best_value) +# print(__.best_step) +# print(__.last_epoch) +# print(__.improved) +# print(__._top) + return trainer + + + + def create_model(features_set, sampling_period, scope, index, prefix, seed): + + #Just dummy filler, not important what value [since in training we will randomly vary these] + HISTORY_DUMMY = 100 + HORIZON_DUMMY = 60 + + with tf.variable_scope('input') as inp_scope: + with tf.device("/cpu:0"): + split = splitter.splits[index] + pipe = InputPipe(features_set, sampling_period, inp, features=split.train_set, N_time_series=split.train_size, + mode=ModelMode.TRAIN, batch_size=batch_size, n_epoch=None, verbose=verbose, + train_completeness_threshold=train_completeness_threshold, + predict_completeness_threshold=train_completeness_threshold, history_window_size=HISTORY_DUMMY, + horizon_window_size=HORIZON_DUMMY, + rand_seed=seed, train_skip_first=hparams.train_skip_first, + back_offset=HORIZON_DUMMY if forward_split else 0) + inp_scope.reuse_variables() + TCT = 0.01 + if side_split: + side_eval_pipe = InputPipe(features_set, sampling_period, inp, features=split.test_set, N_time_series=split.test_size, + mode=ModelMode.EVAL, batch_size=eval_batch_size, n_epoch=None, + verbose=verbose, horizon_window_size=HORIZON_DUMMY, + train_completeness_threshold=TCT, predict_completeness_threshold=0, + history_window_size=HISTORY_DUMMY, rand_seed=seed, runs_in_burst=eval_batches, + back_offset=HORIZON_DUMMY * (2 if forward_split else 1)) + else: + side_eval_pipe = None + if forward_split: + forward_eval_pipe = InputPipe(features_set, sampling_period, inp, features=split.test_set, N_time_series=split.test_size, + mode=ModelMode.EVAL, batch_size=eval_batch_size, n_epoch=None, + verbose=verbose, horizon_window_size=HORIZON_DUMMY, + train_completeness_threshold=TCT, predict_completeness_threshold=0, + history_window_size=HISTORY_DUMMY, rand_seed=seed, runs_in_burst=eval_batches, + back_offset=HORIZON_DUMMY) + else: + forward_eval_pipe = None + avg_sgd = asgd_decay is not None + #asgd_decay = 0.99 if avg_sgd else None + train_model = Model(pipe, hparams, is_train=True, graph_prefix=prefix, asgd_decay=asgd_decay, seed=seed) + scope.reuse_variables() + + eval_stages = [] + if side_split: + side_eval_model = Model(side_eval_pipe, hparams, is_train=False, + #loss_mask=np.concatenate([np.zeros(50, dtype=np.float32), np.ones(10, dtype=np.float32)]), + seed=seed) + eval_stages.append((Stage.EVAL_SIDE, side_eval_model)) + if avg_sgd: + eval_stages.append((Stage.EVAL_SIDE_EMA, side_eval_model)) + if forward_split: + forward_eval_model = Model(forward_eval_pipe, hparams, is_train=False, seed=seed) + eval_stages.append((Stage.EVAL_FRWD, forward_eval_model)) + if avg_sgd: + eval_stages.append((Stage.EVAL_FRWD_EMA, forward_eval_model)) + + if write_summaries: + summ_path = f"{logdir}/{name}_{index}" + if os.path.exists(summ_path): + shutil.rmtree(summ_path) + summ_writer = tf.summary.FileWriter(summ_path) # , graph=tf.get_default_graph() + else: + summ_writer = None + if do_eval and forward_split: + stop_metric = lambda metrics: metrics[Stage.EVAL_FRWD]['SMAPE'].avg_epoch + else: + stop_metric = None + return ModelTrainerV2(train_model, eval_stages, index, patience=patience, + 
stop_metric=stop_metric, + summary_writer=summ_writer) + + + + + if n_models == 1: + with tf.device(f"/gpu:{gpu}"): + scope = tf.get_variable_scope() + all_models = [create_model(features_set, sampling_period, scope, 0, None, seed=seed)] + else: + for i in range(n_models): + device = f"/gpu:{i}" if multi_gpu else f"/gpu:{gpu}" + with tf.device(device): + prefix = f"m_{i}" + with tf.variable_scope(prefix) as scope: + all_models.append(create_model(features_set, sampling_period, scope, i, prefix=prefix, seed=seed + i)) + trainer = MultiModelTrainer(all_models, inc_step) + if save_best_model or save_from_step: + saver_path = f'data/cpt/{name}' + if os.path.exists(saver_path): + shutil.rmtree(saver_path) + os.makedirs(saver_path) + saver = tf.train.Saver(max_to_keep=10, name='train_saver') + else: + saver = None + avg_sgd = asgd_decay is not None + if avg_sgd: + from itertools import chain + def ema_vars(model): + ema = model.train_model.ema + return {ema.average_name(v):v for v in model.train_model.ema._averages} + + ema_names = dict(chain(*[ema_vars(model).items() for model in all_models])) + #ema_names = all_models[0].train_model.ema.variables_to_restore() + ema_loader = tf.train.Saver(var_list=ema_names, max_to_keep=1, name='ema_loader') + ema_saver = tf.train.Saver(max_to_keep=1, name='ema_saver') + else: + ema_loader = None + + init = tf.global_variables_initializer() + + if forward_split and do_eval: + eval_smape = trainer.metric(Stage.EVAL_FRWD, 'SMAPE') + eval_mae = trainer.metric(Stage.EVAL_FRWD, 'MAE') + eval_qntl = trainer.metric(Stage.EVAL_FRWD, 'qntl') + else: + eval_smape = DummyMetric() + eval_mae = DummyMetric() + eval_qntl = DummyMetric() + + if side_split and do_eval: + eval_mae_side = trainer.metric(Stage.EVAL_SIDE, 'MAE') + eval_smape_side = trainer.metric(Stage.EVAL_SIDE, 'SMAPE') + eval_qntl_side = trainer.metric(Stage.EVAL_SIDE, 'qntl') + else: + eval_mae_side = DummyMetric() + eval_smape_side = DummyMetric() + eval_qntl_side = DummyMetric() + + train_smape = trainer.metric(Stage.TRAIN, 'SMAPE') + train_mae = trainer.metric(Stage.TRAIN, 'MAE') + train_qntl = trainer.metric(Stage.TRAIN, 'qntl') + grad_norm = trainer.metric(Stage.TRAIN, 'GrNorm') + eval_stages = [] + ema_eval_stages = [] + if forward_split and do_eval: + eval_stages.append(Stage.EVAL_FRWD) + ema_eval_stages.append(Stage.EVAL_FRWD_EMA) + if side_split and do_eval: + eval_stages.append(Stage.EVAL_SIDE) + ema_eval_stages.append(Stage.EVAL_SIDE_EMA) + + # gpu_options=tf.GPUOptions(allow_growth=False), + with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, + gpu_options=tf.GPUOptions(allow_growth=gpu_allow_growth))) as sess: + sess.run(init) + # pipe.load_vars(sess) + inp.restore(sess) + for model in all_models: + model.init(sess)#is just doing: +# class ModelTrainerV2: +# def init(self, sess): +# for model in list(self.eval_models) + [self.train_model]: +# model.inp.init_iterator(sess) + + # if beholder: + # visualizer = Beholder(session=sess, logdir=summ_path) + step = 0 + prev_top = np.inf + best_smape = np.inf + # Contains best value (first item) and subsequent values + best_epoch_smape = [] + + #Save out per epoch values to look at later [only per epoch, not savingout per step] + if save_epochs_performance: + output_list = [] + + for epoch in range(max_epoch): + + # n_steps = pusher.N_time_series // batch_size + if tqdm: + tqr = trange(steps_per_epoch, desc="%2d" % (epoch + 1), leave=False) + else: + tqr = range(steps_per_epoch) + + for _ in tqr: + #!!!!!!!!!! 
Variable random length train predict windows + #Random draw the train, predict window lengths + print(_) + trainer = random_draw_history_and_horizon_window_sizes(trainer,sess) +# print('+++++++++++++++', [(TT.train_model.inp.history_window_size,TT.train_model.inp.horizon_window_size) for TT in trainer.trainers]) +# print('--------', [(TT.train_model.inp.max_train_empty,TT.train_model.inp.max_predict_empty) for TT in trainer.trainers]) +# print('::::::::::::::', [(TT.train_model.inp.iterator.get_next()) for TT in trainer.trainers]) +# print('::::::::::::::', [(TT.train_model.inp.time_x,TT.train_model.inp.time_y) for TT in trainer.trainers]) + #model.init(sess) ???? + #model.pipe = InputPipeline(...) #!!!!can just reinit new pipe each time? +# #In InputPipe __init__: +# def init_iterator(self, session): +# session.run(self.iterator.initializer) + try: + step = trainer.train_step(sess, epoch) +# print('+-+-+-+-+-+-+-', [(TT.train_model.inp.history_window_size,TT.train_model.inp.horizon_window_size) for TT in trainer.trainers]) +# print('0000000000000', [(TT.train_model.inp.max_train_empty,TT.train_model.inp.max_predict_empty) for TT in trainer.trainers]) + except tf.errors.OutOfRangeError: + break + # if beholder: + # if step % 5 == 0: + # noinspection PyUnboundLocalVariable + # visualizer.update() + if step % eval_every_step == 0: + if eval_stages: + trainer.eval_step(sess, epoch, step, eval_batches, stages=eval_stages) + + if save_best_model and epoch > 0 and eval_smape.last < best_smape: + best_smape = eval_smape.last + saver.save(sess, f'data/cpt/{name}/cpt', global_step=step) + if save_from_step and step >= save_from_step: + saver.save(sess, f'data/cpt/{name}/cpt', global_step=step) + + if avg_sgd and ema_eval_stages: + ema_saver.save(sess, 'data/cpt_tmp/ema', write_meta_graph=False) + # restore ema-backed vars + ema_loader.restore(sess, 'data/cpt_tmp/ema') + + trainer.eval_step(sess, epoch, step, eval_batches, stages=ema_eval_stages) + # restore normal vars + ema_saver.restore(sess, 'data/cpt_tmp/ema') + + MAE = "%.3f/%.3f/%.3f" % (eval_mae.last, eval_mae_side.last, train_mae.last) + improvement = '↑' if eval_smape.improved else ' ' + SMAPE = "%s%.3f/%.3f/%.3f" % (improvement, eval_smape.last, eval_smape_side.last, train_smape.last) + qntl = "%.3f/%.3f/%.3f" % (eval_qntl.last, eval_qntl_side.last, train_qntl.last) + if tqdm: + tqr.set_postfix(gr=grad_norm.last, MAE=MAE, SMAPE=SMAPE, qntl=qntl) + if not trainer.has_active() or (max_steps and step > max_steps): + break + + if tqdm: + tqr.close() + trainer.end_epoch() + if not best_epoch_smape or eval_smape.avg_epoch < best_epoch_smape[0]: + best_epoch_smape = [eval_smape.avg_epoch] + else: + best_epoch_smape.append(eval_smape.avg_epoch) + + current_top = eval_smape.top + if prev_top > current_top: + prev_top = current_top + has_best_indicator = '↑' + else: + has_best_indicator = ' ' + status = "%2d: Best top SMAPE=%.3f%s (%s)" % ( + epoch + 1, current_top, has_best_indicator, + ",".join(["%.3f" % m.top for m in eval_smape.metrics])) + + if trainer.has_active(): +# status += ", frwd/side best MAE=%.3f/%.3f, SMAPE=%.3f/%.3f; avg MAE=%.3f/%.3f, SMAPE=%.3f/%.3f, %d active models" % \ +# (eval_mae.best_epoch, eval_mae_side.best_epoch, eval_smape.best_epoch, eval_smape_side.best_epoch, +# eval_mae.avg_epoch, eval_mae_side.avg_epoch, eval_smape.avg_epoch, eval_smape_side.avg_epoch, +# trainer.has_active()) + status += ", frwd/side best MAE=%.3f/%.3f, SMAPE=%.3f/%.3f, qntl=%.3f/%.3f; avg MAE=%.3f/%.3f, SMAPE=%.3f/%.3f, , qntl=%.3f/%.3f; %d active 
models" % \ + (eval_mae.best_epoch, eval_mae_side.best_epoch, eval_smape.best_epoch, eval_smape_side.best_epoch, eval_qntl.best_epoch, eval_qntl_side.best_epoch, + eval_mae.avg_epoch, eval_mae_side.avg_epoch, eval_smape.avg_epoch, eval_smape_side.avg_epoch, eval_qntl.avg_epoch, eval_qntl_side.avg_epoch, + trainer.has_active()) + print(status, file=sys.stderr) + if save_epochs_performance: + output_list.append([eval_mae.best_epoch, eval_mae_side.best_epoch, eval_smape.best_epoch, eval_smape_side.best_epoch, eval_qntl.best_epoch, eval_qntl_side.best_epoch, + eval_mae.avg_epoch, eval_mae_side.avg_epoch, eval_smape.avg_epoch, eval_smape_side.avg_epoch, eval_qntl.avg_epoch, eval_qntl_side.avg_epoch, + trainer.has_active()]) + else: + print(status, file=sys.stderr) + print("Early stopping!", file=sys.stderr) + break + if max_steps and step > max_steps: + print("Max steps calculated", file=sys.stderr) + break + sys.stderr.flush() + + if save_epochs_performance: + x = np.array(output_list) + outname = f"{logdir}/{name}_training_epochs_performance.npy" + np.save(outname,x) + + # noinspection PyUnboundLocalVariable + return np.mean(best_epoch_smape, dtype=np.float64) + + +def predict(features_set, sampling_period, checkpoints, TEST_dir, hparams, history_window_size, horizon_window_size, return_x=False, verbose=False, back_offset=0, n_models=1, + target_model=0, asgd=False, seed=1, batch_size=1024): #For predict: allow horizon_window_size to be fixed + with tf.variable_scope('input') as inp_scope: + with tf.device("/cpu:0"): +# inp = VarFeeder.read_vars("data/vars") + inp = VarFeeder.read_vars(TEST_dir) +# tf.Print(inp,['inp.counts',inp.counts]) + print(inp) +# try: +# +# except: +# pass + pipe = InputPipe(features_set, sampling_period, inp, page_features(inp, features_set), inp.N_time_series, mode=ModelMode.PREDICT, batch_size=batch_size, + train_completeness_threshold=0.01, + horizon_window_size=horizon_window_size, + predict_completeness_threshold=0.0, history_window_size=history_window_size, + back_offset=back_offset) + + print('pipe.time_x',pipe.time_x) + _ = tf.where(tf.is_nan(pipe.time_x)) + #pipe.time_x = tf.Print(pipe.time_x, ['where NANs in inp.time_x :', tf.shape(_), _, _[0], _[1], _[2], _[-3], _[-2], _[-1]]) +# pipe.time_x = tf.Print(pipe.time_x, ['where NANs in inp.time_x :', tf.shape(_), _]) + pipe.time_x = tf.check_numerics(pipe.time_x,'pipe.time_x has NANs') + + asgd_decay = 0.99 if asgd else None + if n_models == 1: + model = Model(pipe, hparams, is_train=False, seed=seed, asgd_decay=asgd_decay) + else: + models = [] + for i in range(n_models): + prefix = f"m_{i}" + with tf.variable_scope(prefix) as scope: + models.append(Model(pipe, hparams, is_train=False, seed=seed, asgd_decay=asgd_decay, graph_prefix=prefix)) + model = models[target_model] + + if asgd: + var_list = model.ema.variables_to_restore() + prefix = f"m_{target_model}" + for var in list(var_list.keys()): + if var.endswith('ExponentialMovingAverage') and not var.startswith(prefix): + del var_list[var] + else: + var_list = None + saver = tf.train.Saver(name='eval_saver', var_list=var_list) + x_buffer = [] + predictions = None + pred_df_quantiles_dict = defaultdict(lambda:[]) + with tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))) as sess: + pipe.load_vars(sess) + for checkpoint in checkpoints: + print('checkpoint',checkpoint) + pred_buffer = [] + pipe.init_iterator(sess) + saver.restore(sess, checkpoint) + +# if return_x: +# pred, x, pname = sess.run([model.predictions, model.inp.true_x, 
model.inp.page_ix]) +## print('pred',pred) +## print('x',x) +## print('pname',pname) +# else: +# pred, pname = sess.run([model.predictions, model.inp.page_ix]) + + pred, pname = sess.run([model.predictions, model.inp.page_ix]) + + #Our data already has page names (id's) as ints, so this decoding won't work, so just do str(id) + try: + utf_names = [str(name, 'utf-8') for name in pname] + except UnicodeDecodeError: + utf_names = [str(name) for name in pname] + + if hparams.DO_QUANTILES: + Nquantiles = pred.shape[0] + assert Nquantiles==len(hparams.QUANTILES), f'QUANTILES is {QUANTILES} but pred.shape is {pred.shape}' + for nn, qq in enumerate(hparams.QUANTILES): + pred_df = pd.DataFrame(index=utf_names, data=np.expm1(pred[nn])) + pred_df_quantiles_dict[qq].append(pred_df) +# print('pred_df',pred_df) +# print('pred',pred_df.shape) +# print('pred_df_quantiles_dict',pred_df_quantiles_dict) + else: + pred_df = pd.DataFrame(index=utf_names, data=np.expm1(pred[0])) + pred_buffer.append(pred_df) + + +# if return_x: +# # noinspection PyUnboundLocalVariable +# x_values = pd.DataFrame(index=utf_names, data=np.round(np.expm1(x)).astype(np.int64)) +# x_buffer.append(x_values) + + + cp_predictions = pd.concat(pred_buffer) + if predictions is None: + predictions = cp_predictions + else: + predictions += cp_predictions + + + + #!!!!!!!!!!!! need to change these lines when sampling WEEKLY MONTHLY + start_prediction = inp.data_end + pd.Timedelta('1D') - pd.Timedelta(back_offset, 'D') + end_prediction = start_prediction + pd.Timedelta(horizon_window_size - 1, 'D') + + if hparams.DO_QUANTILES: + predictions = {} + Ncpt = float(len(checkpoints)) + for kk, vv in pred_df_quantiles_dict.items(): + _ = sum(vv)/Ncpt + _.columns = pd.date_range(start_prediction, end_prediction) + predictions[kk] = _ + + else: + predictions /= len(checkpoints) #Since it is averaging predictions over the chckpoints + predictions.columns = pd.date_range(start_prediction, end_prediction) + predictions['point_est'] = predictions +# if return_x: +# x = pd.concat(x_buffer) +# start_data = inp.data_end - pd.Timedelta(history_window_size - 1, 'D') - pd.Timedelta(back_offset, 'D') +# end_data = inp.data_end - pd.Timedelta(back_offset, 'D') +# x.columns = pd.date_range(start_data, end_data) +# return predictions, x +# else: +# return predictions + + +# print('predictions',predictions) + return predictions + + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Train the model') + parser.add_argument('features_set', help="Which set of features to use. His default for Kaggle vs. 
one of my custom sets: {'arturius','simple','full','full_w_context'}")
+    parser.add_argument('sampling_period', help="{'daily','weekly','monthly'}")
+    parser.add_argument('--name', default='s32', help='Model name to identify different logs/checkpoints')
+    parser.add_argument('--hparam_set', default='s32', help="Hyperparameter set to use (see hparams.py for available sets)")
+    parser.add_argument('--n_models', default=1, type=int, help="Jointly train n models with different seeds")
+    parser.add_argument('--multi_gpu', default=False, action='store_true', help="Use multiple GPUs for multi-model training, one GPU per model")
+    parser.add_argument('--seed', default=5, type=int, help="Random seed")
+    parser.add_argument('--logdir', default='data/logs', help="Directory for summary logs")
+    parser.add_argument('--max_epoch', type=int, default=100, help="Max number of epochs")
+    parser.add_argument('--patience', type=int, default=2, help="Early stopping: stop after N epochs without improvement. Requires do_eval=True")
+    parser.add_argument('--train_sampling', type=float, default=1.0, help="Fraction of time series to sample for training")
+    parser.add_argument('--eval_sampling', type=float, default=1.0, help="Fraction of time series to sample for evaluation")
+    parser.add_argument('--eval_memsize', type=int, default=5, help="Approximate amount of available GPU memory, used to calculate the optimal evaluation batch size")
+    parser.add_argument('--gpu', default=0, type=int, help='GPU instance to use')
+    parser.add_argument('--gpu_allow_growth', default=False, action='store_true', help='Grow GPU memory usage gradually instead of grabbing all available memory at start')
+    parser.add_argument('--save_best_model', default=False, action='store_true', help='Save best model during training. Requires do_eval=True')
+    parser.add_argument('--no_forward_split', default=True, dest='forward_split', action='store_false', help='Disable the walk-forward split for model evaluation (on by default; requires do_eval=True)')
+    parser.add_argument('--side_split', default=False, action='store_true', help='Use side split for model evaluation. Requires do_eval=True')
+    parser.add_argument('--no_eval', default=True, dest='do_eval', action='store_false', help="Don't evaluate model quality during training")
+    parser.add_argument('--no_summaries', default=True, dest='write_summaries', action='store_false', help="Don't write TensorFlow summaries")
+    parser.add_argument('--verbose', default=False, action='store_true', help='Print additional information during graph construction')
+    parser.add_argument('--asgd_decay', type=float, help="EMA decay for averaged SGD. 
Not use ASGD if not set") + parser.add_argument('--no_tqdm', default=True, dest='tqdm', action='store_false', help="Don't use tqdm for status display during training") + parser.add_argument('--max_steps', type=int, help="Stop training after max steps") + parser.add_argument('--save_from_step', type=int, help="Save model on each evaluation (10 evals per epoch), starting from this step") +# parser.add_argument('--horizon_window_size', default=63, type=int, help="Number of days to predict") +# parser.add_argument('--history_window_size', default=283, type=int, help="Train window chunk size")#Now that we want to do train size - val size performance heatmaps + parser.add_argument('--save_epochs_performance', default=False, dest='save_epochs_performance', action='store_true', help='Save out per EPOCH metrics (NOT per step, only per EPOCH') + args = parser.parse_args() + + param_dict = dict(vars(args)) + param_dict['hparams'] = build_from_set(args.hparam_set) + del param_dict['hparam_set'] + train(**param_dict) + + # hparams = build_hparams() + # result = train("definc_attn", hparams, n_models=1, train_sampling=1.0, eval_sampling=1.0, patience=5, multi_gpu=True, + # save_best_model=False, gpu=0, eval_memsize=15, seed=5, verbose=True, forward_split=False, + # write_summaries=True, side_split=True, do_eval=False, horizon_window_size=63, asgd_decay=None, max_steps=11500, + # save_from_step=10500) + + # print("Training result:", result) + # preds = PREDICT('data/cpt/fair_365-15428', 380, hparams, verbose=True, back_offset=60, n_models=3) + # print(preds) diff --git a/trainer.py b/trainer.py old mode 100644 new mode 100755 index 4ecb65b..c5bb872 --- a/trainer.py +++ b/trainer.py @@ -16,6 +16,8 @@ from model import Model import argparse +from collections import defaultdict + log = logging.getLogger('trainer') @@ -164,7 +166,7 @@ def __init__(self, train_model: Model, eval: List[Tuple[Stage, Model]], model_no self.summary_writer = summary_writer def std_metrics(model: Model, smoothness): - return [Metric('SMAPE', model.smape, smoothness), Metric('MAE', model.mae, smoothness)] + return [Metric('SMAPE', model.smape, smoothness), Metric('MAE', model.mae, smoothness), Metric('qntl', model.ave_quantile_loss, smoothness)] self._metrics = {Stage.TRAIN: std_metrics(train_model, 0.9) + [Metric('GrNorm', train_model.glob_norm)]} for stage, model in eval: @@ -182,6 +184,7 @@ def metrics(self): @property def train_ops(self): model = self.train_model +# print('model.train_op',model.train_op) return [model.train_op] # , model.summaries def metric_ops(self, key): @@ -247,7 +250,9 @@ def train_step(self, sess: tf.Session, epoch: int): ops = [self.inc_step] + self.global_ops for trainer in self.active(): ops.extend(trainer.train_ops) +# print('ops', ops) results = self._metric_step(Stage.TRAIN, ops, sess, epoch, summary_every=20) +# print('results: ', results) #return results[:len(self.global_ops) + 1] # step, grad_norm return results[0] @@ -267,162 +272,35 @@ def has_active(self): return len(self.active()) -class ModelTrainer: - def __init__(self, train_model, eval_model, model_no=0, summary_writer=None, keep_best=5, patience=None): - self.train_model = train_model - self.eval_model = eval_model - self.stopped = False - self.smooth_train_mae = Ema() - self.smooth_train_smape = Ema() - self.smooth_eval_mae = Ema(0.5) - self.smooth_eval_smape = Ema(0.5) - self.smooth_grad = Ema(0.9) - self.summary_writer = summary_writer - self.model_no = model_no - self.best_top_n_loss = [] - self.keep_best = keep_best - self.best_step 
= 0 - self.patience = patience - self.train_pipe = train_model.inp - self.eval_pipe = eval_model.inp - self.epoch_mae = [] - self.epoch_smape = [] - self.last_epoch = -1 - - @property - def train_ops(self): - model = self.train_model - return [model.train_op, model.update_ema, model.summaries, model.mae, model.smape, model.glob_norm] - - def process_train_results(self, run_results, offset, global_step, write_summary): - offset += 2 - summaries, mae, smape, glob_norm = run_results[offset:offset + 4] - results = self.smooth_train_mae(mae), self.smooth_train_smape(smape), self.smooth_grad(glob_norm) - if self.summary_writer and write_summary: - self.summary_writer.add_summary(summaries, global_step=global_step) - return np.array(results) - - @property - def eval_ops(self): - model = self.eval_model - return [model.mae, model.smape] - - @property - def eval_len(self): - return len(self.eval_ops) - - @property - def train_len(self): - return len(self.train_ops) - - @property - def best_top_loss(self): - return -np.array(self.best_top_n_loss).mean() - - @property - def best_epoch_mae(self): - return min(self.epoch_mae) if self.epoch_mae else np.NaN - @property - def mean_epoch_mae(self): - return np.mean(self.epoch_mae) if self.epoch_mae else np.NaN - @property - def mean_epoch_smape(self): - return np.mean(self.epoch_smape) if self.epoch_smape else np.NaN - - @property - def best_epoch_smape(self): - return min(self.epoch_smape) if self.epoch_smape else np.NaN - - def remember_for_epoch(self, epoch, mae, smape): - if epoch > self.last_epoch: - self.last_epoch = epoch - self.epoch_mae = [] - self.epoch_smape = [] - self.epoch_mae.append(mae) - self.epoch_smape.append(smape) - - @property - def best_epoch_metrics(self): - return np.array([self.best_epoch_mae, self.best_epoch_smape]) - - @property - def mean_epoch_metrics(self): - return np.array([self.mean_epoch_mae, self.mean_epoch_smape]) - - def process_eval_results(self, run_results, offset, global_step, epoch): - totals = np.zeros(self.eval_len, np.float) - for result in run_results: - items = np.array(result[offset:offset + self.eval_len]) - totals += items - results = totals / len(run_results) - mae, smape = results - if self.summary_writer and global_step > 200: - summary = tf.Summary(value=[ - tf.Summary.Value(tag=f"test/MAE_{self.model_no}", simple_value=mae), - tf.Summary.Value(tag=f"test/SMAPE_{self.model_no}", simple_value=smape), - ]) - self.summary_writer.add_summary(summary, global_step=global_step) - smooth_mae = self.smooth_eval_mae(mae) - smooth_smape = self.smooth_eval_smape(smape) - self.remember_for_epoch(epoch, mae, smape) - - current_loss = -smooth_smape - - prev_best_n = np.mean(self.best_top_n_loss) if self.best_top_n_loss else -np.inf - if self.best_top_n_loss: - log.debug("Current loss=%.3f, old best=%.3f, wait steps=%d", -current_loss, - -max(self.best_top_n_loss), global_step - self.best_step) - - if len(self.best_top_n_loss) >= self.keep_best: - heapq.heappushpop(self.best_top_n_loss, current_loss) - else: - heapq.heappush(self.best_top_n_loss, current_loss) - log.debug("Best loss=%.3f, top_5 avg loss=%.3f, top_5=%s", - -max(self.best_top_n_loss), -np.mean(self.best_top_n_loss), - ",".join(["%.3f" % -mae for mae in self.best_top_n_loss])) - new_best_n = np.mean(self.best_top_n_loss) - - new_best = new_best_n > prev_best_n - if new_best: - self.best_step = global_step - log.debug("New best step %d, current loss=%.3f", global_step, -current_loss) - else: - step_count = global_step - self.best_step - if step_count > 
self.patience: - self.stopped = True - - return mae, smape, new_best, smooth_mae, smooth_smape - - -def train(name, hparams, multi_gpu=False, n_models=1, train_completeness_threshold=0.01, +def train(features_set, sampling_period, name, hparams, multi_gpu=False, n_models=1, train_completeness_threshold=0.01, seed=None, logdir='data/logs', max_epoch=100, patience=2, train_sampling=1.0, eval_sampling=1.0, eval_memsize=5, gpu=0, gpu_allow_growth=False, save_best_model=False, forward_split=False, write_summaries=False, verbose=False, asgd_decay=None, tqdm=True, - side_split=True, max_steps=None, save_from_step=None, do_eval=True, predict_window=63): + side_split=True, max_steps=None, save_from_step=None, do_eval=True, save_epochs_performance=False):#, horizon_window_size=63, history_window_size=283): eval_k = int(round(26214 * eval_memsize / n_models)) eval_batch_size = int( eval_k / (hparams.rnn_depth * hparams.encoder_rnn_layers)) # 128 -> 1024, 256->512, 512->256 eval_pct = 0.1 batch_size = hparams.batch_size - train_window = hparams.train_window +# history_window_size = hparams.history_window_size tf.reset_default_graph() if seed: tf.set_random_seed(seed) with tf.device("/cpu:0"): - inp = VarFeeder.read_vars("data/vars") + inp = VarFeeder.read_vars(f"data/{name}") if side_split: - splitter = Splitter(page_features(inp), inp.page_map, 3, train_sampling=train_sampling, + splitter = Splitter(page_features(inp, features_set), inp.page_map, 3, train_sampling=train_sampling, test_sampling=eval_sampling, seed=seed) else: - splitter = FakeSplitter(page_features(inp), 3, seed=seed, test_sampling=eval_sampling) + splitter = FakeSplitter(page_features(inp, features_set), 3, seed=seed, test_sampling=eval_sampling) real_train_pages = splitter.splits[0].train_size real_eval_pages = splitter.splits[0].test_size - + items_per_eval = real_eval_pages * eval_pct eval_batches = int(np.ceil(items_per_eval / eval_batch_size)) steps_per_epoch = real_train_pages // batch_size @@ -432,38 +310,106 @@ def train(name, hparams, multi_gpu=False, n_models=1, train_completeness_thresho global_step = tf.train.get_or_create_global_step() inc_step = tf.assign_add(global_step, 1) - all_models: List[ModelTrainerV2] = [] - def create_model(scope, index, prefix, seed): + print('eval_pct', eval_pct) + print('eval_k', eval_k) + print('eval_batch_size', eval_batch_size) + print('real_train_pages', real_train_pages) + print('real_eval_pages', real_eval_pages) + print('batch_size', batch_size) + print('items_per_eval', items_per_eval) + print('eval_batches', eval_batches) + print('steps_per_epoch', steps_per_epoch) + print('eval_every_step', eval_every_step) + + + def random_draw_history_and_horizon_window_sizes(trainer,sess,direct): + """ + Want to not only have random start end, but also variable size chunks for + history and horizon sizes in TRAINING phase. + (in prediction phase, use fixed sizes, and then for different sizes see how performance is.) 
+ """ +# metrics = [] + history = np.random.randint(low=hparams.history_window_size_minmax[0],high=hparams.history_window_size_minmax[1]+1) + horizon = np.random.randint(low=hparams.horizon_window_size_minmax[0],high=hparams.horizon_window_size_minmax[1]+1) +# print('random draw: history, horizon', history, horizon) +# attn_window = history - horizon + 1 +# max_train_empty = min(history-1, int(np.floor(history * (1 - TT.train_model.inp.train_completeness_threshold)))) +# max_predict_empty = int(np.floor(horizon * (1 - TT.train_model.inp.predict_completeness_threshold))) + for TT in trainer.trainers: + TT.train_model.inp.history_window_size = history + + #Randomly draw the horizon size. But only do this if not using the direct decoder. + #The direct decoder uses a fixed max horizon (e.g. 60 days), and whenever you only care to forecast fewer days, then just + #take the first N days of the full 60 day forecast. So the horizon stays fixed for this method. + if not direct: + TT.train_model.inp.horizon_window_size = horizon + + TT.train_model.inp.attn_window = history - horizon + 1 + TT.train_model.inp.max_train_empty = min(history-1, int(np.floor(history * (1 - TT.train_model.inp.train_completeness_threshold)))) + TT.train_model.inp.max_predict_empty = int(np.floor(horizon * (1 - TT.train_model.inp.predict_completeness_threshold))) +# TT.train_model.inp = InputPipeline +# TT.train_model.init(sess) + TT.train_model.inp.inp.restore(sess) + TT.train_model.inp.init_iterator(sess) + + #model.pipe = InputPipeline(...) #!!!!can just reinit new pipe each time? +# #In InputPipe __init__: +# def init_iterator(self, session): +# session.run(self.iterator.initializer) + +# metrics.append(TT.dict_metrics) +# MOD_=0 +# STAGE_=1#index +# __ = list(metrics[MOD_].values())[STAGE_]['SMAPE'] +# print(__.name) +# print(__.op) +# print(__.smoother) +# print(__.epoch_values) +# print(__.best_value) +# print(__.best_step) +# print(__.last_epoch) +# print(__.improved) +# print(__._top) + return trainer + + + + def create_model(features_set, sampling_period, scope, index, prefix, seed): + + #Just dummy filler, not important what value [since in training we will randomly vary these] + HISTORY_DUMMY = 100 + HORIZON_DUMMY = 60 with tf.variable_scope('input') as inp_scope: with tf.device("/cpu:0"): split = splitter.splits[index] - pipe = InputPipe(inp, features=split.train_set, n_pages=split.train_size, + pipe = InputPipe(features_set, sampling_period, inp, features=split.train_set, N_time_series=split.train_size, mode=ModelMode.TRAIN, batch_size=batch_size, n_epoch=None, verbose=verbose, train_completeness_threshold=train_completeness_threshold, - predict_completeness_threshold=train_completeness_threshold, train_window=train_window, - predict_window=predict_window, + predict_completeness_threshold=train_completeness_threshold, history_window_size=HISTORY_DUMMY, + horizon_window_size=HORIZON_DUMMY, rand_seed=seed, train_skip_first=hparams.train_skip_first, - back_offset=predict_window if forward_split else 0) + back_offset=HORIZON_DUMMY if forward_split else 0) inp_scope.reuse_variables() + TCT = 0.01 if side_split: - side_eval_pipe = InputPipe(inp, features=split.test_set, n_pages=split.test_size, + side_eval_pipe = InputPipe(features_set, sampling_period, inp, features=split.test_set, N_time_series=split.test_size, mode=ModelMode.EVAL, batch_size=eval_batch_size, n_epoch=None, - verbose=verbose, predict_window=predict_window, - train_completeness_threshold=0.01, predict_completeness_threshold=0, - 
train_window=train_window, rand_seed=seed, runs_in_burst=eval_batches, - back_offset=predict_window * (2 if forward_split else 1)) + verbose=verbose, horizon_window_size=HORIZON_DUMMY, + train_completeness_threshold=TCT, predict_completeness_threshold=0, + history_window_size=HISTORY_DUMMY, rand_seed=seed, runs_in_burst=eval_batches, + back_offset=HORIZON_DUMMY * (2 if forward_split else 1)) else: side_eval_pipe = None if forward_split: - forward_eval_pipe = InputPipe(inp, features=split.test_set, n_pages=split.test_size, + forward_eval_pipe = InputPipe(features_set, sampling_period, inp, features=split.test_set, N_time_series=split.test_size, mode=ModelMode.EVAL, batch_size=eval_batch_size, n_epoch=None, - verbose=verbose, predict_window=predict_window, - train_completeness_threshold=0.01, predict_completeness_threshold=0, - train_window=train_window, rand_seed=seed, runs_in_burst=eval_batches, - back_offset=predict_window) + verbose=verbose, horizon_window_size=HORIZON_DUMMY, + train_completeness_threshold=TCT, predict_completeness_threshold=0, + history_window_size=HISTORY_DUMMY, rand_seed=seed, runs_in_burst=eval_batches, + back_offset=HORIZON_DUMMY) else: forward_eval_pipe = None avg_sgd = asgd_decay is not None @@ -501,17 +447,19 @@ def create_model(scope, index, prefix, seed): summary_writer=summ_writer) + + if n_models == 1: with tf.device(f"/gpu:{gpu}"): scope = tf.get_variable_scope() - all_models = [create_model(scope, 0, None, seed=seed)] + all_models = [create_model(features_set, sampling_period, scope, 0, None, seed=seed)] else: for i in range(n_models): device = f"/gpu:{i}" if multi_gpu else f"/gpu:{gpu}" with tf.device(device): prefix = f"m_{i}" with tf.variable_scope(prefix) as scope: - all_models.append(create_model(scope, i, prefix=prefix, seed=seed + i)) + all_models.append(create_model(features_set, sampling_period, scope, i, prefix=prefix, seed=seed + i)) trainer = MultiModelTrainer(all_models, inc_step) if save_best_model or save_from_step: saver_path = f'data/cpt/{name}' @@ -540,19 +488,24 @@ def ema_vars(model): if forward_split and do_eval: eval_smape = trainer.metric(Stage.EVAL_FRWD, 'SMAPE') eval_mae = trainer.metric(Stage.EVAL_FRWD, 'MAE') + eval_qntl = trainer.metric(Stage.EVAL_FRWD, 'qntl') else: eval_smape = DummyMetric() eval_mae = DummyMetric() + eval_qntl = DummyMetric() if side_split and do_eval: eval_mae_side = trainer.metric(Stage.EVAL_SIDE, 'MAE') eval_smape_side = trainer.metric(Stage.EVAL_SIDE, 'SMAPE') + eval_qntl_side = trainer.metric(Stage.EVAL_SIDE, 'qntl') else: eval_mae_side = DummyMetric() eval_smape_side = DummyMetric() + eval_qntl_side = DummyMetric() train_smape = trainer.metric(Stage.TRAIN, 'SMAPE') train_mae = trainer.metric(Stage.TRAIN, 'MAE') + train_qntl = trainer.metric(Stage.TRAIN, 'qntl') grad_norm = trainer.metric(Stage.TRAIN, 'GrNorm') eval_stages = [] ema_eval_stages = [] @@ -570,7 +523,12 @@ def ema_vars(model): # pipe.load_vars(sess) inp.restore(sess) for model in all_models: - model.init(sess) + model.init(sess)#is just doing: +# class ModelTrainerV2: +# def init(self, sess): +# for model in list(self.eval_models) + [self.train_model]: +# model.inp.init_iterator(sess) + # if beholder: # visualizer = Beholder(session=sess, logdir=summ_path) step = 0 @@ -579,17 +537,36 @@ def ema_vars(model): # Contains best value (first item) and subsequent values best_epoch_smape = [] + #Save out per epoch values to look at later [only per epoch, not savingout per step] + if save_epochs_performance: + output_list = [] + for epoch in 
range(max_epoch): - # n_steps = pusher.n_pages // batch_size + # n_steps = pusher.N_time_series // batch_size if tqdm: tqr = trange(steps_per_epoch, desc="%2d" % (epoch + 1), leave=False) else: tqr = range(steps_per_epoch) for _ in tqr: + #!!!!!!!!!! Variable random length train predict windows + #Random draw the train, predict window lengths + print(_) + trainer = random_draw_history_and_horizon_window_sizes(trainer,sess,hparams.MLP_DIRECT_DECODER) +# print('+++++++++++++++', [(TT.train_model.inp.history_window_size,TT.train_model.inp.horizon_window_size) for TT in trainer.trainers]) +# print('--------', [(TT.train_model.inp.max_train_empty,TT.train_model.inp.max_predict_empty) for TT in trainer.trainers]) +# print('::::::::::::::', [(TT.train_model.inp.iterator.get_next()) for TT in trainer.trainers]) +# print('::::::::::::::', [(TT.train_model.inp.time_x,TT.train_model.inp.time_y) for TT in trainer.trainers]) + #model.init(sess) ???? + #model.pipe = InputPipeline(...) #!!!!can just reinit new pipe each time? +# #In InputPipe __init__: +# def init_iterator(self, session): +# session.run(self.iterator.initializer) try: step = trainer.train_step(sess, epoch) +# print('+-+-+-+-+-+-+-', [(TT.train_model.inp.history_window_size,TT.train_model.inp.horizon_window_size) for TT in trainer.trainers]) +# print('0000000000000', [(TT.train_model.inp.max_train_empty,TT.train_model.inp.max_predict_empty) for TT in trainer.trainers]) except tf.errors.OutOfRangeError: break # if beholder: @@ -618,8 +595,9 @@ def ema_vars(model): MAE = "%.3f/%.3f/%.3f" % (eval_mae.last, eval_mae_side.last, train_mae.last) improvement = '↑' if eval_smape.improved else ' ' SMAPE = "%s%.3f/%.3f/%.3f" % (improvement, eval_smape.last, eval_smape_side.last, train_smape.last) + qntl = "%.3f/%.3f/%.3f" % (eval_qntl.last, eval_qntl_side.last, train_qntl.last) if tqdm: - tqr.set_postfix(gr=grad_norm.last, MAE=MAE, SMAPE=SMAPE) + tqr.set_postfix(gr=grad_norm.last, MAE=MAE, SMAPE=SMAPE, qntl=qntl) if not trainer.has_active() or (max_steps and step > max_steps): break @@ -642,11 +620,19 @@ def ema_vars(model): ",".join(["%.3f" % m.top for m in eval_smape.metrics])) if trainer.has_active(): - status += ", frwd/side best MAE=%.3f/%.3f, SMAPE=%.3f/%.3f; avg MAE=%.3f/%.3f, SMAPE=%.3f/%.3f, %d am" % \ - (eval_mae.best_epoch, eval_mae_side.best_epoch, eval_smape.best_epoch, eval_smape_side.best_epoch, - eval_mae.avg_epoch, eval_mae_side.avg_epoch, eval_smape.avg_epoch, eval_smape_side.avg_epoch, - trainer.has_active()) +# status += ", frwd/side best MAE=%.3f/%.3f, SMAPE=%.3f/%.3f; avg MAE=%.3f/%.3f, SMAPE=%.3f/%.3f, %d active models" % \ +# (eval_mae.best_epoch, eval_mae_side.best_epoch, eval_smape.best_epoch, eval_smape_side.best_epoch, +# eval_mae.avg_epoch, eval_mae_side.avg_epoch, eval_smape.avg_epoch, eval_smape_side.avg_epoch, +# trainer.has_active()) + status += ", frwd/side best MAE=%.3f/%.3f, SMAPE=%.3f/%.3f, qntl=%.3f/%.3f; avg MAE=%.3f/%.3f, SMAPE=%.3f/%.3f, , qntl=%.3f/%.3f; %d active models" % \ + (eval_mae.best_epoch, eval_mae_side.best_epoch, eval_smape.best_epoch, eval_smape_side.best_epoch, eval_qntl.best_epoch, eval_qntl_side.best_epoch, + eval_mae.avg_epoch, eval_mae_side.avg_epoch, eval_smape.avg_epoch, eval_smape_side.avg_epoch, eval_qntl.avg_epoch, eval_qntl_side.avg_epoch, + trainer.has_active()) print(status, file=sys.stderr) + if save_epochs_performance: + output_list.append([eval_mae.best_epoch, eval_mae_side.best_epoch, eval_smape.best_epoch, eval_smape_side.best_epoch, eval_qntl.best_epoch, 
eval_qntl_side.best_epoch, + eval_mae.avg_epoch, eval_mae_side.avg_epoch, eval_smape.avg_epoch, eval_smape_side.avg_epoch, eval_qntl.avg_epoch, eval_qntl_side.avg_epoch, + trainer.has_active()]) else: print(status, file=sys.stderr) print("Early stopping!", file=sys.stderr) @@ -656,21 +642,39 @@ def ema_vars(model): break sys.stderr.flush() + if save_epochs_performance: + x = np.array(output_list) + outname = f"{logdir}/{name}_training_epochs_performance.npy" + np.save(outname,x) + # noinspection PyUnboundLocalVariable return np.mean(best_epoch_smape, dtype=np.float64) -def predict(checkpoints, hparams, return_x=False, verbose=False, predict_window=6, back_offset=0, n_models=1, - target_model=0, asgd=False, seed=1, batch_size=1024): +def predict(features_set, sampling_period, checkpoints, TEST_dir, hparams, history_window_size, horizon_window_size, return_x=False, verbose=False, back_offset=0, n_models=1, + target_model=0, asgd=False, seed=1, batch_size=1024): #For predict: allow horizon_window_size to be fixed with tf.variable_scope('input') as inp_scope: with tf.device("/cpu:0"): - inp = VarFeeder.read_vars("data/vars") - pipe = InputPipe(inp, page_features(inp), inp.n_pages, mode=ModelMode.PREDICT, batch_size=batch_size, - n_epoch=1, verbose=verbose, +# inp = VarFeeder.read_vars("data/vars") + inp = VarFeeder.read_vars(TEST_dir) +# tf.Print(inp,['inp.counts',inp.counts]) + print(inp) +# try: +# +# except: +# pass + pipe = InputPipe(features_set, sampling_period, inp, page_features(inp, features_set), inp.N_time_series, mode=ModelMode.PREDICT, batch_size=batch_size, train_completeness_threshold=0.01, - predict_window=predict_window, - predict_completeness_threshold=0.0, train_window=hparams.train_window, + horizon_window_size=horizon_window_size, + predict_completeness_threshold=0.0, history_window_size=history_window_size, back_offset=back_offset) + + print('pipe.time_x',pipe.time_x) + _ = tf.where(tf.is_nan(pipe.time_x)) + #pipe.time_x = tf.Print(pipe.time_x, ['where NANs in inp.time_x :', tf.shape(_), _, _[0], _[1], _[2], _[-3], _[-2], _[-1]]) +# pipe.time_x = tf.Print(pipe.time_x, ['where NANs in inp.time_x :', tf.shape(_), _]) + pipe.time_x = tf.check_numerics(pipe.time_x,'pipe.time_x has NANs') + asgd_decay = 0.99 if asgd else None if n_models == 1: model = Model(pipe, hparams, is_train=False, seed=seed, asgd_decay=asgd_decay) @@ -693,57 +697,93 @@ def predict(checkpoints, hparams, return_x=False, verbose=False, predict_window= saver = tf.train.Saver(name='eval_saver', var_list=var_list) x_buffer = [] predictions = None + pred_df_quantiles_dict = defaultdict(lambda:[]) with tf.Session(config=tf.ConfigProto(gpu_options=tf.GPUOptions(allow_growth=True))) as sess: pipe.load_vars(sess) for checkpoint in checkpoints: + print('checkpoint',checkpoint) pred_buffer = [] pipe.init_iterator(sess) saver.restore(sess, checkpoint) - cnt = 0 - while True: - try: - if return_x: - pred, x, pname = sess.run([model.predictions, model.inp.true_x, model.inp.page_ix]) - else: - pred, pname = sess.run([model.predictions, model.inp.page_ix]) - utf_names = [str(name, 'utf-8') for name in pname] - pred_df = pd.DataFrame(index=utf_names, data=np.expm1(pred)) - pred_buffer.append(pred_df) - if return_x: - # noinspection PyUnboundLocalVariable - x_values = pd.DataFrame(index=utf_names, data=np.round(np.expm1(x)).astype(np.int64)) - x_buffer.append(x_values) - newline = cnt % 80 == 0 - if cnt > 0: - print('.', end='\n' if newline else '', flush=True) - if newline: - print(cnt, end='') - cnt += 1 - except 
tf.errors.OutOfRangeError: - print('🎉') - break - cp_predictions = pd.concat(pred_buffer) - if predictions is None: - predictions = cp_predictions + +# if return_x: +# pred, x, pname = sess.run([model.predictions, model.inp.true_x, model.inp.page_ix]) +## print('pred',pred) +## print('x',x) +## print('pname',pname) +# else: +# pred, pname = sess.run([model.predictions, model.inp.page_ix]) + + pred, pname = sess.run([model.predictions, model.inp.page_ix]) + + #Our data already has page names (id's) as ints, so this decoding won't work, so just do str(id) + try: + utf_names = [str(name, 'utf-8') for name in pname] + except UnicodeDecodeError: + utf_names = [str(name) for name in pname] + + if hparams.DO_QUANTILES: + Nquantiles = pred.shape[0] + assert Nquantiles==len(hparams.QUANTILES), f'QUANTILES is {QUANTILES} but pred.shape is {pred.shape}' + for nn, qq in enumerate(hparams.QUANTILES): + pred_df = pd.DataFrame(index=utf_names, data=np.expm1(pred[nn])) + pred_df_quantiles_dict[qq].append(pred_df) +# print('pred_df',pred_df) +# print('pred',pred_df.shape) +# print('pred_df_quantiles_dict',pred_df_quantiles_dict) else: - predictions += cp_predictions - predictions /= len(checkpoints) - offset = pd.Timedelta(back_offset, 'D') - start_prediction = inp.data_end + pd.Timedelta('1D') - offset - end_prediction = start_prediction + pd.Timedelta(predict_window - 1, 'D') - predictions.columns = pd.date_range(start_prediction, end_prediction) - if return_x: - x = pd.concat(x_buffer) - start_data = inp.data_end - pd.Timedelta(hparams.train_window - 1, 'D') - back_offset - end_data = inp.data_end - back_offset - x.columns = pd.date_range(start_data, end_data) - return predictions, x + pred_df = pd.DataFrame(index=utf_names, data=np.expm1(pred[0])) + pred_buffer.append(pred_df) + + +# if return_x: +# # noinspection PyUnboundLocalVariable +# x_values = pd.DataFrame(index=utf_names, data=np.round(np.expm1(x)).astype(np.int64)) +# x_buffer.append(x_values) + + + cp_predictions = pd.concat(pred_buffer) + if predictions is None: + predictions = cp_predictions + else: + predictions += cp_predictions + + + + #!!!!!!!!!!!! need to change these lines when sampling WEEKLY MONTHLY + start_prediction = inp.data_end + pd.Timedelta('1D') - pd.Timedelta(back_offset, 'D') + end_prediction = start_prediction + pd.Timedelta(horizon_window_size - 1, 'D') + + if hparams.DO_QUANTILES: + predictions = {} + Ncpt = float(len(checkpoints)) + for kk, vv in pred_df_quantiles_dict.items(): + _ = sum(vv)/Ncpt + _.columns = pd.date_range(start_prediction, end_prediction) + predictions[kk] = _ + else: - return predictions + predictions /= len(checkpoints) #Since it is averaging predictions over the chckpoints + predictions.columns = pd.date_range(start_prediction, end_prediction) + predictions['point_est'] = predictions +# if return_x: +# x = pd.concat(x_buffer) +# start_data = inp.data_end - pd.Timedelta(history_window_size - 1, 'D') - pd.Timedelta(back_offset, 'D') +# end_data = inp.data_end - pd.Timedelta(back_offset, 'D') +# x.columns = pd.date_range(start_data, end_data) +# return predictions, x +# else: +# return predictions + + +# print('predictions',predictions) + return predictions if __name__ == '__main__': parser = argparse.ArgumentParser(description='Train the model') + parser.add_argument('features_set', help="Which set of features to use. His default for Kaggle vs. 
one of my custom sets: {'arturius','simple','full','full_w_context'}") + parser.add_argument('sampling_period', help="{'daily','weekly','monthly'}") parser.add_argument('--name', default='s32', help='Model name to identify different logs/checkpoints') parser.add_argument('--hparam_set', default='s32', help="Hyperparameters set to use (see hparams.py for available sets)") parser.add_argument('--n_models', default=1, type=int, help="Jointly train n models with different seeds") @@ -767,7 +807,9 @@ def predict(checkpoints, hparams, return_x=False, verbose=False, predict_window= parser.add_argument('--no_tqdm', default=True, dest='tqdm', action='store_false', help="Don't use tqdm for status display during training") parser.add_argument('--max_steps', type=int, help="Stop training after max steps") parser.add_argument('--save_from_step', type=int, help="Save model on each evaluation (10 evals per epoch), starting from this step") - parser.add_argument('--predict_window', default=63, type=int, help="Number of days to predict") +# parser.add_argument('--horizon_window_size', default=63, type=int, help="Number of days to predict") +# parser.add_argument('--history_window_size', default=283, type=int, help="Train window chunk size")#Now that we want to do train size - val size performance heatmaps + parser.add_argument('--save_epochs_performance', default=False, dest='save_epochs_performance', action='store_true', help='Save out per EPOCH metrics (NOT per step, only per EPOCH') args = parser.parse_args() param_dict = dict(vars(args)) @@ -778,9 +820,9 @@ def predict(checkpoints, hparams, return_x=False, verbose=False, predict_window= # hparams = build_hparams() # result = train("definc_attn", hparams, n_models=1, train_sampling=1.0, eval_sampling=1.0, patience=5, multi_gpu=True, # save_best_model=False, gpu=0, eval_memsize=15, seed=5, verbose=True, forward_split=False, - # write_summaries=True, side_split=True, do_eval=False, predict_window=63, asgd_decay=None, max_steps=11500, + # write_summaries=True, side_split=True, do_eval=False, horizon_window_size=63, asgd_decay=None, max_steps=11500, # save_from_step=10500) # print("Training result:", result) - # preds = predict('data/cpt/fair_365-15428', 380, hparams, verbose=True, back_offset=60, n_models=3) + # preds = PREDICT('data/cpt/fair_365-15428', 380, hparams, verbose=True, back_offset=60, n_models=3) # print(preds)
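+    # Illustrative invocation of the new signatures (paths, checkpoint names, hparam set and
+    # window sizes below are placeholders):
+    # hparams = build_from_set('s32')
+    # result = train('full', 'daily', 'my_run', hparams, n_models=1, patience=5, max_epoch=20,
+    #                forward_split=True, do_eval=True, asgd_decay=0.99, save_from_step=10500)
+    # preds = predict('full', 'daily', ['data/cpt/my_run/cpt-10500'], 'data/my_run_test', hparams,
+    #                 history_window_size=283, horizon_window_size=63, back_offset=0,
+    #                 n_models=1, asgd=True)
+    # With hparams.DO_QUANTILES the return value is a dict of quantile -> DataFrame;
+    # otherwise it is a single DataFrame of point estimates.
+    # print(preds)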