From 25d50c9459d1968fe8dc060e19d5aa4e9c9ae335 Mon Sep 17 00:00:00 2001
From: Ian Lee <lee1001@llnl.gov>
Date: Mon, 18 Sep 2017 15:18:18 -0700
Subject: [PATCH 01/14] Bumps version number to next dev version

Also moves to 0.3.x rather than 0.2.x line, to match Semantic
Versioning (semver.org) philosophies.
---
 pshtt/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pshtt/__init__.py b/pshtt/__init__.py
index d93b5b24..2385e834 100644
--- a/pshtt/__init__.py
+++ b/pshtt/__init__.py
@@ -1 +1 @@
-__version__ = '0.2.3'
+__version__ = '0.3.0-dev'

From c74b42842e5e32acd8b8efbc9aff6d226f4d6b5d Mon Sep 17 00:00:00 2001
From: Ian Lee <lee1001@llnl.gov>
Date: Tue, 19 Sep 2017 18:35:43 -0700
Subject: [PATCH 02/14] Default to building universal wheel

Because the project is pure Python and 2/3 compatible,
we can build a single wheel out of the package using:

`$ python setup.py bdist_wheel`

Ref: https://packaging.python.org/tutorials/distributing-packages/#universal-wheels
---
 setup.cfg | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/setup.cfg b/setup.cfg
index b88034e4..e5cc9829 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,2 +1,5 @@
+[bdist_wheel]
+universal = true
+
 [metadata]
 description-file = README.md

From a3ed7e4bf43fa90135a141d30587e4761cdf9e09 Mon Sep 17 00:00:00 2001
From: Ariana Mirian <arianamirian28@gmail.com>
Date: Fri, 22 Sep 2017 08:34:29 -0700
Subject: [PATCH 03/14] Scripts to run pshtt on GCE

---
 gce-scripts/README.md                | 205 +++++++++++++++++++++++++++
 gce-scripts/check_instances.sh       |  31 ++++
 gce-scripts/combine_shards.py        |  28 ++++
 gce-scripts/grab_and_combine_data.sh |  37 +++++
 gce-scripts/packages_to_install.sh   |  51 +++++++
 gce-scripts/run_all_scripts.sh       |  49 +++++++
 gce-scripts/run_instances.sh         |  31 ++++
 gce-scripts/running_script.sh        |  11 ++
 gce-scripts/scp_and_setup.sh         | 139 ++++++++++++++++++
 gce-scripts/split_up_dataset.sh      |  19 +++
 10 files changed, 601 insertions(+)
 create mode 100644 gce-scripts/README.md
 create mode 100644 gce-scripts/check_instances.sh
 create mode 100644 gce-scripts/combine_shards.py
 create mode 100644 gce-scripts/grab_and_combine_data.sh
 create mode 100644 gce-scripts/packages_to_install.sh
 create mode 100644 gce-scripts/run_all_scripts.sh
 create mode 100644 gce-scripts/run_instances.sh
 create mode 100644 gce-scripts/running_script.sh
 create mode 100644 gce-scripts/scp_and_setup.sh
 create mode 100644 gce-scripts/split_up_dataset.sh

diff --git a/gce-scripts/README.md b/gce-scripts/README.md
new file mode 100644
index 00000000..1c3a5f14
--- /dev/null
+++ b/gce-scripts/README.md
@@ -0,0 +1,205 @@
+# Pshtt as an HTTPS status checker
+
+Welcome! This is the documentation on how to run pshtt to scan sites for their
+HTTPS status. These instructions are mostly about how to run it at scale, but at
+the end, there are instructions on how to run on a local instance.
+
+This document goes over how to both run pshtt on multiple instances on google
+cloud engine and also how to run it as a singular instance on your local
+machine. It takes about 30 minutes to set up from start to finish.
+
+Running pshtt on 150 instances takes about 12 - 15 hours for a million sites.
+Assume at worst that each site will take 10 seconds (which is the default
+timeout) and scale up to whatever timeframe you want to run in based off of
+that.
+
+Example: 1000 sites in 2 hours would take 2 instances.
+
+# How to run Pshtt on Google Cloud Engine
+
+## Before you run
+
+1.  Set up a [google compute engine
+    account](https://cloud.google.com/compute/docs/access/user-accounts/).
+
+2.  Make sure you have the correct quota allowances.
+
+    *   Go to the [quotas page](https://cloud.google.com/compute/quotas) and select the project that
+		you want to run this under.
+    *   Request quotas --- click on the following items in the list and click
+        "edit qutoas" at the top of the page:
+        *   CPUS (all regions) --> 150
+        *   In use IP addresses --> 150
+        *   One Region's in use IPs (ex us-west1) --> 150
+        *   Same Region's CPUs (ex. us-west1) --> 150
+
+3.  Create Instance Group Template.
+
+    You will want to run multiple instances (presumably), and creating an
+    Instance Group template allows you to make up to 150 machines under the same
+    template.
+
+    *   Go to the template
+        tab and click "Create Instance Template".
+    *   Name --> "pshtt-template"
+    *   Machine type -- 1 CPU (n1-standard-1 (1 vCPU, 3.75 GB memory)).
+    *   Check allow HTTP and HTTPS traffic.
+    *   Boot Disk --- Ubuntu 14.04 LTS.
+    *   automatic restart (under management tab) -- off.
+    *   Hit create.
+
+# How to run Pshtt on Google Cloud Engine
+
+1.  Create a ssh key ONLY for the google cloud instances and upload to your
+    profile.
+
+    This is a security measure. ***DO NOT USE YOUR REGULAR SSH KEY.***
+
+    *   `cd ~/.ssh && ssh-keygen -t rsa -f gce_pshtt_key`
+    *   Go to the metadata
+        tab and hit edit.
+    *   `cd ~/.ssh && cat gce_pshtt_key.pub`
+    *   Copy the output of the above command and paste it into the console.
+
+2.  Create the instance group.
+
+    It is important to name your instance group something identifiable,
+    especially if you are sharing a project with others. Remember this instance
+    group name for a later step. ***We recommend that you try 1 instance at
+    first to make sure it works***.
+
+    *   Go to the instance group tab.
+    *   Click Multi-Zone, and select the region that you requested your
+        instances for.
+    *   Choose "pshtt-template" under instance template.
+
+    *   Hit create.
+
+    *   Welcome to your new instance group!
+
+## Updating Data Files and Setting up to Run
+
+The following is a set of commands to run to make your running directory.
+
+1.  Download the gcloud command line tool.
+
+    *   follow the [download
+        link](https://cloud.google.com/sdk/docs/#install_the_latest_cloud_tools_version_cloudsdk_current_version)
+        and install the correct sdk for your OS.
+    *   If this is your first time installing the gcloud command line tool,
+        follow the instructions on the page. Do not set any default zones.
+    *   If you already have this installed, follow the instructions
+        below:
+    *   `gcloud init`
+        *   Click `2` create a new configuration.
+        *   Enter `pshtt-configuration`
+        *   Choose the appropriate account
+        *   Click the appropriate number corresponding to your google project
+        *   If it complains that the API is not enabled, hit enabled and retry.
+        *   Do not set default zone or region
+        *   at this point, your default project should be this google project.
+            You can switch to any of your previous projects by running `gcloud
+            config set project PROJECTNAME`
+
+2.  Setting up your directory.
+
+    *   `mkdir ~/pshtt_run`
+        *   Creates the dir that you will run your program out of.
+    *   `gcloud compute instances list | sed -n '1!p' | grep
+        "<instance-group-name>" | awk '{print $5}' > ~/pshtt_run/hosts.txt`
+    *   `<instance-group-name>` is what you named the instance group you created
+        above.
+
+3.  Copy all .sh scripts from this [PR]():
+
+    *   Keep the name of the scripts the same.
+    *   `chmod +x ~/pshtt_run/*.sh`
+        *   which will make all the scripts executable.
+    *   `touch domains.csv`
+        *   Your domain list, one domain per line, with the input list ending in
+            `.csv`.
+        *   domains must have the scheme stripped from them and no trailing '/',
+            such as:
+            *   domain.tld/path/to/page
+            *   domain.tld
+            *   subdomain.domain.tld
+            *   www.subdomain.domain.tld
+    *   `mkdir ~/pshtt_run/data_results/`
+    *   `mv ~/pshtt_run/combine_shards.py ~/pshtt_run/data_results`
+        *   Places combine_shards.py into data_results/.
+    *   `mkdir ~/pshtt_run/input_files/`
+
+4.  roots.pem
+
+    We want to use our own CA file when running pshtt. We use the mozilla root
+    store for this purpose. Follow instructions on this
+    [PR](https://github.com/agl/extract-nss-root-certs).
+
+5.  Updating ssh key
+
+    *   If your new ssh key is called "gce_pshtt_key", skip this step.
+    *   If you did not name your ***new*** ssh key gce_pshtt_key, then you will
+        need to go through and rename the gce_pshtt_key in all the .sh files to
+        whatever you named your key.
+    *   in vim, this is ":%s/gce_pshtt_key/yourkeynamehere/g <enter>".
+
+## How to run
+
+1.  `screen -S pshtt_running`
+2.  `cd ~/pshtt_run/`
+3.  `./run_all_scripts.sh <input_file_name> <number_of_shards> <shard_name> >
+    log.out`
+    *   number of shards == number of hosts
+    *   each machine will contain a shard of the data to run.
+    *   This is the script that sets up all machines and puts all datafiles on
+        the machines for running.
+    *   `./run_all_scripts.sh top-1m.nocommas.8.31.2017 100 alexa`
+    *   will produce 100 shards all starting with "alexa" in the input_files
+        dir.
+        *   ex. alexa000.csv
+    *   NOTE: you can ONLY create 999 shards. If you need more than 999 shards,
+        you will need to change the split_up_dataset.sh file.
+4.  Detach from the screen session with `ctrl+a` followed by `d`.
+
+## During the run
+
+*   `./check_instances.sh`
+    *   will print the ip of each host, as well as FINISHED or NOT FINISHED.
+
+## After the run
+
+*   `./grab_and_combine_data.sh`
+
+    *   will grab all log and result data files, combine data files into one
+        large result file, and put these into data_results/.
+
+*   Delete your instance group. If you want to run data analysis, jump down to
+    the data analysis portion.
+
+# Running Pshtt on your local machine
+
+1.  Copy packages_to_install.sh and install the packages_to_install.sh.
+    *   `sudo ./packages_to_install.sh`
+2.  Clone pshtt.
+    *   `git clone https://github.com/dhs-ncats/pshtt.git`
+3.  Put roots.pem, running_script.sh, and your input file in the same dir as
+    pshtt.
+    *   Follow directions under Updating data files above on how to get a
+        roots.pem.
+    *   domains must have the scheme stripped from them and no trailing '/', such
+        as:
+        *   domain.tld/path/to/page
+        *   domain.tld
+        *   subdomain.domain.tld
+        *   www.subdomain.domain.tld
+    *   `chmod +x running_script.sh` to make it executable.
+4.  Run `./running_script.sh <input_filename>`
+5.  Results and profit.
+    *   Results can be found in `<input_filename>.json`.
+    *   If you want to be able to use this json file with any of the colab
+        notebooks (like the one listed below), you will also need to run
+        combine_shards.py on it:
+        *   Copy combine_shards.py into the same dir as the json file.
+        *   `echo <input_filename>.json > to_combine.txt`
+        *   `python combine_shards.py to_combine.txt > final_results.json`
+    *   Log can be found in `time_<input_filename>.txt`.
diff --git a/gce-scripts/check_instances.sh b/gce-scripts/check_instances.sh
new file mode 100644
index 00000000..ff13db77
--- /dev/null
+++ b/gce-scripts/check_instances.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+# Checks all the instances in hosts and checks the end of the log file
+# to see if it's finished. The script prints out FINISHED or NOT FINISHED
+# for each host respectively.
+
+hosts_file='hosts.txt'
+list_of_files=$(ls -1q input_files)
+i=1
+
+# Grab the correct input file for the corresponding machine.
+for z in $list_of_files;
+do
+    machine=$(sed "${i}q;d" $hosts_file)
+    # Check if the file has 'Wrote Results', which indicates that it's finished.
+    ssh -i ~/.ssh/gce_pshtt_key ubuntu@"${machine}" tail pshtt/time_"${z}".txt | grep -q 'Wrote results'
+    finished=$(echo $?)
+    if [[ "${finished}" -eq 0 ]]; then
+        echo 'server '"${machine}"' FINISHED'
+    else
+        echo 'server '"${machine}"' NOT FINISHED'
+    fi
+    ssh -i ~/.ssh/gce_pshtt_key ubuntu@"${machine}" cat pshtt/time_"${z}".txt | grep -q 'Traceback'
+    error=$(echo $?)
+    if [[ "${error}" -eq 0 ]]; then
+        echo 'server '"${machine}"' ERROR ON THIS MACHINE. CHECK INSTANCE.'
+    else
+        echo 'server '"${machine}"' NO ERROR.'
+    fi
+    ((i=i+1))
+done
diff --git a/gce-scripts/combine_shards.py b/gce-scripts/combine_shards.py
new file mode 100644
index 00000000..69efa4ec
--- /dev/null
+++ b/gce-scripts/combine_shards.py
@@ -0,0 +1,28 @@
+"""Combines pshtt shards into one final data file."""
+import json
+import sys
+
+
+def main():
+  if (len(sys.argv)) < 2:
+    print 'you need a filename!'
+    exit(1)
+  # Master file is the file with the list of filenames to intake.
+  # Fileception.
+  master_file = sys.argv[1]
+  filenames = []
+
+  # Read in the filenames that are the different shards.
+  with open(master_file, 'r') as input_file:
+    for line in input_file:
+      filenames.append(line.rstrip())
+  # For each shard, read it in and append to the final list to
+  # print out.
+  for item in filenames:
+    with open(item, 'r') as input_file:
+      json_data = json.load(input_file)
+      for item in json_data:
+        print json.dumps(item)
+
+if __name__ == '__main__':
+  main()
diff --git a/gce-scripts/grab_and_combine_data.sh b/gce-scripts/grab_and_combine_data.sh
new file mode 100644
index 00000000..e42679ae
--- /dev/null
+++ b/gce-scripts/grab_and_combine_data.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+# If pshtt is done on all machines, it grabs both
+# the log file and the output file from the machines and
+# places them in the data_results/ directory.
+
+# This script also sets up the files to be combined by
+# the combine_shards script. Because pshtt outputs the results
+# as a list of dicts, we need to combine all of those lists.
+# We output the dicts as a file of dicts, one per line.
+hosts_file='hosts.txt'
+list_of_files=$(ls -1q input_files)
+i=1
+
+for z in $list_of_files;
+do
+    machine=$(sed "${i}q;d" $hosts_file)
+    echo 'Kicking off '"${machine}"' number '$i
+    # Grab the actual result file.
+    echo 'grabbing result file'
+    scp -i ~/.ssh/gce_pshtt_key ubuntu@"${machine}":~/pshtt/"${z}".json data_results/
+    echo $?
+    # Grab the log file from that machine.
+    echo 'grabbing log file'
+    scp -i ~/.ssh/gce_pshtt_key ubuntu@"${machine}":~/pshtt/time_"${z}".txt data_results/
+    echo $?
+    echo 'creating to_combine.txt'
+    touch data_results/to_combine.txt
+    echo $?
+    echo 'putting file name into combine script'
+    echo "${z}"'.json' >> data_results/to_combine.txt
+    echo $?
+    ((i=i+1))
+done
+
+cd data_results
+python combine_shards.py to_combine.txt > final_results.json
diff --git a/gce-scripts/packages_to_install.sh b/gce-scripts/packages_to_install.sh
new file mode 100644
index 00000000..2a100bba
--- /dev/null
+++ b/gce-scripts/packages_to_install.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+
+# Installs all the necessary packages for pshtt to run.
+# Logs which package it is installing as well as its success (0) or failure
+# (1).
+echo 'UPDATE'
+apt-get -y update -qq
+echo $? ' ERROR CODE'
+echo 'PYTHON PIP'
+apt-get -y install python-pip -qq
+echo $? ' ERROR CODE'
+echo 'GIT'
+apt-get -y install git -qq
+echo $? ' ERROR CODE'
+echo 'PYTHON3-PIP'
+apt-get -y install python3-pip -qq
+echo $? ' ERROR CODE'
+echo 'LIBFFI6'
+apt-get -y install libffi6 libffi-dev -qq
+echo $? ' ERROR CODE'
+echo 'LIBSSL'
+apt-get -y install build-essential libssl-dev libffi-dev python-dev python3-dev -qq
+echo $? ' ERROR CODE'
+echo 'SETUPTOOLS'
+pip3 install --upgrade setuptools -qq
+echo $? ' ERROR CODE'
+echo 'CFFI'
+pip3 install cffi -qq
+echo $? ' ERROR CODE'
+echo 'SSLYZE'
+pip3 install sslyze -qq
+echo $? ' ERROR CODE'
+echo 'PUBLIC SUFFIX'
+pip3 install publicsuffix -qq
+echo $? ' ERROR CODE'
+echo 'REQUESTS'
+pip3 install --upgrade requests -qq
+echo $? ' ERROR CODE'
+echo 'DOCOPT'
+pip3 install docopt -qq
+echo $? ' ERROR CODE'
+echo 'PYOPENSSL'
+pip3 install pyopenssl -qq
+echo $? ' ERROR CODE'
+echo 'PYTABLEWRITER'
+pip3 install pytablewriter -qq
+echo $? ' ERROR CODE'
+echo 'TYPING'
+pip3 install typing -qq
+echo $? ' ERROR CODE'
+echo 'FINISHED INSTALLING PACKAGES'
diff --git a/gce-scripts/run_all_scripts.sh b/gce-scripts/run_all_scripts.sh
new file mode 100644
index 00000000..0d5f0bed
--- /dev/null
+++ b/gce-scripts/run_all_scripts.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+
+# This is the first script to run. This script calls
+# all the other pertinent scripts for setting up
+# and kicking off runs.
+
+# ./run_all_scripts.sh <data_file_to_split> <#_of_shards> <shard_output_prepend>
+# Ex: ./run_all_scripts.sh top-1m.nocommas.8.31.2017 100 alexa
+
+# Only the first input argument is required. The other two will default
+# to 10 and shard respectively.
+
+# will split up the file top-1m.nocommas.8.31.2017 into 100 files
+# into a dir called input_files, and all the files will start with
+# alexa_. So the shard files will be alexa000.csv, alexa001.csv
+# etc.
+
+# If any of the scripts fails, this hard fails and tells the user what script
+# went wrong.
+
+
+input_file=$1
+number_of_shards=${2-10}
+output_file_name=${3-shard_}
+
+echo 'Splitting dataset'
+./split_up_dataset.sh "${1}" "${2}" "${3}"
+error=$(echo $?)
+
+if [[ "${error}" -eq 1 ]]; then
+  echo 'ERROR WITH SPLIT DATASET SCRIPT'
+  exit 1
+fi
+
+echo 'Scp and setup'
+./scp_and_setup.sh "${3}"
+error=$(echo $?)
+if [[ "${error}" -eq 1 ]]; then
+  echo 'ERROR WITH SCP AND SETUP SCRIPT'
+  exit 1
+fi
+
+echo 'Running instances'
+./run_instances.sh
+error=$(echo $?)
+if [[ "${error}" -eq 1 ]]; then
+  echo 'ERROR WITH RUNNING INSTANCES SCRIPT'
+  exit 1
+fi
diff --git a/gce-scripts/run_instances.sh b/gce-scripts/run_instances.sh
new file mode 100644
index 00000000..b0be57b0
--- /dev/null
+++ b/gce-scripts/run_instances.sh
@@ -0,0 +1,31 @@
+#!/bin/bash
+
+# Runs pshtt on all instances, using the correct input file.
+
+hosts_file='hosts.txt'
+list_of_files=$(ls -1q input_files/)
+i=1
+
+# For each file, find the corresponding machine it's been uploaded to,
+# check if the screen exists (create if not) and kick off pshtt on that screen.
+
+for z in $list_of_files;
+do
+    machine=$(sed "${i}q;d" $hosts_file)
+    # Check if screen exists.
+    echo 'Kicking off '"${machine}"' number '$i
+    ssh -i ~/.ssh/gce_pshtt_key ubuntu@"${machine}" screen -list | grep -q "pshtt_screen"
+    answer=$(echo $?)
+    # If screen does not exist, then create it.
+    if [[ "${answer}" -eq 1 ]] ; then
+      echo 'Creating screen'
+      ssh -i ~/.ssh/gce_pshtt_key ubuntu@"${machine}" screen -S pshtt_screen -d -m
+      echo $?
+    fi
+
+    # Run script in screen.
+    echo 'Kicking off script'
+    ssh -i ~/.ssh/gce_pshtt_key ubuntu@"${machine}" "screen -S pshtt_screen -X -p 0 stuff $'cd pshtt && ./running_script.sh $z\n'"
+    echo $?
+    ((i=i+1))
+done
diff --git a/gce-scripts/running_script.sh b/gce-scripts/running_script.sh
new file mode 100644
index 00000000..3852934d
--- /dev/null
+++ b/gce-scripts/running_script.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+
+# Runs pshtt with a 10 second time, "Google-Transparency-Report" as the user
+# agent, with roots.pem as the CA file, and debug on. Logging goes to
+# time_<input_file_name>.txt
+
+# ./running_script.sh test_file.csv
+# output files: test_file.csv.json, time_test_file.csv.txt
+
+input_file=$1
+(time python3 -m pshtt.cli "${input_file}" -t 10 -u "Google-Transparency-Report" -j -o "${input_file}".json -f "roots.pem" --debug) 2> time_"${input_file}".txt
diff --git a/gce-scripts/scp_and_setup.sh b/gce-scripts/scp_and_setup.sh
new file mode 100644
index 00000000..a9a99a7b
--- /dev/null
+++ b/gce-scripts/scp_and_setup.sh
@@ -0,0 +1,139 @@
+#!/bin/bash
+
+# This file is broken up into three distinct parts.
+# The first part is uploading the packages to install
+# script to all machines, and kicking it off.
+# We do this first because 1) we need those packages to do anything else
+# and 2) it takes about 10 - 15 seconds per machine, so we parallelize it.
+
+# The second part is simply a check to see if the packages are finished
+# installing. We test the last machine in the list first because if that is
+# finished then all the other machines SHOULD also be finished. After we verify
+# that the last machine is finished, loop back through all of the machines and
+# make sure that they've all finished. If they haven't print out an error
+# warning for that machine and stop the whole process.
+# Takes the host file and the list of shards and
+# scps shards to hosts.
+# Also scps various scripts and installs pshtt
+# and all the necessary packages.
+# List of IPs, separated by line
+hosts_file='hosts.txt'
+# number of files that we need to cycle through
+num_files=$(ls -1q input_files/ | wc -l)
+# list of files; we do this deterministically
+# because then we can run this command across
+# other scripts and expect the same order of files.
+list_of_files=$(ls -1q input_files)
+# counter to keep track of which machine we're on (for logging purposes).
+i=1
+# We flip this bit if we find an error with any of the machines. This tells us
+# to stop the process so that the user can go by hand and fix the machine.
+error_with_packages=1
+
+# Upload script and install packages on all machines.
+# parallelized.
+################################################################
+for x in $list_of_files;
+do
+    # Grab the ip from hosts.txt that corresponds to the file number we are
+    # uploading.
+    # If we are uploading file #3 in the list, go to line 3 in the hosts file
+    # and upload to that ip.
+
+    machine=$(sed "${i}q;d" $hosts_file)
+    echo 'Now on '"${machine}"' number '$i
+    # Do not do strict host key checking so that you dont have to type "yes" for
+    # each machine.
+    echo 'Uploading packages_to_install.sh'
+    scp -i ~/.ssh/gce_pshtt_key -o "StrictHostKeyChecking no" packages_to_install.sh ubuntu@"${machine}":~/
+    echo $?
+    # We echo after each command to ensure that it worked. 0 means success.
+    # The Log file is how we can tell if the packages have all been uploaded.
+    echo 'Creating packages log file'
+    ssh -i ~/.ssh/gce_pshtt_key ubuntu@"${machine}" touch package_log_file.txt
+    echo $?
+    # Check to see if this screen exists already.
+    ssh -i ~/.ssh/gce_pshtt_key ubuntu@"${machine}" screen -list | grep -q "package_screen"
+    answer=$(echo $?)
+    # If the screen exists, then we won't create another one. Otherwise, create.
+    if [[ "${answer}" -eq 1 ]] ; then
+      echo 'Creating screen'
+      ssh -i ~/.ssh/gce_pshtt_key ubuntu@"${machine}" screen -S package_screen -d -m
+      echo $?
+    fi
+    # Run packages_to_install and pipe to packages_log_file.txt on each machine.
+    ssh -i ~/.ssh/gce_pshtt_key -t ubuntu@"${machine}" "screen -S package_screen -X -p 0 stuff $'sudo ./packages_to_install.sh > package_log_file.txt\n'"
+    echo $?
+    ((i=i+1))
+done
+
+
+# Check that all machines have finished installing packages.
+###################################################################
+# Grab the last machine in the hosts file. This was the last one to
+# be uploaded and kicked off, so presumably it will be the last one
+# to finish.
+machine=$(sed "${num_files}q;d" $hosts_file)
+while true
+do
+    echo 'Waiting on packages to install'
+    # Wait 10 seconds before checking the file again.
+    sleep 10
+    ssh -i ~/.ssh/gce_pshtt_key ubuntu@"${machine}" tail package_log_file.txt | grep -q 'FINISHED INSTALLING PACKAGES'
+    finished=$(echo $?)
+    if [[ "${finished}" -eq 0 ]]; then
+      break
+    fi
+done
+
+# Since the last machine is finished, go check the other machines.
+i=1
+for z in $list_of_files;
+do
+    machine=$(sed "${i}q;d" $hosts_file)
+    echo 'Now on '"${machine}"' number '$i
+    echo 'Checking packages finished installing'
+    ssh -i ~/.ssh/gce_pshtt_key ubuntu@"${machine}" tail package_log_file.txt | grep -q 'FINISHED INSTALLING PACKAGES'
+    finished=$(echo $?)
+    if [[ "${finished}" -eq 0 ]]; then
+      # Check if any of the machines had a problem installing packages.
+      ssh -i ~/.ssh/gce_pshtt_key ubuntu@"${machine}" cat package_log_file.txt | grep -q '1 ERROR CODE'
+      error=$(echo $?)
+      if [[ "${error}" -eq 0 ]]; then
+        echo 'ERROR WITH '"${machine}"
+        error_with_packages=0
+      fi
+    fi
+    ((i=i+1))
+done
+
+# If any of the machines had an error with a package, stop the entire process,
+# inform the user.
+if [[ "${error_with_packages}" -eq 0 ]]; then
+    echo 'ERROR FOUND WITH PACKAGES'
+    exit 1
+fi
+
+# Upload remaining data files.
+#####################################################################
+i=1
+for y in $list_of_files;
+do
+    machine=$(sed "${i}q;d" $hosts_file)
+    echo 'Now on '"${machine}"' number '$i
+    echo 'Cloning github repo file'
+    ssh -i ~/.ssh/gce_pshtt_key -t ubuntu@"${machine}" git clone https://github.com/dhs-ncats/pshtt.git
+    echo $?
+    echo 'copying data file to pshtt directory'
+    scp -i ~/.ssh/gce_pshtt_key input_files/"${y}" ubuntu@"${machine}":~/pshtt/
+    echo $?
+    echo 'Copying roots.pem into pshtt directory'
+    scp -i ~/.ssh/gce_pshtt_key "roots.pem" ubuntu@"${machine}":~/pshtt/
+    echo $?
+    echo 'Copying running script into pshtt directory'
+    scp -i ~/.ssh/gce_pshtt_key running_script.sh ubuntu@"${machine}":~/pshtt/
+    echo $?
+    echo "${y}";
+    ((i=i+1))
+done
+
diff --git a/gce-scripts/split_up_dataset.sh b/gce-scripts/split_up_dataset.sh
new file mode 100644
index 00000000..3b70473e
--- /dev/null
+++ b/gce-scripts/split_up_dataset.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+
+# ./split_up_dataset <input_file> <number_of_shards> <output_file_name>
+# Ex: ./split_up_dataset.sh top-1m.nocommas.8.31.2017 100 alexa
+
+# Uses split to break up the input file into N shards.
+# Because of how split works, some files will be larger or smaller
+# than others, but the sum of the files will equal the length of the
+# original file.
+
+# Add .csv suffix because that's what pshtt takes in.
+
+# Place all files into input_files dir for posterity.
+
+input_file=$1
+number_of_shards=${2-10}
+output_file_name=${3-shard_}
+
+split -a 3 --number=l/"${number_of_shards}" -d "${input_file}" input_files/"${output_file_name}" --additional-suffix=.csv

From 29e9c56abde0dcd2a1cae8e343bc2bce41780964 Mon Sep 17 00:00:00 2001
From: Ariana Mirian <arianamirian28@gmail.com>
Date: Fri, 22 Sep 2017 08:46:05 -0700
Subject: [PATCH 04/14] Updated to make more clear

---
 gce-scripts/README.md         | 12 ++++++------
 gce-scripts/running_script.sh |  2 +-
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/gce-scripts/README.md b/gce-scripts/README.md
index 1c3a5f14..7c799c06 100644
--- a/gce-scripts/README.md
+++ b/gce-scripts/README.md
@@ -24,8 +24,8 @@ Example: 1000 sites in 2 hours would take 2 instances.
 
 2.  Make sure you have the correct quota allowances.
 
-    *   Go to the [quotas page](https://cloud.google.com/compute/quotas) and select the project that
-		you want to run this under.
+    *   Go to the [quotas page](https://console.cloud.google.com/iam-admin/quotas?_ga=2.32845757.-2082203426.1506093243)
+		and select the project that you want to run this under.
     *   Request quotas --- click on the following items in the list and click
         "edit qutoas" at the top of the page:
         *   CPUS (all regions) --> 150
@@ -39,7 +39,7 @@ Example: 1000 sites in 2 hours would take 2 instances.
     Instance Group template allows you to make up to 150 machines under the same
     template.
 
-    *   Go to the template
+    *   Go to Compute Engine, then click on the Instance templates
         tab and click "Create Instance Template".
     *   Name --> "pshtt-template"
     *   Machine type -- 1 CPU (n1-standard-1 (1 vCPU, 3.75 GB memory)).
@@ -56,8 +56,8 @@ Example: 1000 sites in 2 hours would take 2 instances.
     This is a security measure. ***DO NOT USE YOUR REGULAR SSH KEY.***
 
     *   `cd ~/.ssh && ssh-keygen -t rsa -f gce_pshtt_key`
-    *   Go to the metadata
-        tab and hit edit.
+    *   Go to the [metadata
+        tab](https://console.cloud.google.com/compute/metadata/sshKeys?_ga=2.222782360.-2082203426.1506093243) and hit edit.
     *   `cd ~/.ssh && cat gce_pshtt_key.pub`
     *   Copy the output of the above command and paste it into the console.
 
@@ -110,7 +110,7 @@ The following is a set of commands to run to make your running directory.
     *   `<instance-group-name>` is what you named the instance group you created
         above.
 
-3.  Copy all .sh scripts from this [PR]():
+3.  Copy all .sh scripts from this directory:
 
     *   Keep the name of the scripts the same.
     *   `chmod +x ~/pshtt_run/*.sh`
diff --git a/gce-scripts/running_script.sh b/gce-scripts/running_script.sh
index 3852934d..9114d946 100644
--- a/gce-scripts/running_script.sh
+++ b/gce-scripts/running_script.sh
@@ -8,4 +8,4 @@
 # output files: test_file.csv.json, time_test_file.csv.txt
 
 input_file=$1
-(time python3 -m pshtt.cli "${input_file}" -t 10 -u "Google-Transparency-Report" -j -o "${input_file}".json -f "roots.pem" --debug) 2> time_"${input_file}".txt
+(time python3 -m pshtt.cli "${input_file}" -t 10 -u -j -o "${input_file}".json -f "roots.pem" --debug) 2> time_"${input_file}".txt

From 4336d248a2d3c1618bfd3775d6ac62a15c7e07c9 Mon Sep 17 00:00:00 2001
From: Ariana Mirian <arianamirian28@gmail.com>
Date: Fri, 22 Sep 2017 08:59:35 -0700
Subject: [PATCH 05/14] Updated README

---
 gce-scripts/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/gce-scripts/README.md b/gce-scripts/README.md
index 7c799c06..19d87da4 100644
--- a/gce-scripts/README.md
+++ b/gce-scripts/README.md
@@ -24,7 +24,7 @@ Example: 1000 sites in 2 hours would take 2 instances.
 
 2.  Make sure you have the correct quota allowances.
 
-    *   Go to the [quotas page](https://console.cloud.google.com/iam-admin/quotas?_ga=2.32845757.-2082203426.1506093243)
+    *   Go to the [quotas page](https://cloud.google.com/compute/quotas)
 		and select the project that you want to run this under.
     *   Request quotas --- click on the following items in the list and click
         "edit qutoas" at the top of the page:
@@ -57,7 +57,7 @@ Example: 1000 sites in 2 hours would take 2 instances.
 
     *   `cd ~/.ssh && ssh-keygen -t rsa -f gce_pshtt_key`
     *   Go to the [metadata
-        tab](https://console.cloud.google.com/compute/metadata/sshKeys?_ga=2.222782360.-2082203426.1506093243) and hit edit.
+        tab](https://cloud.google.com/compute/docs/instances/adding-removing-ssh-keys) and hit edit.
     *   `cd ~/.ssh && cat gce_pshtt_key.pub`
     *   Copy the output of the above command and paste it into the console.
 

From d11e111c7830f2eb40252e24eb5620946e167aa1 Mon Sep 17 00:00:00 2001
From: Ariana Mirian <arianamirian28@gmail.com>
Date: Fri, 22 Sep 2017 09:07:19 -0700
Subject: [PATCH 06/14] Fixed indentation conflict with flake

---
 gce-scripts/combine_shards.py | 38 +++++++++++++++++------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/gce-scripts/combine_shards.py b/gce-scripts/combine_shards.py
index 69efa4ec..13754378 100644
--- a/gce-scripts/combine_shards.py
+++ b/gce-scripts/combine_shards.py
@@ -4,25 +4,25 @@
 
 
 def main():
-  if (len(sys.argv)) < 2:
-    print 'you need a filename!'
-    exit(1)
-  # Master file is the file with the list of filenames to intake.
-  # Fileception.
-  master_file = sys.argv[1]
-  filenames = []
+    if (len(sys.argv)) < 2:
+        print 'you need a filename!'
+        exit(1)
+    # Master file is the file with the list of filenames to intake.
+    # Fileception.
+    master_file = sys.argv[1]
+    filenames = []
 
-  # Read in the filenames that are the different shards.
-  with open(master_file, 'r') as input_file:
-    for line in input_file:
-      filenames.append(line.rstrip())
-  # For each shard, read it in and append to the final list to
-  # print out.
-  for item in filenames:
-    with open(item, 'r') as input_file:
-      json_data = json.load(input_file)
-      for item in json_data:
-        print json.dumps(item)
+    # Read in the filenames that are the different shards.
+    with open(master_file, 'r') as input_file:
+        for line in input_file:
+            filenames.append(line.rstrip())
+    # For each shard, read it in and append to the final list to
+    # print out.
+    for item in filenames:
+        with open(item, 'r') as input_file:
+            json_data = json.load(input_file)
+            for item in json_data:
+                print json.dumps(item)
 
 if __name__ == '__main__':
-  main()
+    main()

From 3b6a04e777e4b15ffa11d152142a3f8b68988d77 Mon Sep 17 00:00:00 2001
From: Ariana Mirian <arianamirian28@gmail.com>
Date: Fri, 22 Sep 2017 09:10:23 -0700
Subject: [PATCH 07/14] Still fixing flake8 problems :|

---
 gce-scripts/combine_shards.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/gce-scripts/combine_shards.py b/gce-scripts/combine_shards.py
index 13754378..b9cc9189 100644
--- a/gce-scripts/combine_shards.py
+++ b/gce-scripts/combine_shards.py
@@ -5,7 +5,7 @@
 
 def main():
     if (len(sys.argv)) < 2:
-        print 'you need a filename!'
+        print('you need a filename!')
         exit(1)
     # Master file is the file with the list of filenames to intake.
     # Fileception.
@@ -22,7 +22,8 @@ def main():
         with open(item, 'r') as input_file:
             json_data = json.load(input_file)
             for item in json_data:
-                print json.dumps(item)
+                print(json.dumps(item))
+
 
 if __name__ == '__main__':
     main()

From 8af54fdf2e085dd99fcfa72369411a962496f7c1 Mon Sep 17 00:00:00 2001
From: Ariana Mirian <amirian@umich.edu>
Date: Fri, 29 Sep 2017 12:45:32 -0700
Subject: [PATCH 08/14] Fixed errors in explanations, and removed extraneous
 installation dependencies

---
 gce-scripts/README.md                | 1 -
 gce-scripts/check_instances.sh       | 0
 gce-scripts/grab_and_combine_data.sh | 0
 gce-scripts/packages_to_install.sh   | 5 +----
 gce-scripts/run_all_scripts.sh       | 0
 gce-scripts/run_instances.sh         | 0
 gce-scripts/running_script.sh        | 5 ++---
 gce-scripts/scp_and_setup.sh         | 0
 gce-scripts/split_up_dataset.sh      | 0
 9 files changed, 3 insertions(+), 8 deletions(-)
 mode change 100644 => 100755 gce-scripts/check_instances.sh
 mode change 100644 => 100755 gce-scripts/grab_and_combine_data.sh
 mode change 100644 => 100755 gce-scripts/packages_to_install.sh
 mode change 100644 => 100755 gce-scripts/run_all_scripts.sh
 mode change 100644 => 100755 gce-scripts/run_instances.sh
 mode change 100644 => 100755 gce-scripts/running_script.sh
 mode change 100644 => 100755 gce-scripts/scp_and_setup.sh
 mode change 100644 => 100755 gce-scripts/split_up_dataset.sh

diff --git a/gce-scripts/README.md b/gce-scripts/README.md
index 19d87da4..b0b7ed88 100644
--- a/gce-scripts/README.md
+++ b/gce-scripts/README.md
@@ -120,7 +120,6 @@ The following is a set of commands to run to make your running directory.
             `.csv`.
         *   domains must have the schema stripped of them and no trailing '/',
             such as:
-            *   domain.tld/path/to/page
             *   domain.tld
             *   subdomain.domain.tld
             *   www.subdomain.domain.tld
diff --git a/gce-scripts/check_instances.sh b/gce-scripts/check_instances.sh
old mode 100644
new mode 100755
diff --git a/gce-scripts/grab_and_combine_data.sh b/gce-scripts/grab_and_combine_data.sh
old mode 100644
new mode 100755
diff --git a/gce-scripts/packages_to_install.sh b/gce-scripts/packages_to_install.sh
old mode 100644
new mode 100755
index 2a100bba..d145d6ee
--- a/gce-scripts/packages_to_install.sh
+++ b/gce-scripts/packages_to_install.sh
@@ -6,9 +6,6 @@
 echo 'UPDATE'
 apt-get -y update -qq
 echo $? ' ERROR CODE'
-echo 'PYTHON PIP'
-apt-get -y install python-pip -qq
-echo $? ' ERROR CODE'
 echo 'GIT'
 apt-get -y install git -qq
 echo $? ' ERROR CODE'
@@ -19,7 +16,7 @@ echo 'LIBFFI6'
 apt-get -y install libffi6 libffi-dev -qq
 echo $? ' ERROR CODE'
 echo 'LIBSSL'
-apt-get -y install build-essential libssl-dev libffi-dev python-dev python3-dev -qq
+apt-get -y install build-essential libssl-dev libffi-dev python3-dev -qq
 echo $? ' ERROR CODE'
 echo 'SETUPTOOLS'
 pip3 install --upgrade setuptools -qq
diff --git a/gce-scripts/run_all_scripts.sh b/gce-scripts/run_all_scripts.sh
old mode 100644
new mode 100755
diff --git a/gce-scripts/run_instances.sh b/gce-scripts/run_instances.sh
old mode 100644
new mode 100755
diff --git a/gce-scripts/running_script.sh b/gce-scripts/running_script.sh
old mode 100644
new mode 100755
index 9114d946..895b2f8e
--- a/gce-scripts/running_script.sh
+++ b/gce-scripts/running_script.sh
@@ -1,8 +1,7 @@
 #!/bin/bash
 
-# Runs pshtt with a 10 second time, "Google-Transparency-Report" as the user
-# agent, with roots.pem as the CA file, and debug on. Logging goes to
-# time_<input_file_name>.txt
+# Runs pshtt with a 10 second timeout, with roots.pem as the CA file,
+# and debug on. Logging goes to time_<input_file_name>.txt
 
 # ./running_script.sh test_file.csv
 # output files: test_file.csv.json, time_test_file.csv.txt
diff --git a/gce-scripts/scp_and_setup.sh b/gce-scripts/scp_and_setup.sh
old mode 100644
new mode 100755
diff --git a/gce-scripts/split_up_dataset.sh b/gce-scripts/split_up_dataset.sh
old mode 100644
new mode 100755

From 2266c532b7abbfe03f2b07cdca59889f84d56c21 Mon Sep 17 00:00:00 2001
From: Ariana Mirian <amirian@umich.edu>
Date: Fri, 29 Sep 2017 12:51:55 -0700
Subject: [PATCH 09/14] Fixed typo

---
 gce-scripts/README.md | 1 -
 1 file changed, 1 deletion(-)

diff --git a/gce-scripts/README.md b/gce-scripts/README.md
index b0b7ed88..78f63c94 100644
--- a/gce-scripts/README.md
+++ b/gce-scripts/README.md
@@ -187,7 +187,6 @@ The following is a set of commands to run to make your running directory.
         roots.pem.
     *   domains must have the schema stripped of them and no trailing '/', such
         as:
-        *   domain.tld/path/to/page
         *   domain.tld
         *   subdomain.domain.tld
         *   www.subdomain.domain.tld

From 25e56e48269eee9139d6de27ad8f93a1c5ce902e Mon Sep 17 00:00:00 2001
From: Ian Lee <lee1001@llnl.gov>
Date: Sat, 30 Sep 2017 12:49:10 -0700
Subject: [PATCH 10/14] Converted model classes to new style

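In Python 2, a class that does not inherit from `object` is an old-style
class; in Python 3 every class is new-style. Inheriting from `object`
explicitly gives the model classes the same behavior on both versions.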
Reference: https://www.python.org/doc/newstyle/
---
 pshtt/models.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pshtt/models.py b/pshtt/models.py
index 47b0c0fc..1ae71a39 100644
--- a/pshtt/models.py
+++ b/pshtt/models.py
@@ -1,5 +1,5 @@
 
-class Domain:
+class Domain(object):
 
     def __init__(self, domain):
         self.domain = domain
@@ -23,7 +23,7 @@ def to_object(self):
         }
 
 
-class Endpoint:
+class Endpoint(object):
 
     def __init__(self, protocol, host, base_domain):
         # Basic endpoint description

From 8ccf1254eb4459cd49b19372f3071cc4b4f945fd Mon Sep 17 00:00:00 2001
From: Ian Lee <lee1001@llnl.gov>
Date: Sat, 30 Sep 2017 13:00:06 -0700
Subject: [PATCH 11/14] Added Travis CI build status badge

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index 133481a9..80231bd2 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,7 @@
 ## Pushing HTTPS :lock:
 
+[![Build Status](https://travis-ci.org/dhs-ncats/pshtt.svg?branch=master)](https://travis-ci.org/dhs-ncats/pshtt)
+
 `pshtt` (_"pushed"_) is a tool to scan domains for HTTPS best practices. It saves its results to a CSV (or JSON).
 
 `pshtt` was developed to _push_ organizations— especially large ones like the US Federal Government :us: — to adopt HTTPS across the enterprise. Federal .gov domains must comply with [M-15-13](https://https.cio.gov), a 2015 memorandum from the White House Office of Management and Budget that requires federal agencies to enforce HTTPS on their public web sites and services by the end of 2016. Much has been done, and [still more yet to do](https://18f.gsa.gov/2017/01/04/tracking-the-us-governments-progress-on-moving-https/).

From 7ae1af2c86502f698892ad50779661727f1699ae Mon Sep 17 00:00:00 2001
From: Ian Lee <lee1001@llnl.gov>
Date: Sat, 30 Sep 2017 15:33:53 -0700
Subject: [PATCH 12/14] Updated Python versions support

Travis was testing Python 3.6, which was not included in the setup.py
classifiers list. Meanwhile, 3.4 was in the classifiers but was not
being tested. This brings the two lists in sync.
---
 .travis.yml | 2 +-
 setup.py    | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/.travis.yml b/.travis.yml
index fdc9692b..422cd932 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -4,6 +4,7 @@ sudo: false
 
 python:
   - '2.7'
+  - '3.4'
   - '3.5'
   - '3.6'
 
@@ -13,4 +14,3 @@ install:
 
 script:
   - flake8 .
-
diff --git a/setup.py b/setup.py
index 07c33eb9..6ae6c505 100755
--- a/setup.py
+++ b/setup.py
@@ -49,6 +49,7 @@
         'Programming Language :: Python :: 3',
         'Programming Language :: Python :: 3.4',
         'Programming Language :: Python :: 3.5',
+        'Programming Language :: Python :: 3.6',
     ],
 
     # What does your project relate to?

From 8bc0b31dc3c4f8f8e70aba42d88993e6a1119caa Mon Sep 17 00:00:00 2001
From: Shane Frasier <maverick@maverickdolphin.com>
Date: Tue, 17 Oct 2017 13:16:30 -0400
Subject: [PATCH 13/14] pytablereader seems to require a newer version of
 requests.

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 9371b6ea..28ad4e89 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,4 @@
-requests==2.14.2
+requests>=2.18.4
 sslyze==1.1.0
 wget==3.2
 docopt

From adebfa468dc678483e49d0e79a2dab8665b6336f Mon Sep 17 00:00:00 2001
From: Ian Lee <IanLee1521@gmail.com>
Date: Tue, 17 Oct 2017 21:44:13 -0700
Subject: [PATCH 14/14] Updated requests version in setup.py

This is to match the change made to the requirements.txt file
---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 07c33eb9..26274842 100755
--- a/setup.py
+++ b/setup.py
@@ -57,7 +57,7 @@
     packages=['pshtt'],
 
     install_requires=[
-        'requests>=2.14.2',
+        'requests>=2.18.4',
         'sslyze>=1.1.0',
         'wget>=3.2',
         'docopt',