From a23d052e99d8aaf9535ad7c274859b227c46fed5 Mon Sep 17 00:00:00 2001
From: Ben Lindsay <benjlindsay@gmail.com>
Date: Tue, 13 Jun 2017 16:05:48 -0400
Subject: [PATCH] initial commit

---
 .gitignore                 |   2 +
 LICENSE.txt                |   7 ++
 MANIFEST                   |   5 ++
 MANIFEST.in                |   1 +
 README.md                  |  15 +++++
 README.rst                 |   6 ++
 create_jobs/__init__.py    |   1 +
 create_jobs/create_jobs.py | 135 +++++++++++++++++++++++++++++++++++++
 setup.py                   |  27 ++++++++
 9 files changed, 199 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 LICENSE.txt
 create mode 100644 MANIFEST
 create mode 100644 MANIFEST.in
 create mode 100644 README.md
 create mode 100644 README.rst
 create mode 100644 create_jobs/__init__.py
 create mode 100755 create_jobs/create_jobs.py
 create mode 100755 setup.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..1b100fe
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+dist/
+*.pyc
diff --git a/LICENSE.txt b/LICENSE.txt
new file mode 100644
index 0000000..05c6fdd
--- /dev/null
+++ b/LICENSE.txt
@@ -0,0 +1,7 @@
+Copyright 2017 Ben Lindsay
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/MANIFEST b/MANIFEST
new file mode 100644
index 0000000..bcf66ea
--- /dev/null
+++ b/MANIFEST
@@ -0,0 +1,5 @@
+# file GENERATED by distutils, do NOT edit
+README.rst
+setup.py
+create_jobs/__init__.py
+create_jobs/create_jobs.py
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..9561fb1
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1 @@
+include README.rst
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..396c4d5
--- /dev/null
+++ b/README.md
@@ -0,0 +1,15 @@
+# create_jobs
+
+A tool to facilitate creating a bunch of compute jobs
+
+## What this tool does
+
+If you use either PBS or SLURM and you want to run a series of jobs that are very similar but differ in 1 or more parameter values or input files, this tool could help.
+
+## Installation
+
+Install using `pip install create_jobs`. Click [here](https://pypi.python.org/pypi/create_jobs) to view this on the Python Package Index site. If you don't have `pip` on your computer, then [download Anaconda](https://www.continuum.io/downloads). You don't need root permissions. If you're on Linux, the download will give you a `bash` script that you just run using something like `bash Anaconda2-2.4.0-Linux-x86_64.sh`. This will give you a python distribution that includes `pip`.
+
+## Usage
+
+This tool operates on the concept of input files, a job submission file, and a table of parameters for each job. The table can be provided as a pandas DataFrame, a Python dictionary, or a tabular file (anything that `pandas.read_csv()` can read works). There is no limit to the number of input files you can have, and no default name for input files. The default name for job submission files is `sub.sh`.
diff --git a/README.rst b/README.rst
new file mode 100644
index 0000000..f16c54f
--- /dev/null
+++ b/README.rst
@@ -0,0 +1,6 @@
+create_jobs
+###########
+
+For usage and documentation, check out the `GitHub repo`_.
+
+.. _`GitHub repo`: https://github.com/benlindsay/create_jobs/
diff --git a/create_jobs/__init__.py b/create_jobs/__init__.py
new file mode 100644
index 0000000..d10e357
--- /dev/null
+++ b/create_jobs/__init__.py
@@ -0,0 +1 @@
+from .create_jobs import *
diff --git a/create_jobs/create_jobs.py b/create_jobs/create_jobs.py
new file mode 100755
index 0000000..2d5a322
--- /dev/null
+++ b/create_jobs/create_jobs.py
@@ -0,0 +1,135 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2017 Ben Lindsay <benjlindsay@gmail.com>
+
+from os import popen, mkdir
+from os.path import join, isfile, isdir
+import pandas as pd
+import time
+import string
+
+def create_jobs(file_list=None, param_table=None, base_dir='.',
+                table_sep=r'\s+', sub_file='sub.sh', sub_prog=None,
+                sleep_time=0, submit=True):
+    """
+    For each row of 'param_table' (a pandas DataFrame, a dict, or the name of a
+    table file read with separator 'table_sep'), create a job directory in
+    'base_dir', copy the files in 'file_list' into it with '{column}' variables
+    replaced by that row's values, and, if 'submit', submit 'sub_file' from it.
+    """
+    # Check variables
+    if file_list is None:
+        raise ValueError("No file_list provided")
+    if param_table is None:
+        raise ValueError("No param_table provided")
+    if isinstance(param_table, pd.DataFrame):
+        param_df = param_table.copy()  # JOB_NAME must not leak into the caller
+    elif isinstance(param_table, (str, type(u''))):  # text on Python 2 and 3
+        if isfile(param_table):
+            param_df = pd.read_csv(param_table, sep=table_sep)
+        else:
+            raise ValueError("{} is not a valid file name!".format(param_table))
+    elif isinstance(param_table, dict):
+        param_df = pd.DataFrame(param_table)
+    else:
+        raise ValueError("param_table must be either a pandas DataFrame, " +
+                         "a dictionary, or a file name!")
+    if submit and sub_prog is None:  # only look one up if we will submit
+        sub_prog = _find_sub_prog()
+
+    # Create JOB_NAME column if not already there
+    if 'JOB_NAME' not in param_df.columns:
+        param_df['JOB_NAME'] = param_df.index
+
+    # Iterate over rows of dataframe, creating and submitting jobs
+    for param_dict in param_df.to_dict(orient='records'):
+        job_dir = join(base_dir, str(param_dict['JOB_NAME']))
+        if isdir(job_dir):
+            print('{} already exists. Skipping.'.format(job_dir))
+            continue
+        mkdir(job_dir)
+        _copy_and_replace_files(file_list, job_dir, param_dict)
+        if submit:
+            # Fresh name: rebinding 'sub_file' would freeze job 1's expansion
+            job_sub_file = _replace_vars(sub_file, param_dict)
+            _submit_job(job_dir, job_sub_file, sleep_time, sub_prog)
+
+def _find_sub_prog():
+    """
+    Returns the first job submission command found on the system (currently
+    only qsub and sbatch are checked). Raises ValueError if neither is found.
+    """
+    possible_sub_prog_list = ['qsub', 'sbatch']
+    for prog in possible_sub_prog_list:
+        if popen('command -v ' + prog).read() != '':
+            return prog
+    raise ValueError("Could not find any of the following programs: {}"
+                     .format(possible_sub_prog_list))
+
+def _copy_and_replace_files(file_list, job_dir, param_dict):
+    """
+    Given a list, `file_list`, whose members are either file paths or
+    tuples like `('/path/to/from_file_name', 'to_file_name')` and job directory
+    `job_dir`, copies the files to the job directory and replaces
+    variables in those files and in the file names. A bare path is copied
+    under its base name. Raises TypeError for any other kind of member.
+    """
+    from os.path import basename
+    print("Copying files to {} and replacing vars".format(job_dir))
+    for input_file in file_list:
+        if isinstance(input_file, tuple):
+            from_file, to_name = input_file[0], input_file[1]
+        elif isinstance(input_file, (str, type(u''))):  # Py2 and Py3 text
+            # Base name only, so a path with directories lands in job_dir
+            from_file, to_name = input_file, basename(input_file)
+        else:
+            raise TypeError("file_list members must be paths or tuples")
+        # Replace variables in file names, if any
+        from_file = _replace_vars(from_file, param_dict)
+        to_file = _replace_vars(join(job_dir, to_name), param_dict)
+        # Copy file to job_dir with variables in text of file replaced
+        with open(from_file, 'r') as f_in, open(to_file, 'w') as f_out:
+            f_out.write(_replace_vars(f_in.read(), param_dict))
+
+def _replace_vars(text, param_dict):
+    """
+    Return `text` with every '{key}' field swapped for the matching value in
+    `param_dict`. A field whose key is missing is written back as '{key}'
+    (see _Safe_Dict), so a plain '{name}' that is not a parameter survives.
+    Technique from http://stackoverflow.com/a/17215533/2680824
+
+    Examples:
+        >>> _replace_vars('{last}, {first} {last}', {'first':'James', 'last':'Bond'})
+        'Bond, James Bond'
+        >>> _replace_vars('{last}, {first} {last}', {'last':'Bond'})
+        'Bond, {first} Bond'
+    """
+    lookup = _Safe_Dict(param_dict)
+    return string.Formatter().vformat(text, (), lookup)
+
+class _Safe_Dict(dict):
+    """
+    A dict whose item lookup does not fail for an absent string key: indexing
+    with such a key yields the key wrapped in braces, i.e. '{key}', instead.
+    _replace_vars() relies on this to put unknown fields back unchanged.
+
+    Examples:
+        >>> d = _Safe_Dict({'last':'Bond'})
+        >>> d['last']
+        'Bond'
+        >>> d['first']
+        '{first}'
+    """
+    def __missing__(self, key):
+        return ''.join(('{', key, '}'))
+
+def _submit_job(job_dir, sub_file, sleep_time, sub_prog):
+    """
+    Run 'sub_prog sub_file' inside 'job_dir', then sleep 'sleep_time' seconds.
+    """
+    import subprocess
+    print("submitting {}".format(join(job_dir, sub_file)))
+    subprocess.call(sub_prog.split() + [sub_file], cwd=job_dir)  # no shell
+    if sleep_time > 0:
+        time.sleep(sleep_time)
diff --git a/setup.py b/setup.py
new file mode 100755
index 0000000..2bf670d
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,27 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2017 Ben Lindsay <benjlindsay@gmail.com>
+
+from distutils.core import setup
+
+desc = 'A module for automating compute job creation and submission'
+# The long description is the text of README.rst, read at build time
+with open('README.rst', 'r') as f:
+    long_desc = f.read()
+
+setup(
+    name='create_jobs',
+    packages=['create_jobs'],
+    version='0.0.1',
+    description=desc,
+    long_description=long_desc,
+    requires=['pandas'],
+    install_requires=['pandas'],
+    scripts=[],
+    author='Ben Lindsay',
+    author_email='benjlindsay@gmail.com',
+    url='https://github.com/benlindsay/create_jobs',
+    keywords=['workflow', 'simulations'],
+    classifiers=[],
+)