diff --git a/genpipes/bfx/job2json_project_tracking.py b/genpipes/bfx/job2json_project_tracking.py
index a8efd5a2e..f1e4b8e15 100644
--- a/genpipes/bfx/job2json_project_tracking.py
+++ b/genpipes/bfx/job2json_project_tracking.py
@@ -48,6 +48,6 @@ def run(input_file, samples, readsets, job_name, metrics):
             samples=samples,
             readsets=readsets,
             job_name=job_name,
-            metrics=metrics
+            metrics=metrics,
         )
     )
diff --git a/genpipes/bfx/jsonator_project_tracking.py b/genpipes/bfx/jsonator_project_tracking.py
index da3d0ec3a..3f64fcf77 100644
--- a/genpipes/bfx/jsonator_project_tracking.py
+++ b/genpipes/bfx/jsonator_project_tracking.py
@@ -94,7 +94,8 @@ def create(pipeline, sample):
            if extension:
                file_json = {
                    'location_uri': f'{path_prefix}/{output_file}',
-                    'file_name': os.path.basename(output_file)
+                    'file_name': os.path.basename(output_file),
+                    'file_md5sum': None
                }
                job_json['file'].append(file_json)
            else:
diff --git a/genpipes/tools/job2json_project_tracking.py b/genpipes/tools/job2json_project_tracking.py
index 14c72d920..459892bdf 100755
--- a/genpipes/tools/job2json_project_tracking.py
+++ b/genpipes/tools/job2json_project_tracking.py
@@ -12,9 +12,13 @@
 import random
 import shutil
 import signal
+import hashlib
+import logging
 
 from datetime import datetime
 
+log = logging.getLogger(__name__)
+
 def main(args=None):
     """
     Main function
@@ -79,7 +83,17 @@ def sigterm_handler(_signo, _stack_frame):
                'metric_name': metric_name,
                'metric_value': metric_value
            }]
 
+           for fobj in job.get('file', []):
+               uri = fobj.get("location_uri")
+               local_path = uri_to_local_path(uri)
+
+               if local_path and os.path.exists(local_path) and os.path.isfile(local_path):
+                   md5 = compute_md5sum(local_path)
+               else:
+                   md5 = None
+               fobj["file_md5sum"] = md5
+
        # Print to file
        with open(args.json_outfile, 'w') as out_json:
            json.dump(current_json, out_json, indent=4)
@@ -114,5 +128,30 @@ def unlock(filepath):
     """
     shutil.rmtree(filepath + '.lock', ignore_errors=True)
 
+def compute_md5sum(filepath, block_size=8 * 1024 * 1024):
+    """
+    Compute MD5 hex digest of filepath
+    """
+    try:
+        with open(filepath, 'rb') as fh:
+            h = hashlib.md5()
+            for chunk in iter(lambda: fh.read(block_size), b''):
+                h.update(chunk)
+        return h.hexdigest()
+    except Exception as exc:
+        log.debug("compute_md5sum: could not read '%s': %s", filepath, exc)
+        return None
+
+def uri_to_local_path(uri):
+    """
+    Convert location_uri to local path (removing name of cluster)
+    """
+    if uri is None:
+        return None
+    parts = uri.split("://", 1)
+    if len(parts) == 2:
+        return parts[1]
+    return uri
+
 if __name__ == '__main__':
     main()
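A minimal usage sketch of the two helpers added above, not part of the patch itself. The import path assumes genpipes.tools is importable as a package, and the URI value is a made-up example of a location_uri with a cluster prefix:

import tempfile

from genpipes.tools.job2json_project_tracking import compute_md5sum, uri_to_local_path

# Strip the "<cluster>://" prefix from a tracking URI (hypothetical example value).
print(uri_to_local_path("abacus:///lustre03/project/sample1.sorted.bam"))
# -> /lustre03/project/sample1.sorted.bam

# MD5 of a readable local file yields a 32-character hex digest...
with tempfile.NamedTemporaryFile(delete=False) as tmp:
    tmp.write(b"example content\n")
print(compute_md5sum(tmp.name))

# ...while an unreadable or missing path yields None instead of raising.
print(compute_md5sum("/path/that/does/not/exist"))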