Skip to content
2 changes: 1 addition & 1 deletion genpipes/bfx/job2json_project_tracking.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,6 @@ def run(input_file, samples, readsets, job_name, metrics):
samples=samples,
readsets=readsets,
job_name=job_name,
metrics=metrics
metrics=metrics,
)
)
3 changes: 2 additions & 1 deletion genpipes/bfx/jsonator_project_tracking.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,8 @@ def create(pipeline, sample):
if extension:
file_json = {
'location_uri': f'{path_prefix}/{output_file}',
'file_name': os.path.basename(output_file)
'file_name': os.path.basename(output_file),
'file_md5sum': None
}
job_json['file'].append(file_json)
else:
Expand Down
39 changes: 39 additions & 0 deletions genpipes/tools/job2json_project_tracking.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,13 @@
import random
import shutil
import signal
import hashlib
import logging

from datetime import datetime

log = logging.getLogger(__name__)

def main(args=None):
"""
Main function
Expand Down Expand Up @@ -79,7 +83,17 @@ def sigterm_handler(_signo, _stack_frame):
'metric_name': metric_name,
'metric_value': metric_value
}]
for fobj in job.get('file', []):
uri = fobj.get("location_uri")
local_path = uri_to_local_path(uri)

if local_path and os.path.exists(local_path) and os.path.isfile(local_path):
md5 = compute_md5sum(local_path)
else:
md5 = None

fobj["file_md5sum"] = md5

# Print to file
with open(args.json_outfile, 'w') as out_json:
json.dump(current_json, out_json, indent=4)
Expand Down Expand Up @@ -114,5 +128,30 @@ def unlock(filepath):
"""
shutil.rmtree(filepath + '.lock', ignore_errors=True)

def compute_md5sum(filepath, block_size=8 * 1024 * 1024):
"""
Compute MD5 hex digest of filepath
"""
try:
with open(filepath, 'rb') as fh:
Comment thread
MareikeJaniak marked this conversation as resolved.
h = hashlib.md5()
for chunk in iter(lambda: fh.read(block_size), b''):
h.update(chunk)
return h.hexdigest()
except Exception as exc:
log.debug ("compute_md5sum: could not read '%s': %s", filepath, exc)
return None

def uri_to_local_path(uri):
"""
Convert location_uri to local path (removing name of cluster)
"""
if uri is None:
return None
parts = uri.split("://", 1)
if len(parts) == 2:
return parts[1]
return uri

if __name__ == '__main__':
main()