Skip to content

Commit f56f974

Browse files
committed
Merge remote-tracking branch 'refs/remotes/origin/ga-model-store' into ga-model-store
2 parents 39b1d7e + 81b1e1e commit f56f974

24 files changed

+15538
-4762
lines changed

requirements.txt

+1-1
Original file line number | Diff line number | Diff line change
@@ -5,4 +5,4 @@ jinja2
55
python-dateutil
66
click
77
prov
8-
schema_salad
8+
typing-extensions

rocrate/provenance_profile.py

+35-48
Original file line number | Diff line number | Diff line change
@@ -1,19 +1,12 @@
1-
import copy
2-
import pdb
31
import datetime
4-
import logging
52
import urllib
63
import uuid
74
import json
8-
from io import BytesIO
95
from pathlib import PurePath, PurePosixPath
10-
from socket import getfqdn
116
from typing import (
127
Any,
138
Dict,
14-
Iterable,
159
List,
16-
MutableMapping,
1710
MutableSequence,
1811
Optional,
1912
Tuple,
@@ -23,9 +16,7 @@
2316

2417
from prov.identifier import Identifier
2518
from prov.model import PROV, PROV_LABEL, PROV_TYPE, PROV_VALUE, ProvDocument, ProvEntity
26-
from schema_salad.sourceline import SourceLine
27-
from typing_extensions import TYPE_CHECKING
28-
from tools.load_ga_export import load_ga_history_export, GalaxyJob, GalaxyDataset
19+
from tools.load_ga_export import load_ga_history_export, GalaxyJob
2920
from ast import literal_eval
3021
import os
3122

@@ -36,16 +27,12 @@
3627
from rocrate.provenance_constants import (
3728
ACCOUNT_UUID,
3829
CWLPROV,
39-
ENCODING,
40-
FOAF,
4130
METADATA,
4231
ORE,
4332
PROVENANCE,
4433
RO,
4534
SCHEMA,
4635
SHA1,
47-
SHA256,
48-
TEXT_PLAIN,
4936
UUID,
5037
WF4EVER,
5138
WFDESC,
@@ -59,15 +46,17 @@
5946
# from rocrate.provenance import ResearchObject
6047

6148
from pathlib import Path
62-
import rocrate.rocrate as roc
49+
6350

6451
def posix_path(local_path: str) -> str:
6552
return str(PurePosixPath(Path(local_path)))
6653

54+
6755
def remove_escapes(s):
6856
escapes = ''.join([chr(char) for char in range(1, 32)])
6957
translator = str.maketrans('', '', escapes)
70-
t = s.translate(translator)
58+
s.translate(translator)
59+
7160

7261
def reassign(d):
7362
for k, v in d.items():
@@ -78,16 +67,17 @@ def reassign(d):
7867
except ValueError:
7968
pass
8069

70+
8171
class ProvenanceProfile:
82-
"""
72+
"""\
8373
Provenance profile.
8474

8575
Populated from a galaxy workflow export.
8676
"""
8777

8878
def __init__(
8979
self,
90-
ga_export: Dict,
80+
ga_export: Dict,
9181
full_name: str = None,
9282
orcid: str = None,
9383
# prov_name: str = None,
@@ -112,12 +102,11 @@ def __init__(
112102
self.base_uri = "arcp://uuid,%s/" % self.ro_uuid
113103
self.document = ProvDocument()
114104
# TODO extract engine_uuid from galaxy, type: str
115-
self.engine_uuid = "urn:uuid:%s" % uuid.uuid4() #type: str
105+
self.engine_uuid = "urn:uuid:%s" % uuid.uuid4() # type: str
116106
self.full_name = full_name
117107
self.workflow_run_uuid = run_uuid or uuid.uuid4()
118108
self.workflow_run_uri = self.workflow_run_uuid.urn # type: str
119-
120-
# move to separate function
109+
# move to separate function
121110
metadata_export = load_ga_history_export(ga_export)
122111
self.generate_prov_doc()
123112
self.jobs = []
@@ -153,7 +142,7 @@ def generate_prov_doc(self) -> Tuple[str, ProvDocument]:
153142
# PROV_TYPE: FOAF["OnlineAccount"],
154143
# TODO: change how we register galaxy version, probably a declare_version func
155144
# self.galaxy_version = self.ga_export["jobs_attrs"][0]["galaxy_version"]
156-
# TODO: change notation to already imported namespaces?
145+
# TODO: change notation to already imported namespaces?
157146
self.document.add_namespace("wfprov", "http://purl.org/wf4ever/wfprov#")
158147
# document.add_namespace('prov', 'http://www.w3.org/ns/prov#')
159148
self.document.add_namespace("wfdesc", "http://purl.org/wf4ever/wfdesc#")
@@ -176,7 +165,7 @@ def generate_prov_doc(self) -> Tuple[str, ProvDocument]:
176165
"provenance", self.base_uri + posix_path(PROVENANCE) + "/"
177166
)
178167
# TODO: use appropriate refs for ga_export and related inputs
179-
ro_identifier_workflow = self.base_uri + "ga_export" + "/"
168+
ro_identifier_workflow = self.base_uri + "ga_export" + "/"
180169
self.wf_ns = self.document.add_namespace("wf", ro_identifier_workflow)
181170
ro_identifier_input = (
182171
self.base_uri + "ga_export/datasets#"
@@ -240,15 +229,15 @@ def declare_process(
240229
"""Record the start of each Process."""
241230
if process_run_id is None:
242231
process_run_id = uuid.uuid4().urn
243-
244-
cmd = ga_export_jobs_attrs["command_line"]
232+
233+
# cmd = ga_export_jobs_attrs["command_line"]
245234
process_name = ga_export_jobs_attrs["tool_id"]
246235
tool_version = ga_export_jobs_attrs["tool_version"]
247236
prov_label = "Run of " + process_name
248237
start_time = ga_export_jobs_attrs["create_time"]
249238
end_time = ga_export_jobs_attrs["update_time"]
250239

251-
#TODO: Find out how to include commandline as a string
240+
# TODO: Find out how to include commandline as a string
252241
# cmd = self.document.entity(
253242
# uuid.uuid4().urn,
254243
# {PROV_TYPE: WFPROV["Artifact"], PROV_LABEL: ga_export_jobs_attrs["command_line"]}
@@ -259,9 +248,9 @@ def declare_process(
259248
start_time,
260249
end_time,
261250
{
262-
PROV_TYPE: WFPROV["ProcessRun"],
263-
PROV_LABEL: prov_label,
264-
#TODO: Find out how to include commandline as a string
251+
PROV_TYPE: WFPROV["ProcessRun"],
252+
PROV_LABEL: prov_label,
253+
# TODO: Find out how to include commandline as a string
265254
# PROV_LABEL: cmd
266255
},
267256
)
@@ -289,7 +278,7 @@ def used_artefacts(
289278
base += "/" + process_name
290279
tool_id = process_metadata["tool_id"]
291280
base += "/" + tool_id
292-
items = ["inputs","outputs","parameters"]
281+
items = ["inputs", "outputs", "parameters"]
293282
# print(process_metadata["params"])
294283
for item in items:
295284
# print(item)
@@ -317,7 +306,6 @@ def used_artefacts(
317306

318307
# for artefact in value:
319308
try:
320-
# pdb.set_trace()
321309
entity = self.declare_artefact(value)
322310
self.document.used(
323311
process_run_id,
@@ -356,7 +344,7 @@ def declare_artefact(self, value: Any) -> ProvEntity:
356344
# byte_s = BytesIO(value)
357345
# data_file = self.research_object.add_data_file(byte_s)
358346
# FIXME: Don't naively assume add_data_file uses hash in filename!
359-
data_id = "data:%s" % str(value) #PurePosixPath(data_file).stem
347+
data_id = "data:%s" % str(value) # PurePosixPath(data_file).stem
360348
return self.document.entity(
361349
data_id,
362350
{PROV_TYPE: WFPROV["Artifact"], PROV_VALUE: str(value)},
@@ -394,7 +382,7 @@ def declare_artefact(self, value: Any) -> ProvEntity:
394382
)
395383

396384
if value.get("class"):
397-
#_logger.warning("Unknown data class %s.", value["class"])
385+
# _logger.warning("Unknown data class %s.", value["class"])
398386
# FIXME: The class might be "http://example.com/somethingelse"
399387
coll.add_asserted_type(CWLPROV[value["class"]])
400388

@@ -404,7 +392,7 @@ def declare_artefact(self, value: Any) -> ProvEntity:
404392
# clean up unwanted characters
405393
if isinstance(key, str):
406394
key = key.replace("|", "_")
407-
if isinstance(val, str):
395+
if isinstance(val, str):
408396
val = val.replace("|", "_")
409397

410398
v_ent = self.declare_artefact(val)
@@ -451,7 +439,7 @@ def declare_artefact(self, value: Any) -> ProvEntity:
451439
# FIXME: list value does not support adding "@id"
452440
return coll
453441
except TypeError:
454-
#_logger.warning("Unrecognized type %s of %r", type(value), value)
442+
# _logger.warning("Unrecognized type %s of %r", type(value), value)
455443
# Let's just fall back to Python repr()
456444
entity = self.document.entity(uuid.uuid4().urn, {PROV_LABEL: repr(value)})
457445
# self.research_object.add_uri(entity.identifier.uri)
@@ -466,7 +454,7 @@ def declare_file(self, value: Dict) -> Tuple[ProvEntity, ProvEntity, str]:
466454
if "checksum" in value:
467455
csum = cast(str, value["checksum"])
468456
(method, checksum) = csum.split("$", 1)
469-
if method == SHA1: # and self.research_object.has_data_file(checksum):
457+
if method == SHA1: # and self.research_object.has_data_file(checksum):
470458
entity = self.document.entity("data:" + checksum)
471459

472460
if not entity and "location" in value:
@@ -513,8 +501,8 @@ def declare_file(self, value: Dict) -> Tuple[ProvEntity, ProvEntity, str]:
513501

514502
# Check for secondaries
515503
for sec in cast(
516-
# MutableSequence[CWLObjectType],
517-
value.get("secondaryFiles", [])
504+
# MutableSequence[CWLObjectType],
505+
value.get("secondaryFiles", []) # noqa
518506
):
519507
# TODO: Record these in a specializationOf entity with UUID?
520508
if sec["class"] == "File":
@@ -535,8 +523,10 @@ def declare_file(self, value: Dict) -> Tuple[ProvEntity, ProvEntity, str]:
535523

536524
return file_entity, entity, checksum
537525

538-
def declare_directory(self
539-
# , value: CWLObjectType
526+
def declare_directory(
527+
self,
528+
# value: CWLObjectType
529+
value
540530
) -> ProvEntity:
541531
"""Register any nested files/directories."""
542532
# FIXME: Calculate a hash-like identifier for directory
@@ -647,12 +637,11 @@ def declare_string(self, value: str) -> Tuple[ProvEntity, str]:
647637
# checksum = PurePosixPath(data_file).name
648638
# FIXME: Don't naively assume add_data_file uses hash in filename!
649639
value = str(value).replace("|", "_")
650-
data_id = "data:%s" % str(value) #PurePosixPath(data_file).stem
640+
data_id = "data:%s" % str(value) # PurePosixPath(data_file).stem
651641
entity = self.document.entity(
652642
data_id, {PROV_TYPE: WFPROV["Artifact"], PROV_VALUE: str(value)}
653643
) # type: ProvEntity
654-
return entity #, checksum
655-
644+
return entity # , checksum
656645

657646
def generate_output_prov(
658647
self,
@@ -735,7 +724,7 @@ def activity_has_provenance(self, activity, prov_ids):
735724
self.document.activity(activity, other_attributes=attribs)
736725
# Tip: we can't use https://www.w3.org/TR/prov-links/#term-mention
737726
# as prov:mentionOf() is only for entities, not activities
738-
uris = [i.uri for i in prov_ids]
727+
# uris = [i.uri for i in prov_ids]
739728
# self.research_object.add_annotation(activity, uris, PROV["has_provenance"].uri)
740729

741730
def finalize_prov_profile(self, name=None, out_path=None):
@@ -770,7 +759,7 @@ def finalize_prov_profile(self, name=None, out_path=None):
770759

771760
# https://www.w3.org/TR/prov-xml/
772761
# serialized_prov_docs["xml"] = self.document.serialize(format="xml", indent=4)
773-
prov_ids.append(self.provenance_ns[filename + ".xml"])
762+
prov_ids.append(self.provenance_ns[filename + ".xml"])
774763
with open(basename + ".xml", "w") as provenance_file:
775764
self.document.serialize(provenance_file, format="xml", indent=4)
776765

@@ -779,7 +768,6 @@ def finalize_prov_profile(self, name=None, out_path=None):
779768
prov_ids.append(self.provenance_ns[filename + ".provn"])
780769
with open(basename + ".provn", "w") as provenance_file:
781770
self.document.serialize(provenance_file, format="provn", indent=2)
782-
783771

784772
# https://www.w3.org/Submission/prov-json/
785773
# serialized_prov_docs["json"] = self.document.serialize(format="json", indent=2)
@@ -810,7 +798,6 @@ def finalize_prov_profile(self, name=None, out_path=None):
810798
prov_ids.append(self.provenance_ns[filename + ".jsonld"])
811799
with open(basename + ".jsonld", "w") as provenance_file:
812800
self.document.serialize(provenance_file, format="rdf", rdf_format="json-ld")
813-
814801

815-
#_logger.debug("[provenance] added provenance: %s", prov_ids)
802+
# _logger.debug("[provenance] added provenance: %s", prov_ids)
816803
return (serialized_prov_docs, prov_ids)

rocrate/rocrate_api.py

+2-2
Original file line number | Diff line number | Diff line change
@@ -18,7 +18,6 @@
1818
# limitations under the License.
1919

2020
from pathlib import Path
21-
import os
2221

2322
import rocrate.rocrate as roc
2423
from rocrate.provenance_profile import ProvenanceProfile
@@ -79,6 +78,7 @@ def make_workflow_rocrate(workflow_path, wf_type, include_files=[],
7978

8079
return wf_crate
8180

81+
8282
# WIP
8383
def make_workflow_run_rocrate(workflow_path, wf_type, wfr_metadata_path, author=None, orcid=None,
8484
include_files=[], fetch_remote=False, prov_name=None, prov_path=None, cwl=None, diagram=None):
@@ -110,4 +110,4 @@ def make_workflow_run_rocrate(workflow_path, wf_type, wfr_metadata_path, author=
110110
for file_entry in include_files:
111111
wfr_crate.add_file(file_entry)
112112

113-
return wfr_crate
113+
return wfr_crate

0 commit comments

Comments
 (0)