@@ -1,19 +1,12 @@
-import copy
-import pdb
 import datetime
-import logging
 import urllib
 import uuid
 import json
-from io import BytesIO
 from pathlib import PurePath, PurePosixPath
-from socket import getfqdn
 from typing import (
     Any,
     Dict,
-    Iterable,
     List,
-    MutableMapping,
     MutableSequence,
     Optional,
     Tuple,
@@ -23,9 +16,7 @@
 
 from prov.identifier import Identifier
 from prov.model import PROV, PROV_LABEL, PROV_TYPE, PROV_VALUE, ProvDocument, ProvEntity
-from schema_salad.sourceline import SourceLine
-from typing_extensions import TYPE_CHECKING
-from tools.load_ga_export import load_ga_history_export, GalaxyJob, GalaxyDataset
+from tools.load_ga_export import load_ga_history_export, GalaxyJob
 from ast import literal_eval
 import os
 
@@ -36,16 +27,12 @@
 from rocrate.provenance_constants import (
     ACCOUNT_UUID,
     CWLPROV,
-    ENCODING,
-    FOAF,
     METADATA,
     ORE,
     PROVENANCE,
     RO,
     SCHEMA,
     SHA1,
-    SHA256,
-    TEXT_PLAIN,
     UUID,
     WF4EVER,
     WFDESC,
@@ -59,15 +46,17 @@
 # from rocrate.provenance import ResearchObject
 
 from pathlib import Path
-import rocrate.rocrate as roc
+
 
 def posix_path(local_path: str) -> str:
     return str(PurePosixPath(Path(local_path)))
 
+
 def remove_escapes(s):
     escapes = ''.join([chr(char) for char in range(1, 32)])
     translator = str.maketrans('', '', escapes)
-    t = s.translate(translator)
+    s.translate(translator)
+
 
 def reassign(d):
     for k, v in d.items():
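
For clarity, here is a standalone sketch of the control-character stripping that `remove_escapes` performs (not part of the commit; the assert is illustrative). Note that `str.translate` returns a new string rather than modifying `s` in place, so the result has to be captured or returned for the call to have any effect:

def remove_escapes(s: str) -> str:
    # Build a translation table that deletes ASCII control characters 1-31.
    escapes = ''.join(chr(c) for c in range(1, 32))
    translator = str.maketrans('', '', escapes)
    # translate() returns a new string; it must be returned (or assigned).
    return s.translate(translator)

assert remove_escapes("a\tb\x1fc") == "abc"
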
@@ -78,16 +67,17 @@ def reassign(d):
             except ValueError:
                 pass
 
+
 class ProvenanceProfile:
-    """
+    """\
     Provenance profile.
 
     Populated from a galaxy workflow export.
     """
 
     def __init__(
         self,
-        ga_export: Dict, 
+        ga_export: Dict,
         full_name: str = None,
         orcid: str = None,
         # prov_name: str = None,
@@ -112,12 +102,11 @@ def __init__(
         self.base_uri = "arcp://uuid,%s/" % self.ro_uuid
         self.document = ProvDocument()
         # TODO extract engine_uuid from galaxy, type: str
-        self.engine_uuid = "urn:uuid:%s" % uuid.uuid4() #type: str
+        self.engine_uuid = "urn:uuid:%s" % uuid.uuid4()  # type: str
         self.full_name = full_name
         self.workflow_run_uuid = run_uuid or uuid.uuid4()
         self.workflow_run_uri = self.workflow_run_uuid.urn  # type: str
-
-        # move to separate function
+        # move to separate function
         metadata_export = load_ga_history_export(ga_export)
         self.generate_prov_doc()
         self.jobs = []
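
As a side note on the engine and run identifiers above: `uuid.uuid4().urn` and the `"urn:uuid:%s"` formatting produce the same URN form, as this standard-library-only sketch shows:

import uuid

run_id = uuid.uuid4()
# The .urn property is equivalent to manual "urn:uuid:%s" formatting.
assert run_id.urn == "urn:uuid:%s" % run_id
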
@@ -153,7 +142,7 @@ def generate_prov_doc(self) -> Tuple[str, ProvDocument]:
         # PROV_TYPE: FOAF["OnlineAccount"],
         # TODO: change how we register galaxy version, probably a declare_version func
         # self.galaxy_version = self.ga_export["jobs_attrs"][0]["galaxy_version"]
-        # TODO: change notation to already imported namespaces? 
+        # TODO: change notation to already imported namespaces?
         self.document.add_namespace("wfprov", "http://purl.org/wf4ever/wfprov#")
         # document.add_namespace('prov', 'http://www.w3.org/ns/prov#')
         self.document.add_namespace("wfdesc", "http://purl.org/wf4ever/wfdesc#")
@@ -176,7 +165,7 @@ def generate_prov_doc(self) -> Tuple[str, ProvDocument]:
             "provenance", self.base_uri + posix_path(PROVENANCE) + "/"
         )
         # TODO: use appropriate refs for ga_export and related inputs
-        ro_identifier_workflow = self.base_uri + "ga_export" + "/" 
+        ro_identifier_workflow = self.base_uri + "ga_export" + "/"
         self.wf_ns = self.document.add_namespace("wf", ro_identifier_workflow)
         ro_identifier_input = (
             self.base_uri + "ga_export/datasets#"
@@ -240,15 +229,15 @@ def declare_process(
         """Record the start of each Process."""
         if process_run_id is None:
             process_run_id = uuid.uuid4().urn
-
-        cmd = ga_export_jobs_attrs["command_line"]
+
+        # cmd = ga_export_jobs_attrs["command_line"]
         process_name = ga_export_jobs_attrs["tool_id"]
         tool_version = ga_export_jobs_attrs["tool_version"]
         prov_label = "Run of " + process_name
         start_time = ga_export_jobs_attrs["create_time"]
         end_time = ga_export_jobs_attrs["update_time"]
 
-        #TODO: Find out how to include commandline as a string
+        # TODO: Find out how to include commandline as a string
         # cmd = self.document.entity(
         #     uuid.uuid4().urn,
         #     {PROV_TYPE: WFPROV["Artifact"], PROV_LABEL: ga_export_jobs_attrs["command_line"]}
@@ -259,9 +248,9 @@ def declare_process(
             start_time,
             end_time,
             {
-                PROV_TYPE: WFPROV["ProcessRun"],
-                PROV_LABEL: prov_label,
-                #TODO: Find out how to include commandline as a string
+                PROV_TYPE: WFPROV["ProcessRun"],
+                PROV_LABEL: prov_label,
+                # TODO: Find out how to include commandline as a string
                 # PROV_LABEL: cmd
             },
         )
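
The `document.activity(...)` call above follows the `prov` package signature `activity(identifier, startTime, endTime, other_attributes)`. A self-contained sketch, with made-up identifier and times:

import uuid
from datetime import datetime, timezone

from prov.model import ProvDocument, PROV_LABEL

doc = ProvDocument()
doc.add_namespace("id", "urn:uuid:")  # lets urn:uuid:* strings resolve as identifiers
doc.activity(
    "urn:uuid:%s" % uuid.uuid4(),   # process_run_id
    datetime.now(timezone.utc),     # start_time
    datetime.now(timezone.utc),     # end_time
    {PROV_LABEL: "Run of example_tool"},
)
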
@@ -289,7 +278,7 @@ def used_artefacts(
         base += "/" + process_name
         tool_id = process_metadata["tool_id"]
         base += "/" + tool_id
-        items = ["inputs","outputs","parameters"]
+        items = ["inputs", "outputs", "parameters"]
         # print(process_metadata["params"])
         for item in items:
             # print(item)
@@ -317,7 +306,6 @@ def used_artefacts(
 
                 # for artefact in value:
                 try:
-                    # pdb.set_trace()
                     entity = self.declare_artefact(value)
                     self.document.used(
                         process_run_id,
@@ -356,7 +344,7 @@ def declare_artefact(self, value: Any) -> ProvEntity:
             # byte_s = BytesIO(value)
             # data_file = self.research_object.add_data_file(byte_s)
             # FIXME: Don't naively assume add_data_file uses hash in filename!
-            data_id = "data:%s" % str(value) # PurePosixPath(data_file).stem
+            data_id = "data:%s" % str(value)  # PurePosixPath(data_file).stem
             return self.document.entity(
                 data_id,
                 {PROV_TYPE: WFPROV["Artifact"], PROV_VALUE: str(value)},
@@ -394,7 +382,7 @@ def declare_artefact(self, value: Any) -> ProvEntity:
             )
 
             if value.get("class"):
-                #_logger.warning("Unknown data class %s.", value["class"])
+                # _logger.warning("Unknown data class %s.", value["class"])
                 # FIXME: The class might be "http://example.com/somethingelse"
                 coll.add_asserted_type(CWLPROV[value["class"]])
 
@@ -404,7 +392,7 @@ def declare_artefact(self, value: Any) -> ProvEntity:
                 # clean up unwanted characters
                 if isinstance(key, str):
                     key = key.replace("|", "_")
-                if isinstance(val, str): 
+                if isinstance(val, str):
                     val = val.replace("|", "_")
 
                 v_ent = self.declare_artefact(val)
@@ -451,7 +439,7 @@ def declare_artefact(self, value: Any) -> ProvEntity:
                 # FIXME: list value does not support adding "@id"
             return coll
         except TypeError:
-            #_logger.warning("Unrecognized type %s of %r", type(value), value)
+            # _logger.warning("Unrecognized type %s of %r", type(value), value)
             # Let's just fall back to Python repr()
             entity = self.document.entity(uuid.uuid4().urn, {PROV_LABEL: repr(value)})
             # self.research_object.add_uri(entity.identifier.uri)
@@ -466,7 +454,7 @@ def declare_file(self, value: Dict) -> Tuple[ProvEntity, ProvEntity, str]:
         if "checksum" in value:
             csum = cast(str, value["checksum"])
             (method, checksum) = csum.split("$", 1)
-            if method == SHA1: # and self.research_object.has_data_file(checksum):
+            if method == SHA1:  # and self.research_object.has_data_file(checksum):
                 entity = self.document.entity("data:" + checksum)
 
         if not entity and "location" in value:
@@ -513,8 +501,8 @@ def declare_file(self, value: Dict) -> Tuple[ProvEntity, ProvEntity, str]:
 
         # Check for secondaries
         for sec in cast(
-            # MutableSequence[CWLObjectType], 
-            value.get("secondaryFiles", [])
+            # MutableSequence[CWLObjectType],
+            value.get("secondaryFiles", [])  # noqa
         ):
             # TODO: Record these in a specializationOf entity with UUID?
             if sec["class"] == "File":
@@ -535,8 +523,10 @@ def declare_file(self, value: Dict) -> Tuple[ProvEntity, ProvEntity, str]:
 
         return file_entity, entity, checksum
 
-    def declare_directory(self
-                          # , value: CWLObjectType
+    def declare_directory(
+        self,
+        # value: CWLObjectType
+        value
     ) -> ProvEntity:
         """Register any nested files/directories."""
         # FIXME: Calculate a hash-like identifier for directory
@@ -647,12 +637,11 @@ def declare_string(self, value: str) -> Tuple[ProvEntity, str]:
         # checksum = PurePosixPath(data_file).name
         # FIXME: Don't naively assume add_data_file uses hash in filename!
         value = str(value).replace("|", "_")
-        data_id = "data:%s" % str(value) # PurePosixPath(data_file).stem
+        data_id = "data:%s" % str(value)  # PurePosixPath(data_file).stem
         entity = self.document.entity(
             data_id, {PROV_TYPE: WFPROV["Artifact"], PROV_VALUE: str(value)}
         )  # type: ProvEntity
-        return entity #, checksum
-
+        return entity  # , checksum
 
     def generate_output_prov(
         self,
@@ -735,7 +724,7 @@ def activity_has_provenance(self, activity, prov_ids):
         self.document.activity(activity, other_attributes=attribs)
         # Tip: we can't use https://www.w3.org/TR/prov-links/#term-mention
         # as prov:mentionOf() is only for entities, not activities
-        uris = [i.uri for i in prov_ids]
+        # uris = [i.uri for i in prov_ids]
         # self.research_object.add_annotation(activity, uris, PROV["has_provenance"].uri)
 
     def finalize_prov_profile(self, name=None, out_path=None):
@@ -770,7 +759,7 @@ def finalize_prov_profile(self, name=None, out_path=None):
 
         # https://www.w3.org/TR/prov-xml/
         # serialized_prov_docs["xml"] = self.document.serialize(format="xml", indent=4)
-        prov_ids.append(self.provenance_ns[filename + ".xml"]) 
+        prov_ids.append(self.provenance_ns[filename + ".xml"])
         with open(basename + ".xml", "w") as provenance_file:
             self.document.serialize(provenance_file, format="xml", indent=4)
 
@@ -779,7 +768,6 @@ def finalize_prov_profile(self, name=None, out_path=None):
         prov_ids.append(self.provenance_ns[filename + ".provn"])
         with open(basename + ".provn", "w") as provenance_file:
             self.document.serialize(provenance_file, format="provn", indent=2)
-
 
         # https://www.w3.org/Submission/prov-json/
         # serialized_prov_docs["json"] = self.document.serialize(format="json", indent=2)
@@ -810,7 +798,6 @@ def finalize_prov_profile(self, name=None, out_path=None):
         prov_ids.append(self.provenance_ns[filename + ".jsonld"])
         with open(basename + ".jsonld", "w") as provenance_file:
             self.document.serialize(provenance_file, format="rdf", rdf_format="json-ld")
-
 
-        #_logger.debug("[provenance] added provenance: %s", prov_ids)
+        # _logger.debug("[provenance] added provenance: %s", prov_ids)
         return (serialized_prov_docs, prov_ids)
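
For reference, a condensed sketch of the multi-format serialization that `finalize_prov_profile` performs with the `prov` package (document content and file names are illustrative):

from prov.model import ProvDocument

doc = ProvDocument()
doc.add_namespace("ex", "http://example.org/")
doc.entity("ex:artifact", {"prov:label": "example"})

as_json = doc.serialize(format="json", indent=2)  # returns a PROV-JSON string
with open("profile.xml", "w") as f:
    doc.serialize(f, format="xml", indent=4)      # PROV-XML
with open("profile.provn", "w") as f:
    doc.serialize(f, format="provn")              # PROV-N
with open("profile.jsonld", "w") as f:
    doc.serialize(f, format="rdf", rdf_format="json-ld")  # JSON-LD via rdflib
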