Skip to content

WIP: Galaxy workflow-run export to ro-crate including provenance #118

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 31 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
86e827e
draft add_workflow_run function
pauldg Jan 27, 2022
5d291a4
first functions and tests for parsing a galaxy export and extracting …
pauldg Feb 22, 2022
c4378f4
add process parsing functionc
pauldg Feb 22, 2022
df02655
added functions for parsing history export
pauldg Feb 23, 2022
5bc6856
added tests and improved prov extraction
pauldg Mar 3, 2022
e210bfb
Merge branch 'ResearchObject:master' into ga-model-store
pauldg Mar 7, 2022
2a2f0c3
changes
pauldg Mar 7, 2022
efcbb95
updated provenance_profile.py
pauldg Mar 29, 2022
0e32c41
updated provenance_profile tests
pauldg Mar 30, 2022
a06d9ad
updates to prov extraction
pauldg Mar 31, 2022
d1b3f48
added section about galaxy workflow-run export to README
pauldg Mar 31, 2022
d8efff7
added generated ro-crate to test data
pauldg Apr 4, 2022
68c9581
removed local path from test
pauldg Apr 4, 2022
81b1e1e
fix requirements and pep8
simleo Apr 5, 2022
52d4d95
Merge branch 'ResearchObject:master' into ga-model-store
pauldg Apr 14, 2022
dac39ad
small updates to metadata parsing
pauldg Apr 15, 2022
3e85bf5
fixed whitespaces
pauldg Apr 15, 2022
642c3e1
added dataset registration in prov doc
pauldg Apr 26, 2022
af8eac2
resolved flake8 errors
pauldg Apr 27, 2022
bb9675a
updates to prov creation
pauldg Jun 23, 2022
350433d
example ro-crate notebook
pauldg Jun 23, 2022
a553d25
resolve conflicts with master
pauldg Jun 23, 2022
73324f5
resolved more conflicts
pauldg Jun 23, 2022
cf41846
added add_workflow_run_ro_crate function to rocrate.py
pauldg Jun 23, 2022
53d6c8d
included pydot and graphviz to requirements
pauldg Jun 23, 2022
b2c3ffb
reformat using black, exclude prov graph creation (to avoid installin…
pauldg Jun 29, 2022
2d90972
Merge branch 'ResearchObject:master' into ga-model-store
pauldg Sep 1, 2022
efe25a8
simplified output provenance
pauldg Sep 5, 2022
af2ffb8
resolved flake8 issues
pauldg Sep 5, 2022
e4ea3b3
first cleanup of code
pauldg Sep 9, 2022
791ba5c
second cleanup of code
pauldg Sep 13, 2022
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -315,6 +315,27 @@ rocrate add test-instance test1 http://example.com -r jobs -i test1_1
rocrate add test-definition test1 test/test1/sort-and-change-case-test.yml -e planemo -v '>=0.70'
```

Exporting a Galaxy workflow-run export to an RO-Crate:

```python
import os

from rocrate import rocrate_api

# Directory that holds the Galaxy workflow-run export. The paths below are
# built by plain string concatenation, so keep the trailing slash.
base_path = "/path/to/galaxy-exports/"

wf_path = base_path + "example-history-export3.ga"
dataset_path = base_path + "example-history-export3/datasets/"
wfr_metadata_path = base_path + "example-history-export3"
# Every file found in the datasets folder is included in the crate
files_list = [dataset_path + f for f in os.listdir(dataset_path)]

# Create base package
wf_crate = rocrate_api.make_workflow_run_rocrate(
    workflow_path=wf_path,
    wfr_metadata_path=wfr_metadata_path,
    author=None,
    orcid=None,
    wf_type="Galaxy",
    include_files=files_list,
    prov_name="test_prov",
)

# write crate to disk
out_path = base_path + "example-history-export3-crate-new"
wf_crate.write(out_path)
```


## License

Expand Down
160 changes: 160 additions & 0 deletions notebooks/ROcrate-example.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,160 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"sys.path.append('/home/padge/Elixir/workflow-export/ro-crate-py/')\n",
"import os\n",
"from rocrate.rocrate import ROCrate\n",
"base_path = '/home/padge/Elixir/workflow-export/'"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'/home/padge/Elixir/workflow-export/ro-crate-py/notebooks'"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"os.getcwd()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# adding a Dataset\n",
"crate = ROCrate()\n",
"os.makedirs(\"tmp3\", exist_ok=True)\n",
"open('tmp3/empty_file', 'w').close()\n",
"dataset_entity = crate.add_directory(source=\"tmpd3\", dest_path=\"new_tmp\")\n",
"crate.write(\"./new_crate4\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from IPython.core.debugger import set_trace;set_trace()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"SRR10903401\n",
"FooBar\n"
]
}
],
"source": [
"shutil.rmtree(\"/home/padge/Downloads/Elixir/ro-crate-py/tmp/crate\")\n",
"shutil.copytree(\"/home/padge/Downloads/Elixir/ro-crate-py/test/test-data/read_crate\", \"/home/padge/Downloads/Elixir/ro-crate-py/tmp/crate\")\n",
"crate = ROCrate(\"/home/padge/Downloads/Elixir/ro-crate-py/tmp/crate\")\n",
"readme = crate.dereference(\"test_file_galaxy.txt\")\n",
"with open(readme.source, \"rt\") as f:\n",
" print(f.readline().strip())\n",
"new_source = \"/home/padge/Downloads/Elixir/ro-crate-py/tmp/foobar.txt\"\n",
"with open(new_source, \"wt\") as f:\n",
" f.write(\"FooBar\\n\")\n",
"crate.delete(readme)\n",
"crate.add_file(new_source, \"test_file_galaxy.txt\")\n",
"crate.write(\"/home/padge/Downloads/Elixir/ro-crate-py/tmp/crate\")\n",
"with open(readme.source, \"rt\") as f:\n",
" print(f.readline().strip())\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# import pdb\n",
"from IPython.core.debugger import set_trace\n",
"set_trace()\n",
"\n",
"# adding a Dataset\n",
"crate = ROCrate(\"/home/padge/Downloads/Elixir/ro-crate-py/tmp/crate/\")\n",
"sample_dir = '/home/padge/Downloads/Elixir/ro-crate-py/tmp/examples2/'\n",
"dataset_entity = crate.add_directory(sample_dir, \"new_dir\")\n",
"crate.write(\"/home/padge/Downloads/Elixir/ro-crate-py/tmp/crate2\")\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"from rocrate import rocrate_api\n",
"\n",
"wf_path = base_path + \"example-history-export3.ga\"\n",
"dataset_path = base_path + \"example-history-export3/datasets/\"\n",
"files_list = os.listdir(dataset_path)\n",
"files_list = [dataset_path + f for f in files_list]\n",
"\n",
"# Create base package\n",
"wf_crate = rocrate_api.make_workflow_rocrate(workflow_path=wf_path,wf_type=\"Galaxy\",include_files=files_list)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"# write crate to disk\n",
"out_path = base_path + \"example-history-export3-crate\"\n",
"wf_crate.write(out_path)"
]
}
],
"metadata": {
"interpreter": {
"hash": "bcc799c17010608b26cede4750872a7cf236af662c7ba3d883e539f7db8da28c"
},
"kernelspec": {
"display_name": "Python 3.9.5 64-bit ('base': conda)",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.10"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}
3 changes: 3 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,6 @@ galaxy2cwl
jinja2
python-dateutil
click
prov
typing-extensions
pydot
55 changes: 55 additions & 0 deletions rocrate/provenance_constants.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import hashlib
import os
import uuid

from prov.identifier import Namespace

__citation__ = "https://doi.org/10.5281/zenodo.1208477"

# Version of the CWLProv Research Object **and** of the cwlprov files.
# It is semantically versioned (major.minor.patch); as a rough guide:
#   * major - resources or PROV statements were removed or otherwise broken
#   * minor - resources or PROV statements were added
#   * patch - nothing was added or broken, e.g. a broken relative path
#             was fixed
CWLPROV_VERSION = "https://w3id.org/cwl/prov/0.6.0"

# Top-level folders of the Research Object
METADATA = "metadata"
DATA = "data"
WORKFLOW = "workflow"
SNAPSHOT = "snapshot"

# Sub-folders, nested inside the top-level folders above
MAIN = os.path.join(WORKFLOW, "main")
PROVENANCE = os.path.join(METADATA, "provenance")
LOGS = os.path.join(METADATA, "logs")

# Namespaces: each one pairs a short prefix with its base URI
WFDESC = Namespace("wfdesc", "http://purl.org/wf4ever/wfdesc#")
WFPROV = Namespace("wfprov", "http://purl.org/wf4ever/wfprov#")
WF4EVER = Namespace("wf4ever", "http://purl.org/wf4ever/wf4ever#")
RO = Namespace("ro", "http://purl.org/wf4ever/ro#")
ORE = Namespace("ore", "http://www.openarchives.org/ore/terms/")
FOAF = Namespace("foaf", "http://xmlns.com/foaf/0.1/")
SCHEMA = Namespace("schema", "http://schema.org/")
CWLPROV = Namespace("cwlprov", "https://w3id.org/cwl/prov#")
ORCID = Namespace("orcid", "https://orcid.org/")
UUID = Namespace("id", "urn:uuid:")

# UTF-8 is always the encoding used by BagIt and YAML
ENCODING = "UTF-8"
TEXT_PLAIN = 'text/plain; charset="{}"'.format(ENCODING)

# Default hash function is sha1, which is compatible with the "checksum"
# field of the File type,
# e.g. "checksum" = "sha1$47a013e660d408619d894b20806b1d5086aab03b"
# (see ./cwltool/schemas/v1.0/Process.yml)
Hasher = hashlib.sha1
# Names of the hash algorithms
SHA1 = "sha1"
SHA256 = "sha256"
SHA512 = "sha512"

# Random UUID URNs, generated anew each time this module is imported.
# TODO: Better identifiers for user, at least
# these should be preserved in ~/.config/cwl for every execution
# on this host
USER_UUID = uuid.uuid4().urn
ACCOUNT_UUID = uuid.uuid4().urn
Loading