Skip to content
Open
Show file tree
Hide file tree
Changes from 14 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
822320e
Auto-create sandbox for tests if it is not already there.
gsaksena Oct 4, 2017
44ffe88
back out changes for first time directory creation
gsaksena Oct 4, 2017
b4e9f8e
Merge branch 'master' into gsaksena_mmd2
gsaksena Oct 4, 2017
cbaafd7
updates for atomic mirror file saving
gsaksena Oct 5, 2017
518409f
removed metadata-based mirror caching, added initial status aggregation
gsaksena Oct 5, 2017
9ce8811
propagate mirror status to top
gsaksena Oct 5, 2017
e2fb22b
tweak top level error reporting for gdc_mirror
gsaksena Oct 10, 2017
7da50d1
make dicer data file writes atomic, as a responsibility of the converter
gsaksena Oct 10, 2017
3ec8fa1
factor out file naming magic from dice converters
gsaksena Oct 10, 2017
be4f31c
debug previous checkin
gsaksena Oct 10, 2017
855d011
add error tolerance and aggregate reporting to gdc_dice
gsaksena Oct 10, 2017
cf9b5f8
coded changes to set datestamp the new way
gsaksena Oct 11, 2017
f9e3e9d
add code to optionally append mirror metadata
gsaksena Oct 12, 2017
2bf501b
fix --append mode, properly interleave mirror metadata
gsaksena Oct 18, 2017
7b8e95c
updated help docs
gsaksena Oct 28, 2017
2557ad5
tweaked docs
gsaksena Oct 28, 2017
2155b01
improve merge mirror metadata semantics
gsaksena Oct 28, 2017
03fe15d
make unrecognized datatypes give more helpful error msg
gsaksena Oct 29, 2017
613c77c
add 3 new maf datatypes, update tests
gsaksena Oct 29, 2017
4f4f39e
raise exception at end of failing mirror and dice
gsaksena Oct 29, 2017
57b3c7d
add test that dicer actually returns an error
gsaksena Oct 29, 2017
47c5ff1
add test_pool
gsaksena Oct 29, 2017
2949a39
add new tests to top level target
gsaksena Oct 29, 2017
ecf5ecf
delete commented out code
gsaksena Oct 29, 2017
b61aa36
add duplicate file detection, and first write metadata to .partial fo…
gsaksena Nov 6, 2017
6566b16
fix test_pool for case where pool_sandbox dir wasn't left over from l…
gsaksena Nov 6, 2017
97886da
add error message when duplicate input files detected
gsaksena Nov 6, 2017
1c08992
merge from master, update baseline files to make tests pass
gsaksena Nov 6, 2017
4653128
gsaksena_mmd2 branch: update test baselines for latest GDC data release
Jan 17, 2018
003518b
adding additional .cfg files for tests
gsaksena Jan 17, 2018
40a5524
remove leftover header comments that do not apply here, to avoid misl…
Jan 17, 2018
96f9b32
ensure that pool and corrupted dicing sandbox areas are also cleaned up
Jan 17, 2018
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
61 changes: 50 additions & 11 deletions gdctools/GDCtool.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,23 +57,64 @@ def execute(self):
self.config_customize()
self.config_finalize()

#TODO perhaps refactor for better encapsulation, moving part to
# GDCtool.config_initialize() and part to gdc_mirror.config_customize().
# Though it is nice to have all the logic for setting datestamp in one place.
#TODO variable renaming - datestamp_required and datestamp, to make them reflect current usage
datestamp = self.options.datestamp
if self.datestamp_required:
datestamp = self.options.datestamp
if not datestamp:
datestamp = 'latest'
#non-gdc_mirror case

existing_dates = self.datestamps() # ascending sort order
if len(existing_dates) == 0:
raise ValueError("No datestamps found, use upstream tool first")

if not datestamp:
#default value = 'latest'
datestamp = 'latest'

if datestamp == 'latest':
datestamp = existing_dates[-1]
# find last datestamp in existing_dates that is in date format
for d in reversed(existing_dates):
if common.DATESTAMP_REGEX.match(d) is not None:
datestamp = d
break
else:
#TODO make this error message more helpful
raise ValueError("Looking for latest datestamp, but no datestamps found in correct format")
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This actually means that there are no dates whatsoever

elif datestamp not in existing_dates:
raise ValueError("Requested datestamp not present in "
+ self.config.datestamps + "\n"
+ "Existing datestamps: " + repr(existing_dates))
else:
datestamp = time.strftime('%Y_%m_%d', time.localtime())
#gdc_mirror case
if not datestamp:
# default value = today's datestamp
datestamp = common.datestamp()
elif datestamp == 'pool':
pass
else:
            #other values — such as <yyyy-mm-dd>, 'latest', arbitrary tags, and everything else — are not allowed
raise ValueError("For gdc_mirror, date must be blank or 'pool'")
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I thought the plan was to allow user-defined tags, and that anything besides 'latest' or a datestamp would work like 'pool'

Copy link
Contributor Author

@gsaksena gsaksena Oct 28, 2017

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Dropped other user defined tags because they are not needed to meet the requirement. YAGNI. Easy to add later, makes the code simpler and easier to document for today.


# TODO remove this old code **gs**
# if self.datestamp_required:
# datestamp = self.options.datestamp
# if not datestamp:
# datestamp = 'latest'

# existing_dates = self.datestamps() # ascending sort order
# if len(existing_dates) == 0:
# raise ValueError("No datestamps found, use upstream tool first")

# if datestamp == 'latest':
# datestamp = existing_dates[-1]
# elif datestamp not in existing_dates:
# raise ValueError("Requested datestamp not present in "
# + self.config.datestamps + "\n"
# + "Existing datestamps: " + repr(existing_dates))
# else:
# datestamp = time.strftime('%Y_%m_%d', time.localtime())

self.datestamp = datestamp
self.init_logging()
Expand All @@ -98,12 +139,10 @@ def config_add_args(self):
cli = self.cli
cli.add_argument('--config', nargs='+', type=argparse.FileType('r'),
help='One or more configuration files')

if self.datestamp_required:
cli.add_argument('--date', nargs='?', dest='datestamp',
help='Use data from a given dated version (snapshot) of '
'GDC data, specified in YYYY_MM_DD form. If omitted, '
'the latest available snapshot will be used.')
cli.add_argument('--date', nargs='?', dest='datestamp',
help='Use data from a given dated version (snapshot) of '
'GDC data, specified in YYYY_MM_DD form. If omitted, '
'the latest available snapshot will be used.')
cli.add_argument('--cases', nargs='+', metavar='case_id',
help='Process data only from these GDC cases')
cli.add_argument('--categories',nargs='+',metavar='category',
Expand Down
27 changes: 22 additions & 5 deletions gdctools/gdc_dice.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ def dice(self):
program = config.programs[0]
diced_prog_root = os.path.join(config.dice.dir, program)
mirror_prog_root = os.path.join(config.mirror.dir, program)
prog_status_tally = Counter()

# Ensure no simultaneous mirroring/dicing
with common.lock_context(diced_prog_root, "dice"), \
Expand Down Expand Up @@ -164,17 +165,20 @@ def dice(self):
for tcga_id in tcga_lookup:
# Dice single sample files first
for file_d in viewvalues(tcga_lookup[tcga_id]):
dice_one(file_d, trans_dict, raw_project_root,
dice_one_status = dice_one(file_d, trans_dict, raw_project_root,
diced_project_root, mfw,
dry_run=self.options.dry_run,
force=self.force)
prog_status_tally[dice_one_status] += 1
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I do like the tallying


#Then dice the multi_sample_files
for file_d in multi_sample_files:
dice_one(file_d, trans_dict, raw_project_root,
dice_one_status = dice_one(file_d, trans_dict, raw_project_root,
diced_project_root, mfw,
dry_run=self.options.dry_run,
force=self.force)
prog_status_tally[dice_one_status] += 1


# Bookkeeping code -- write some useful tables
# and figures needed for downstream sample reports.
Expand Down Expand Up @@ -217,7 +221,11 @@ def dice(self):
_write_combined_counts(all_counts_file, all_counts, all_totals)
_link_to_prog(all_counts_file, datestamp, diced_prog_root)

logging.info("Dicing completed successfuly")
logging.info(str(prog_status_tally))
if prog_status_tally['error'] == 0:
logging.info("Dicing completed successfuly")
else:
logging.warn("One or more diced files FAILED")

def execute(self):
super(gdc_dice, self).execute()
Expand Down Expand Up @@ -361,10 +369,11 @@ def dice_one(file_dict, translation_dict, mirror_proj_root, diced_root,
true, a debug message will be displayed instead of performing the actual
dicing operation.
"""
dice_one_status = 'error'
mirror_path = meta.mirror_path(mirror_proj_root, file_dict)
if not os.path.isfile(mirror_path):
# Bad, this means there are integrity issues
raise ValueError("Expected mirror file missing: " + mirror_path)
logging.warning("Expected mirror file missing: " + mirror_path)
else:
## Get the right annotation and converter for this file
annot, convert = get_annotation_converter(file_dict, translation_dict)
Expand All @@ -380,12 +389,19 @@ def dice_one(file_dict, translation_dict, mirror_proj_root, diced_root,
already_diced = all(os.path.isfile(p) for p in expected_paths)
if force or not already_diced:
logging.info("Dicing file " + mirror_path)
convert(file_dict, mirror_path, dice_path)
try:
convert(file_dict, mirror_path, dice_path)
dice_one_status = 'pass'
except Exception as e:
logging.warning('Dice converter failed: %s'%str(e))
else:
logging.info("Skipping file " + mirror_path + " (already diced)")
dice_one_status = 'cached'

append_diced_metadata(file_dict, expected_paths,
annot, meta_file_writer)
else:
dice_one_status = 'dry_run'
else:
            # Too verbose to log the entire json, so log just data_type and file_id
warning_info = {
Expand All @@ -396,6 +412,7 @@ def dice_one(file_dict, translation_dict, mirror_proj_root, diced_root,
}
logging.warn('Unrecognized data:\n%s' % json.dumps(warning_info,
indent=2))
return dice_one_status

def get_annotation_converter(file_dict, translation_dict):
k = metadata_to_key(file_dict)
Expand Down
Loading