-
Notifications
You must be signed in to change notification settings - Fork 4
Gsaksena mmd2 #67
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Gsaksena mmd2 #67
Changes from 14 commits
822320e
44ffe88
b4e9f8e
cbaafd7
518409f
9ce8811
e2fb22b
7da50d1
3ec8fa1
be4f31c
855d011
cf9b5f8
f9e3e9d
2bf501b
7b8e95c
2557ad5
2155b01
03fe15d
613c77c
4f4f39e
57b3c7d
47c5ff1
2949a39
ecf5ecf
b61aa36
6566b16
97886da
1c08992
4653128
003518b
40a5524
96f9b32
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -57,23 +57,64 @@ def execute(self): | |
| self.config_customize() | ||
| self.config_finalize() | ||
|
|
||
| #TODO perhaps refactor for better encapsulation, moving part to | ||
| # GDCtool.config_initialize() and part to gdc_mirror.config_customize(). | ||
| # Though it is nice to have all the logic for setting datestamp in one place. | ||
| #TODO variable renaming - datestamp_required and datestamp, to make them reflect current usage | ||
| datestamp = self.options.datestamp | ||
| if self.datestamp_required: | ||
| datestamp = self.options.datestamp | ||
| if not datestamp: | ||
| datestamp = 'latest' | ||
| #non-gdc_mirror case | ||
|
|
||
| existing_dates = self.datestamps() # ascending sort order | ||
| if len(existing_dates) == 0: | ||
| raise ValueError("No datestamps found, use upstream tool first") | ||
|
|
||
| if not datestamp: | ||
| #default value = 'latest' | ||
| datestamp = 'latest' | ||
|
|
||
| if datestamp == 'latest': | ||
| datestamp = existing_dates[-1] | ||
| # find last datestamp in existing_dates that is in date format | ||
| for d in reversed(existing_dates): | ||
| if common.DATESTAMP_REGEX.match(d) is not None: | ||
| datestamp = d | ||
| break | ||
| else: | ||
| #TODO make this error message more helpful | ||
| raise ValueError("Looking for latest datestamp, but no datestamps found in correct format") | ||
| elif datestamp not in existing_dates: | ||
| raise ValueError("Requested datestamp not present in " | ||
| + self.config.datestamps + "\n" | ||
| + "Existing datestamps: " + repr(existing_dates)) | ||
| else: | ||
| datestamp = time.strftime('%Y_%m_%d', time.localtime()) | ||
| #gdc_mirror case | ||
| if not datestamp: | ||
| # default value = today's datestamp | ||
| datestamp = common.datestamp() | ||
| elif datestamp == 'pool': | ||
| pass | ||
| else: | ||
| #other strings such as <yyyy-mm-dd>, 'latest', valid variable names, and everything else are not allowed | ||
| raise ValueError("For gdc_mirror, date must be blank or 'pool'") | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I thought the plan was to allow user-defined tags, and that anything besides 'latest' or a datestamp would work like 'pool'
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Dropped other user defined tags because they are not needed to meet the requirement. YAGNI. Easy to add later, makes the code simpler and easier to document for today. |
||
|
|
||
| # TODO remove this old code **gs** | ||
| # if self.datestamp_required: | ||
| # datestamp = self.options.datestamp | ||
| # if not datestamp: | ||
| # datestamp = 'latest' | ||
|
|
||
| # existing_dates = self.datestamps() # ascending sort order | ||
| # if len(existing_dates) == 0: | ||
| # raise ValueError("No datestamps found, use upstream tool first") | ||
|
|
||
| # if datestamp == 'latest': | ||
| # datestamp = existing_dates[-1] | ||
| # elif datestamp not in existing_dates: | ||
| # raise ValueError("Requested datestamp not present in " | ||
| # + self.config.datestamps + "\n" | ||
| # + "Existing datestamps: " + repr(existing_dates)) | ||
| # else: | ||
| # datestamp = time.strftime('%Y_%m_%d', time.localtime()) | ||
|
|
||
| self.datestamp = datestamp | ||
| self.init_logging() | ||
|
|
@@ -98,12 +139,10 @@ def config_add_args(self): | |
| cli = self.cli | ||
| cli.add_argument('--config', nargs='+', type=argparse.FileType('r'), | ||
| help='One or more configuration files') | ||
|
|
||
| if self.datestamp_required: | ||
| cli.add_argument('--date', nargs='?', dest='datestamp', | ||
| help='Use data from a given dated version (snapshot) of ' | ||
| 'GDC data, specified in YYYY_MM_DD form. If omitted, ' | ||
| 'the latest available snapshot will be used.') | ||
| cli.add_argument('--date', nargs='?', dest='datestamp', | ||
| help='Use data from a given dated version (snapshot) of ' | ||
| 'GDC data, specified in YYYY_MM_DD form. If omitted, ' | ||
| 'the latest available snapshot will be used.') | ||
| cli.add_argument('--cases', nargs='+', metavar='case_id', | ||
| help='Process data only from these GDC cases') | ||
| cli.add_argument('--categories',nargs='+',metavar='category', | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -81,6 +81,7 @@ def dice(self): | |
| program = config.programs[0] | ||
| diced_prog_root = os.path.join(config.dice.dir, program) | ||
| mirror_prog_root = os.path.join(config.mirror.dir, program) | ||
| prog_status_tally = Counter() | ||
|
|
||
| # Ensure no simultaneous mirroring/dicing | ||
| with common.lock_context(diced_prog_root, "dice"), \ | ||
|
|
@@ -164,17 +165,20 @@ def dice(self): | |
| for tcga_id in tcga_lookup: | ||
| # Dice single sample files first | ||
| for file_d in viewvalues(tcga_lookup[tcga_id]): | ||
| dice_one(file_d, trans_dict, raw_project_root, | ||
| dice_one_status = dice_one(file_d, trans_dict, raw_project_root, | ||
| diced_project_root, mfw, | ||
| dry_run=self.options.dry_run, | ||
| force=self.force) | ||
| prog_status_tally[dice_one_status] += 1 | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I do like the tallying |
||
|
|
||
| #Then dice the multi_sample_files | ||
| for file_d in multi_sample_files: | ||
| dice_one(file_d, trans_dict, raw_project_root, | ||
| dice_one_status = dice_one(file_d, trans_dict, raw_project_root, | ||
| diced_project_root, mfw, | ||
| dry_run=self.options.dry_run, | ||
| force=self.force) | ||
| prog_status_tally[dice_one_status] += 1 | ||
|
|
||
|
|
||
| # Bookkeeping code -- write some useful tables | ||
| # and figures needed for downstream sample reports. | ||
|
|
@@ -217,7 +221,11 @@ def dice(self): | |
| _write_combined_counts(all_counts_file, all_counts, all_totals) | ||
| _link_to_prog(all_counts_file, datestamp, diced_prog_root) | ||
|
|
||
| logging.info("Dicing completed successfuly") | ||
| logging.info(str(prog_status_tally)) | ||
| if prog_status_tally['error'] == 0: | ||
| logging.info("Dicing completed successfuly") | ||
| else: | ||
| logging.warn("One or more diced files FAILED") | ||
|
|
||
| def execute(self): | ||
| super(gdc_dice, self).execute() | ||
|
|
@@ -361,10 +369,11 @@ def dice_one(file_dict, translation_dict, mirror_proj_root, diced_root, | |
| true, a debug message will be displayed instead of performing the actual | ||
| dicing operation. | ||
| """ | ||
| dice_one_status = 'error' | ||
| mirror_path = meta.mirror_path(mirror_proj_root, file_dict) | ||
| if not os.path.isfile(mirror_path): | ||
| # Bad, this means there are integrity issues | ||
| raise ValueError("Expected mirror file missing: " + mirror_path) | ||
| logging.warning("Expected mirror file missing: " + mirror_path) | ||
| else: | ||
| ## Get the right annotation and converter for this file | ||
| annot, convert = get_annotation_converter(file_dict, translation_dict) | ||
|
|
@@ -380,12 +389,19 @@ def dice_one(file_dict, translation_dict, mirror_proj_root, diced_root, | |
| already_diced = all(os.path.isfile(p) for p in expected_paths) | ||
| if force or not already_diced: | ||
| logging.info("Dicing file " + mirror_path) | ||
| convert(file_dict, mirror_path, dice_path) | ||
| try: | ||
| convert(file_dict, mirror_path, dice_path) | ||
| dice_one_status = 'pass' | ||
| except Exception as e: | ||
| logging.warning('Dice converter failed: %s'%str(e)) | ||
| else: | ||
| logging.info("Skipping file " + mirror_path + " (already diced)") | ||
| dice_one_status = 'cached' | ||
|
|
||
| append_diced_metadata(file_dict, expected_paths, | ||
| annot, meta_file_writer) | ||
| else: | ||
| dice_one_status = 'dry_run' | ||
| else: | ||
| # To verbose to log the entire json, log just log data_type and file_id | ||
| warning_info = { | ||
|
|
@@ -396,6 +412,7 @@ def dice_one(file_dict, translation_dict, mirror_proj_root, diced_root, | |
| } | ||
| logging.warn('Unrecognized data:\n%s' % json.dumps(warning_info, | ||
| indent=2)) | ||
| return dice_one_status | ||
|
|
||
| def get_annotation_converter(file_dict, translation_dict): | ||
| k = metadata_to_key(file_dict) | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This actually means that there are no dates whatsoever