diff --git a/CHANGELOG.rst b/CHANGELOG.rst new file mode 100644 index 0000000..31f861f --- /dev/null +++ b/CHANGELOG.rst @@ -0,0 +1,12 @@ +########## +Change Log +########## + +All notable changes to this project are documented in this file. + + +Added +----- +- Make ``rose2`` python3 compatible +- Make ``rose2`` a callable python function + diff --git a/LICENSE.txt b/LICENSE.txt index 31c2487..d0143e6 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1,6 +1,6 @@ The MIT License (MIT) -Copyright (c) 2014 Charles Lin (Bradner Lab) +Copyright (c) 2018 Charles Lin (Bradner Lab) and Collaborators: John DiMatteo Nick Semenkovich | semenko@alum.mit.edu | https://nick.semenkovich.com @@ -21,4 +21,4 @@ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN -CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. \ No newline at end of file +CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. diff --git a/MANIFEST.in b/MANIFEST.in index 565af2e..393b418 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,2 +1,2 @@ include README.md -recursive-include annotation * +recursive-include rose2/annotation * diff --git a/README.md b/README.md index 7921e49..6cc603b 100644 --- a/README.md +++ b/README.md @@ -8,10 +8,29 @@ Install ======= ``` -pip install -e git+https://github.com/linlabbcm/rose2.git#egg=rose2 +pip install -e git+https://github.com/linlabbcm/rose2.git ``` Dependencies ============ -ROSE 2 requires [Bamliquidator](https://github.com/BradnerLab/pipeline/wiki/bamliquidator). +The CRC software uses the following dependencies: + +- ``Bamliquidator`` + +- ``Samtools`` + +Usage +===== + +As a command line tool: +``` +rose2 -g [GENOME] -i [INPUT_REGION_GFF] -r [RANKBY_BAM_FILE] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS] +``` + +As a python library: +``` +import rose2 + +rose2.rose(input_file, rankby, output_folder, genome, bams=None, control='', stitch=None, tss=0, mask_file=None) +``` diff --git a/rose2/__init__.py b/rose2/__init__.py index fce1dac..b2353e0 100644 --- a/rose2/__init__.py +++ b/rose2/__init__.py @@ -1,664 +1 @@ -#!/usr/bin/env python -''' -PROGRAM TO STITCH TOGETHER REGIONS TO FORM ENHANCERS, MAP READ DENSITY TO STITCHED REGIONS, -AND RANK ENHANCERS BY READ DENSITY TO DISCOVER SUPER-ENHANCERS -May2, 2014 -VERSION 0.2 -CONTACT: youngcomputation@wi.mit.edu -''' - -from __future__ import absolute_import # , division, print_function, unicode_literals - -import sys -import time -import copy -import os -import numpy -import subprocess -import string - -from collections import defaultdict - -from . import genemapper, utils - - -whereAmI = os.path.dirname(os.path.realpath(__file__)) -annotationDirectory = os.path.join(whereAmI, 'annotation') + '/' - -#================================================================== -#=====================HELPER FUNCTIONS============================= -#================================================================== - -def getBamChromList(bamFileList): - - ''' - gets the consensus list of chromosomes mapped by the bams - ''' - - #start w/ the first bam - cmd = 'samtools idxstats %s' % (bamFileList[0]) - idxStats = subprocess.Popen(cmd,stdout=subprocess.PIPE,shell=True) - idxStats= idxStats.communicate() - finalChromList = [line.split('\t')[0] for line in idxStats[0].split('\n')[0:-2]] - - #now go through each additional bam - for bamFile in bamFileList: - cmd = 'samtools idxstats %s' % (bamFile) - idxStats = subprocess.Popen(cmd,stdout=subprocess.PIPE,shell=True) - idxStats= idxStats.communicate() - chromList = [line.split('\t')[0] for line in idxStats[0].split('\n')[0:-2]] - finalChromList = [chrom for chrom in finalChromList if chromList.count(chrom) != 0] - - return utils.uniquify(finalChromList) - - -def checkRefCollection(referenceCollection): - - ''' - makes sure the names of all loci in the reference collection are unique - ''' - - namesList = [locus.ID() for locus in referenceCollection.getLoci()] - - if len(namesList) != len(utils.uniquify(namesList)): - print("ERROR: REGIONS HAVE NON-UNIQUE IDENTIFIERS") - sys.exit() - else: - print("REFERENCE COLLECTION PASSES QC") - return - - - -def filterGFF(gffFile,chromList): - - ''' - takes in a gff and filters out all lines that don't belong to a chrom in the chromList - ''' - gff = utils.parseTable(gffFile,'\t') - filteredGFF = [] - excludeList=[] - for line in gff: - if chromList.count(line[0]) ==1: - filteredGFF.append(line) - else: - excludeList.append(line[0]) - - excludeList = utils.uniquify(excludeList) - if len(excludeList) > 0: - print("EXCLUDED GFF REGIONS FROM THE FALLING CHROMS: %s" % (','.join(excludeList))) - - return filteredGFF - - - -#========================================================================== -#===================FORMATTING FOLDERS===================================== -#========================================================================== - -def formatFolder(folderName,create=False): - - ''' - makes sure a folder exists and if not makes it - returns a bool for folder - ''' - - if folderName[-1] != '/': - folderName +='/' - - try: - foo = os.listdir(folderName) - return folderName - except OSError: - print('folder %s does not exist' % (folderName)) - if create: - os.system('mkdir %s' % (folderName)) - return folderName - else: - - return False - - - - -#================================================================== -#=====================REGION STITCHING============================= -#================================================================== - - - - -def optimizeStitching(locusCollection, name, outFolder, stepSize=500): - ''' - takes a locus collection and starts writing out stitching stats at step sized intervals - ''' - maxStitch = 5000 # set a hard wired match stitching parameter - - stitchTable = [['STEP', 'NUM_REGIONS', 'TOTAL_CONSTIT', 'TOTAL_REGION', 'MEAN_CONSTIT', 'MEDIAN_CONSTIT', 'MEAN_REGION', 'MEDIAN_REGION', 'MEAN_STITCH_FRACTION', 'MEDIAN_STITCH_FRACTION']] - # first consolidate the collection - locusCollection = locusCollection.stitchCollection(stitchWindow=0) - total_constit = sum([locus.len() for locus in locusCollection.getLoci()]) - step = 0 - while step <= maxStitch: - - print("Getting stitch stats for %s (bp)" % (step)) - stitchCollection = locusCollection.stitchCollection(stitchWindow=step) - num_regions = len(stitchCollection) - stitchLoci = stitchCollection.getLoci() - regionLengths = [locus.len() for locus in stitchLoci] - total_region = sum(regionLengths) - constitLengths = [] - for locus in stitchLoci: - - constitLoci = locusCollection.getOverlap(locus) - constitLengths.append(sum([locus.len() for locus in constitLoci])) - - meanConstit = round(numpy.mean(constitLengths), 2) - medianConstit = round(numpy.median(constitLengths), 2) - - meanRegion = round(numpy.mean(regionLengths), 2) - medianRegion = round(numpy.median(regionLengths), 2) - - stitchFractions = [float(constitLengths[i]) / float(regionLengths[i]) for i in range(len(regionLengths))] - meanStitchFraction = round(numpy.mean(stitchFractions), 2) - medianStitchFraction = round(numpy.median(stitchFractions), 2) - - newLine = [step, num_regions, total_constit, total_region, meanConstit, medianConstit, meanRegion, medianRegion, meanStitchFraction, medianStitchFraction] - - stitchTable.append(newLine) - - step += stepSize - - # write the stitch table to disk - stitchParamFile = '%s%s_stitch_params.tmp' % (outFolder, name) - utils.unParseTable(stitchTable, stitchParamFile, '\t') - # call the rscript - rCmd = 'ROSE2_stitchOpt.R %s %s %s' % (stitchParamFile, outFolder, name) - print(rCmd) - # get back the stitch parameter - rOutput = subprocess.Popen(rCmd, stdout=subprocess.PIPE, shell=True) - rOutputTest = rOutput.communicate() - - print(rOutputTest) - - stitchParam = rOutputTest[0].split('\n')[2] - try: - stitchParam = int(stitchParam) - except ValueError: - print("INVALID STITCHING PARAMETER. STITCHING OPTIMIZATION FAILED") - sys.exit() - - # delete? the table - # os.system('rm -f %s' % (stitchParamFile)) - return stitchParam - - -def regionStitching(referenceCollection, name, outFolder, stitchWindow, tssWindow, annotFile, removeTSS=True): - print('PERFORMING REGION STITCHING') - # first have to turn bound region file into a locus collection - - # need to make sure this names correctly... each region should have a unique name - #referenceCollection - - debugOutput = [] - # filter out all bound regions that overlap the TSS of an ACTIVE GENE - if removeTSS: - - print('REMOVING TSS FROM REGIONS USING AN EXCLUSION WINDOW OF %sBP' % (tssWindow)) - # first make a locus collection of TSS - - startDict = utils.makeStartDict(annotFile) - - # now makeTSS loci for active genes - removeTicker = 0 - # this loop makes a locus centered around +/- tssWindow of transcribed genes - # then adds it to the list tssLoci - tssLoci = [] - for geneID in startDict.keys(): - tssLoci.append(utils.makeTSSLocus(geneID, startDict, tssWindow, tssWindow)) - - # this turns the tssLoci list into a LocusCollection - # 50 is the internal parameter for LocusCollection and doesn't really matter - tssCollection = utils.LocusCollection(tssLoci, 50) - - # gives all the loci in referenceCollection - boundLoci = referenceCollection.getLoci() - - # this loop will check if each bound region is contained by the TSS exclusion zone - # this will drop out a lot of the promoter only regions that are tiny - # typical exclusion window is around 2kb - for locus in boundLoci: - if len(tssCollection.getContainers(locus, 'both')) > 0: - - # if true, the bound locus overlaps an active gene - referenceCollection.remove(locus) - debugOutput.append([locus.__str__(), locus.ID(), 'CONTAINED']) - removeTicker += 1 - print('REMOVED %s LOCI BECAUSE THEY WERE CONTAINED BY A TSS' % (removeTicker)) - - # referenceCollection is now all enriched region loci that don't overlap an active TSS - - if stitchWindow == '': - print('DETERMINING OPTIMUM STITCHING PARAMTER') - optCollection = copy.deepcopy(referenceCollection) - stitchWindow = optimizeStitching(optCollection, name, outFolder, stepSize=500) - print('USING A STITCHING PARAMETER OF %s' % stitchWindow) - stitchedCollection = referenceCollection.stitchCollection(stitchWindow, 'both') - - if removeTSS: - # now replace any stitched region that overlap 2 distinct genes - # with the original loci that were there - fixedLoci = [] - tssLoci = [] - for geneID in startDict.keys(): - tssLoci.append(utils.makeTSSLocus(geneID, startDict, 50, 50)) - - # this turns the tssLoci list into a LocusCollection - # 50 is the internal parameter for LocusCollection and doesn't really matter - tssCollection = utils.LocusCollection(tssLoci, 50) - removeTicker = 0 - originalTicker = 0 - for stitchedLocus in stitchedCollection.getLoci(): - overlappingTSSLoci = tssCollection.getOverlap(stitchedLocus, 'both') - tssNames = [startDict[tssLocus.ID()]['name'] for tssLocus in overlappingTSSLoci] - tssNames = utils.uniquify(tssNames) - if len(tssNames) > 2: - - # stitchedCollection.remove(stitchedLocus) - originalLoci = referenceCollection.getOverlap(stitchedLocus, 'both') - originalTicker += len(originalLoci) - fixedLoci += originalLoci - debugOutput.append([stitchedLocus.__str__(), stitchedLocus.ID(), 'MULTIPLE_TSS']) - removeTicker += 1 - else: - fixedLoci.append(stitchedLocus) - - print('REMOVED %s STITCHED LOCI BECAUSE THEY OVERLAPPED MULTIPLE TSSs' % (removeTicker)) - print('ADDED BACK %s ORIGINAL LOCI' % (originalTicker)) - fixedCollection = utils.LocusCollection(fixedLoci, 50) - return fixedCollection, debugOutput, stitchWindow - else: - return stitchedCollection, debugOutput, stitchWindow - -#================================================================== -#=====================REGION LINKING MAPPING======================= -#================================================================== - - -def mapCollection(stitchedCollection, referenceCollection, bamFileList, mappedFolder, output, refName): - ''' - makes a table of factor density in a stitched locus and ranks table by number of loci stitched together - ''' - - print('FORMATTING TABLE') - loci = stitchedCollection.getLoci() - - locusTable = [['REGION_ID', 'CHROM', 'START', 'STOP', 'NUM_LOCI', 'CONSTITUENT_SIZE']] - - lociLenList = [] - - # strip out any that are in chrY - for locus in list(loci): - if locus.chr() == 'chrY': - loci.remove(locus) - - for locus in loci: - # numLociList.append(int(stitchLocus.ID().split('_')[1])) - lociLenList.append(locus.len()) - # numOrder = order(numLociList,decreasing=True) - lenOrder = utils.order(lociLenList, decreasing=True) - ticker = 0 - for i in lenOrder: - ticker += 1 - if ticker % 1000 == 0: - print(ticker) - locus = loci[i] - - # First get the size of the enriched regions within the stitched locus - refEnrichSize = 0 - refOverlappingLoci = referenceCollection.getOverlap(locus, 'both') - for refLocus in refOverlappingLoci: - refEnrichSize += refLocus.len() - - try: - stitchCount = int(locus.ID().split('_')[0]) - except ValueError: - stitchCount = 1 - coords = [int(x) for x in locus.coords()] - - locusTable.append([locus.ID(), locus.chr(), min(coords), max(coords), stitchCount, refEnrichSize]) - - print('GETTING MAPPED DATA') - print("USING A BAMFILE LIST:") - print(bamFileList) - for bamFile in bamFileList: - - bamFileName = bamFile.split('/')[-1] - - print('GETTING MAPPING DATA FOR %s' % bamFile) - # assumes standard convention for naming enriched region gffs - - # opening up the mapped GFF - print('OPENING %s%s_%s_MAPPED/matrix.txt' % (mappedFolder, refName, bamFileName)) - - mappedGFF = utils.parseTable('%s%s_%s_MAPPED/matrix.txt' % (mappedFolder, refName, bamFileName), '\t') - - signalDict = defaultdict(float) - print('MAKING SIGNAL DICT FOR %s' % (bamFile)) - mappedLoci = [] - for line in mappedGFF[1:]: - - chrom = line[1].split('(')[0] - start = int(line[1].split(':')[-1].split('-')[0]) - end = int(line[1].split(':')[-1].split('-')[1]) - mappedLoci.append(utils.Locus(chrom, start, end, '.', line[0])) - try: - signalDict[line[0]] = float(line[2]) * (abs(end - start)) - except ValueError: - print('WARNING NO SIGNAL FOR LINE:') - print(line) - continue - - mappedCollection = utils.LocusCollection(mappedLoci, 500) - locusTable[0].append(bamFileName) - - for i in range(1, len(locusTable)): - signal = 0.0 - line = locusTable[i] - lineLocus = utils.Locus(line[1], line[2], line[3], '.') - overlappingRegions = mappedCollection.getOverlap(lineLocus, sense='both') - for region in overlappingRegions: - signal += signalDict[region.ID()] - locusTable[i].append(signal) - - utils.unParseTable(locusTable, output, '\t') - - -#================================================================== -#=========================MAIN METHOD============================== -#================================================================== -def main(): - ''' - main run call - ''' - debug = False - - from optparse import OptionParser - usage = "usage: %prog [options] -g [GENOME] -i [INPUT_REGION_GFF] -r [RANKBY_BAM_FILE] -o [OUTPUT_FOLDER] [OPTIONAL_FLAGS]" - parser = OptionParser(usage=usage) - # required flags - parser.add_option("-i", "--i", dest="input", nargs=1, default=None, - help="Enter a .gff or .bed file of binding sites used to make enhancers") - parser.add_option("-r", "--rankby", dest="rankby", nargs=1, default=None, - help="bamfile to rank enhancer by") - parser.add_option("-o", "--out", dest="out", nargs=1, default=None, - help="Enter an output folder") - parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None, - help="Enter the genome build (MM9,MM8,HG18,HG19)") - - # optional flags - parser.add_option("-b", "--bams", dest="bams", nargs=1, default=None, - help="Enter a comma separated list of additional bam files to map to") - parser.add_option("-c", "--control", dest="control", nargs=1, default=None, - help="bamfile to rank enhancer by") - parser.add_option("-s", "--stitch", dest="stitch", nargs=1, default='', - help="Enter a max linking distance for stitching. Default will determine optimal stitching parameter") - parser.add_option("-t", "--tss", dest="tss", nargs=1, default=0, - help="Enter a distance from TSS to exclude. 0 = no TSS exclusion") - - parser.add_option("--mask", dest="mask", nargs=1, default=None, - help="Mask a set of regions from analysis. Provide a .bed or .gff of masking regions") - - # RETRIEVING FLAGS - (options, args) = parser.parse_args() - - if not options.input or not options.rankby or not options.out or not options.genome: - print('hi there') - parser.print_help() - exit() - - # making the out folder if it doesn't exist - outFolder = utils.formatFolder(options.out, True) - - # figuring out folder schema - gffFolder = utils.formatFolder(outFolder + 'gff/', True) - mappedFolder = utils.formatFolder(outFolder + 'mappedGFF/', True) - - # GETTING INPUT FILE - if options.input.split('.')[-1] == 'bed': - # CONVERTING A BED TO GFF - inputGFFName = options.input.split('/')[-1][0:-4] - inputGFFFile = '%s%s.gff' % (gffFolder, inputGFFName) - utils.bedToGFF(options.input, inputGFFFile) - elif options.input.split('.')[-1] == 'gff': - # COPY THE INPUT GFF TO THE GFF FOLDER - inputGFFFile = options.input - os.system('cp %s %s' % (inputGFFFile, gffFolder)) - - else: - print('WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT') - # COPY THE INPUT GFF TO THE GFF FOLDER - inputGFFFile = options.input - os.system('cp %s %s' % (inputGFFFile, gffFolder)) - - # GETTING THE LIST OF BAMFILES TO PROCESS - if options.control: - bamFileList = [options.rankby, options.control] - - else: - bamFileList = [options.rankby] - - if options.bams: - bamFileList += options.bams.split(',') - #bamFileList = utils.uniquify(bamFileList) # makes sad when you have the same control bam over and over again - - for bam in bamFileList: - if not os.path.isfile('{}.bai'.format(bam)): - print('INDEX FILE FOR %s IS MISSING' % bam) - exit() - - # optional args - - # Stitch parameter - if options.stitch == '': - stitchWindow = '' - else: - stitchWindow = int(options.stitch) - - # tss options - tssWindow = int(options.tss) - if tssWindow != 0: - removeTSS = True - else: - removeTSS = False - - # GETTING THE BOUND REGION FILE USED TO DEFINE ENHANCERS - print('USING %s AS THE INPUT GFF' % (inputGFFFile)) - inputName = inputGFFFile.split('/')[-1].split('.')[0] - - # GETTING THE GENOME - genome = options.genome - print('USING %s AS THE GENOME' % genome) - - # GETTING THE CORRECT ANNOT FILE - - genomeDict = { - 'HG18': '%shg18_refseq.ucsc' % (annotationDirectory), - 'MM9': '%smm9_refseq.ucsc' % (annotationDirectory), - 'HG19': '%shg19_refseq.ucsc' % (annotationDirectory), - 'MM8': '%smm8_refseq.ucsc' % (annotationDirectory), - 'MM10': '%smm10_refseq.ucsc' % (annotationDirectory), - 'RN4': '%srn4_refseq.ucsc' % (annotationDirectory), - 'RN6': '%srn6_refseq.ucsc' % (annotationDirectory), - } - - annotFile = genomeDict[genome.upper()] - - # MAKING THE START DICT - print('MAKING START DICT') - startDict = utils.makeStartDict(annotFile) - - - #GET CHROMS FOUND IN THE BAMS - print('GETTING CHROMS IN BAMFILES') - bamChromList = getBamChromList(bamFileList) - print("USING THE FOLLOWING CHROMS") - print(bamChromList) - - #LOADING IN THE GFF AND FILTERING BY CHROM - print('LOADING AND FILTERING THE GFF') - inputGFF = filterGFF(inputGFFFile,bamChromList) - # LOADING IN THE BOUND REGION REFERENCE COLLECTION - print('LOADING IN GFF REGIONS') - referenceCollection = utils.gffToLocusCollection(inputGFF) - - print('CHECKING REFERENCE COLLECTION:') - checkRefCollection(referenceCollection) - - - # MASKING REFERENCE COLLECTION - # see if there's a mask - if options.mask: - maskFile = options.mask - # if it's a bed file - if maskFile.split('.')[-1].upper() == 'BED': - maskGFF = utils.bedToGFF(maskFile) - elif maskFile.split('.')[-1].upper() == 'GFF': - maskGFF = utils.parseTable(maskFile, '\t') - else: - print("MASK MUST BE A .gff or .bed FILE") - sys.exit() - maskCollection = utils.gffToLocusCollection(maskGFF) - - # now mask the reference loci - referenceLoci = referenceCollection.getLoci() - filteredLoci = [locus for locus in referenceLoci if len(maskCollection.getOverlap(locus, 'both')) == 0] - print("FILTERED OUT %s LOCI THAT WERE MASKED IN %s" % (len(referenceLoci) - len(filteredLoci), maskFile)) - referenceCollection = utils.LocusCollection(filteredLoci, 50) - - # NOW STITCH REGIONS - print('STITCHING REGIONS TOGETHER') - stitchedCollection, debugOutput, stitchWindow = regionStitching(referenceCollection, inputName, outFolder, stitchWindow, tssWindow, annotFile, removeTSS) - - # NOW MAKE A STITCHED COLLECTION GFF - print('MAKING GFF FROM STITCHED COLLECTION') - stitchedGFF = utils.locusCollectionToGFF(stitchedCollection) - # making sure start/stop ordering are correct - for i in range(len(stitchedGFF)): - - line = stitchedGFF[i] - start = int(line[3]) - stop = int(line[4]) - if start > stop: - line[3] = stop - line[4] = start - - print(stitchWindow) - print(type(stitchWindow)) - if not removeTSS: - stitchedGFFFile = '%s%s_%sKB_STITCHED.gff' % (gffFolder, inputName, str(stitchWindow / 1000)) - stitchedGFFName = '%s_%sKB_STITCHED' % (inputName, str(stitchWindow / 1000)) - debugOutFile = '%s%s_%sKB_STITCHED.debug' % (gffFolder, inputName, str(stitchWindow / 1000)) - else: - stitchedGFFFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.gff' % (gffFolder, inputName, str(stitchWindow / 1000)) - stitchedGFFName = '%s_%sKB_STITCHED_TSS_DISTAL' % (inputName, str(stitchWindow / 1000)) - debugOutFile = '%s%s_%sKB_STITCHED_TSS_DISTAL.debug' % (gffFolder, inputName, str(stitchWindow / 1000)) - - # WRITING DEBUG OUTPUT TO DISK - - if debug: - print('WRITING DEBUG OUTPUT TO DISK AS %s' % (debugOutFile)) - utils.unParseTable(debugOutput, debugOutFile, '\t') - - # WRITE THE GFF TO DISK - print('WRITING STITCHED GFF TO DISK AS %s' % (stitchedGFFFile)) - utils.unParseTable(stitchedGFF, stitchedGFFFile, '\t') - - # SETTING UP THE OVERALL OUTPUT FILE - outputFile1 = outFolder + stitchedGFFName + '_ENHANCER_REGION_MAP.txt' - print('OUTPUT WILL BE WRITTEN TO %s' % (outputFile1)) - - # MAPPING TO THE NON STITCHED (ORIGINAL GFF) - # MAPPING TO THE STITCHED GFF - - # Try to use the bamliquidatior_path.py script on cluster, otherwise, failover to local (in path), otherwise fail. - bamliquidator_path = 'bamliquidator_batch' - - - bamFileListUnique = list(bamFileList) - bamFileListUnique = utils.uniquify(bamFileListUnique) - #prevent redundant mapping - print("MAPPING TO THE FOLLOWING BAMS:") - print(bamFileListUnique) - for bamFile in bamFileListUnique: - - bamFileName = bamFile.split('/')[-1] - - # MAPPING TO THE STITCHED GFF - mappedOut1Folder = '%s%s_%s_MAPPED' % (mappedFolder, stitchedGFFName, bamFileName) - mappedOut1File = '%s%s_%s_MAPPED/matrix.txt' % (mappedFolder, stitchedGFFName, bamFileName) - if utils.checkOutput(mappedOut1File, 0.2, 0.2): - print("FOUND %s MAPPING DATA FOR BAM: %s" % (stitchedGFFFile, mappedOut1File)) - else: - cmd1 = bamliquidator_path + " --sense . -e 200 --match_bamToGFF -r %s -o %s %s" % (stitchedGFFFile, mappedOut1Folder, bamFile) - print(cmd1) - - os.system(cmd1) - if utils.checkOutput(mappedOut1File,0.2,5): - print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (stitchedGFFFile, bamFileName)) - else: - print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (stitchedGFFFile, bamFileName)) - sys.exit() - - print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS') - # CALCULATE DENSITY BY REGION - # NEED TO FIX THIS FUNCTION TO ACCOUNT FOR DIFFERENT OUTPUTS OF LIQUIDATOR - mapCollection(stitchedCollection, referenceCollection, bamFileList, mappedFolder, outputFile1, refName=stitchedGFFName) - - print('CALLING AND PLOTTING SUPER-ENHANCERS') - - if options.control: - - rankbyName = options.rankby.split('/')[-1] - controlName = options.control.split('/')[-1] - cmd = 'ROSE2_callSuper.R %s %s %s %s' % (outFolder, outputFile1, inputName, controlName) - - - else: - rankbyName = options.rankby.split('/')[-1] - controlName = 'NONE' - cmd = 'ROSE2_callSuper.R %s %s %s %s' % (outFolder, outputFile1, inputName, controlName) - - print(cmd) - - os.system(cmd) - - # calling the gene mapper - time.sleep(5) - superTableFile = "%s_SuperEnhancers.table.txt" % (inputName) - if options.control: - #cmd = "python ROSE2_geneMapper.py -g %s -r %s -c %s -i %s%s &" % (genome, options.rankby, options.control, outFolder, superTableFile) - genemapper.map(os.path.join(outFolder, superTableFile), genome, options.rankby, options.control) - else: - #cmd = "python ROSE2_geneMapper.py -g %s -r %s -i %s%s &" % (genome, options.rankby, outFolder, superTableFile) - genemapper.map(os.path.join(outFolder,superTableFile), genome, options.rankby, '') - #os.system(cmd) - - - stretchTableFile = "%s_StretchEnhancers.table.txt" % (inputName) - if options.control: - #cmd = "python ROSE2_geneMapper.py -g %s -r %s -c %s -i %s%s &" % (genome, options.rankby, options.control, outFolder, stretchTableFile) - genemapper.map(os.path.join(outFolder,stretchTableFile), genome, options.rankby, options.control) - else: - #cmd = "python ROSE2_geneMapper.py -g %s -r %s -i %s%s &" % (genome, options.rankby, outFolder, stretchTableFile) - genemapper.map(os.path.join(outFolder,stretchTableFile), genome, options.rankby, '') - #os.system(cmd) - - - superStretchTableFile = "%s_SuperStretchEnhancers.table.txt" % (inputName) - if options.control: - #cmd = "python ROSE2_geneMapper.py -g %s -r %s -c %s -i %s%s &" % (genome, options.rankby, options.control, outFolder, superStretchTableFile) - genemapper.map(os.path.join(outFolder,superStretchTableFile), genome, options.rankby, options.control) - else: - #cmd = "python ROSE2_geneMapper.py -g %s -r %s -i %s%s &" % (genome, options.rankby, outFolder, superStretchTableFile) - genemapper.map(os.path.join(outFolder,superStretchTableFile), genome, options.rankby, '') - #os.system(cmd) +from rose2.rose2_main import rose diff --git a/rose2/definitions.py b/rose2/definitions.py new file mode 100644 index 0000000..896fd08 --- /dev/null +++ b/rose2/definitions.py @@ -0,0 +1,5 @@ +"""Constants are defined here.""" + +import os + +ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) # This is your Project Root diff --git a/rose2/genemapper.py b/rose2/genemapper.py deleted file mode 100644 index d61db13..0000000 --- a/rose2/genemapper.py +++ /dev/null @@ -1,817 +0,0 @@ -#!/usr/bin/python - -''' -The MIT License (MIT) - -Copyright (c) 2013 Charles Lin - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -''' - - -# 140306_singleGeneMapper.py -# main method wrapped script to take the enhancer region table output of ROSE_Main and map genes to it -# will create two outputs a gene mapped region table where each row is an enhancer -# and a gene table where each row is a gene -# does this by default for super-enhancers only -# update to the gene mapper that finds nearest gene w/ highest signal -# also switching to using the pipeline utils module as opposed to the -# stripped down ROSE_utils module -import os -import subprocess -import sys - -from collections import defaultdict -from string import join - -from . import utils - -#========================================================================== -#===================FORMATTING FOLDERS===================================== -#========================================================================== - -def formatFolder(folderName,create=False): - - ''' - makes sure a folder exists and if not makes it - returns a bool for folder - ''' - - if folderName[-1] != '/': - folderName +='/' - - try: - foo = os.listdir(folderName) - return folderName - except OSError: - print('folder %s does not exist' % (folderName)) - if create: - os.system('mkdir %s' % (folderName)) - return folderName - else: - - return False - - -#================================================================== -#=========================GLOBAL=================================== -#================================================================== - -# Get the script's full local path -whereAmI = os.path.dirname(os.path.realpath(__file__)) -print(whereAmI) -# Get the script folder -codeFolder = utils.formatFolder(whereAmI, False) -annotationDirectory = os.path.join(whereAmI, 'annotation') + '/' - -print('RUNNING ROSE2_META.py FROM %s' % (whereAmI)) - - - -#================================================================== -#===========MAPPING GENES TO ENHANCERS WITHOUT BAM RANKING========= -#================================================================== - -#this is the traditional way of running gene mapper - - -def mapEnhancerToGene(annotFile,enhancerFile,transcribedFile='',uniqueGenes=True,searchWindow =50000,noFormatTable = False): - - ''' - maps genes to enhancers. if uniqueGenes, reduces to gene name only. Otherwise, gives for each refseq - ''' - startDict = utils.makeStartDict(annotFile) - enhancerTable = utils.parseTable(enhancerFile,'\t') - - #internal parameter for debugging - byRefseq = False - - - if len(transcribedFile) > 0: - transcribedTable = utils.parseTable(transcribedFile,'\t') - transcribedGenes = [line[1] for line in transcribedTable] - else: - transcribedGenes = startDict.keys() - - print('MAKING TRANSCRIPT COLLECTION') - transcribedCollection = utils.makeTranscriptCollection(annotFile,0,0,500,transcribedGenes) - - - print('MAKING TSS COLLECTION') - tssLoci = [] - for geneID in transcribedGenes: - tssLoci.append(utils.makeTSSLocus(geneID,startDict,0,0)) - - - #this turns the tssLoci list into a LocusCollection - #50 is the internal parameter for LocusCollection and doesn't really matter - tssCollection = utils.LocusCollection(tssLoci,50) - - - - geneDict = {'overlapping':defaultdict(list),'proximal':defaultdict(list)} - - #dictionaries to hold ranks and superstatus of gene nearby enhancers - rankDict = defaultdict(list) - superDict= defaultdict(list) - - #list of all genes that appear in this analysis - overallGeneList = [] - - if noFormatTable: - #set up the output tables - #first by enhancer - enhancerToGeneTable = [enhancerTable[0]+['OVERLAP_GENES','PROXIMAL_GENES','CLOSEST_GENE']] - - - else: - #set up the output tables - #first by enhancer - enhancerToGeneTable = [enhancerTable[0][0:9]+['OVERLAP_GENES','PROXIMAL_GENES','CLOSEST_GENE'] + enhancerTable[5][-2:]] - - #next by gene - geneToEnhancerTable = [['GENE_NAME','REFSEQ_ID','PROXIMAL_ENHANCERS']] - - #next make the gene to enhancer table - geneToEnhancerTable = [['GENE_NAME','REFSEQ_ID','PROXIMAL_ENHANCERS','ENHANCER_RANKS','IS_SUPER']] - - - - - for line in enhancerTable: - if line[0][0] =='#' or line[0][0] == 'R': - continue - - enhancerString = '%s:%s-%s' % (line[1],line[2],line[3]) - - enhancerLocus = utils.Locus(line[1],line[2],line[3],'.',line[0]) - - #overlapping genes are transcribed genes whose transcript is directly in the stitchedLocus - overlappingLoci = transcribedCollection.getOverlap(enhancerLocus,'both') - overlappingGenes =[] - for overlapLocus in overlappingLoci: - overlappingGenes.append(overlapLocus.ID()) - - #proximalGenes are transcribed genes where the tss is within 50kb of the boundary of the stitched loci - proximalLoci = tssCollection.getOverlap(utils.makeSearchLocus(enhancerLocus,searchWindow,searchWindow),'both') - proximalGenes =[] - for proxLocus in proximalLoci: - proximalGenes.append(proxLocus.ID()) - - - distalLoci = tssCollection.getOverlap(utils.makeSearchLocus(enhancerLocus,1000000,1000000),'both') - distalGenes =[] - for proxLocus in distalLoci: - distalGenes.append(proxLocus.ID()) - - - - overlappingGenes = utils.uniquify(overlappingGenes) - proximalGenes = utils.uniquify(proximalGenes) - distalGenes = utils.uniquify(distalGenes) - allEnhancerGenes = overlappingGenes + proximalGenes + distalGenes - #these checks make sure each gene list is unique. - #technically it is possible for a gene to be overlapping, but not proximal since the - #gene could be longer than the 50kb window, but we'll let that slide here - for refID in overlappingGenes: - if proximalGenes.count(refID) == 1: - proximalGenes.remove(refID) - - for refID in proximalGenes: - if distalGenes.count(refID) == 1: - distalGenes.remove(refID) - - - #Now find the closest gene - if len(allEnhancerGenes) == 0: - closestGene = '' - else: - #get enhancerCenter - enhancerCenter = (int(line[2]) + int(line[3]))/2 - - #get absolute distance to enhancer center - distList = [abs(enhancerCenter - startDict[geneID]['start'][0]) for geneID in allEnhancerGenes] - #get the ID and convert to name - closestGene = startDict[allEnhancerGenes[distList.index(min(distList))]]['name'] - - #NOW WRITE THE ROW FOR THE ENHANCER TABLE - if noFormatTable: - - newEnhancerLine = list(line) - newEnhancerLine.append(join(utils.uniquify([startDict[x]['name'] for x in overlappingGenes]),',')) - newEnhancerLine.append(join(utils.uniquify([startDict[x]['name'] for x in proximalGenes]),',')) - newEnhancerLine.append(closestGene) - - else: - newEnhancerLine = line[0:9] - newEnhancerLine.append(join(utils.uniquify([startDict[x]['name'] for x in overlappingGenes]),',')) - newEnhancerLine.append(join(utils.uniquify([startDict[x]['name'] for x in proximalGenes]),',')) - newEnhancerLine.append(closestGene) - newEnhancerLine += line[-2:] - - enhancerToGeneTable.append(newEnhancerLine) - #Now grab all overlapping and proximal genes for the gene ordered table - - overallGeneList +=overlappingGenes - for refID in overlappingGenes: - geneDict['overlapping'][refID].append(enhancerString) - rankDict[refID].append(int(line[-2])) - superDict[refID].append(int(line[-1])) - - overallGeneList+=proximalGenes - for refID in proximalGenes: - geneDict['proximal'][refID].append(enhancerString) - rankDict[refID].append(int(line[-2])) - superDict[refID].append(int(line[-1])) - - - - #End loop through - - #Make table by gene - overallGeneList = utils.uniquify(overallGeneList) - - #use enhancer rank to order - rankOrder = utils.order([min(rankDict[x]) for x in overallGeneList]) - - usedNames = [] - for i in rankOrder: - refID = overallGeneList[i] - geneName = startDict[refID]['name'] - if usedNames.count(geneName) > 0 and uniqueGenes == True: - - continue - else: - usedNames.append(geneName) - - proxEnhancers = geneDict['overlapping'][refID]+geneDict['proximal'][refID] - - superStatus = max(superDict[refID]) - enhancerRanks = join([str(x) for x in rankDict[refID]],',') - - newLine = [geneName,refID,join(proxEnhancers,','),enhancerRanks,superStatus] - geneToEnhancerTable.append(newLine) - - #resort enhancerToGeneTable - if noFormatTable: - return enhancerToGeneTable,geneToEnhancerTable - else: - enhancerOrder = utils.order([int(line[-2]) for line in enhancerToGeneTable[1:]]) - sortedTable = [enhancerToGeneTable[0]] - for i in enhancerOrder: - sortedTable.append(enhancerToGeneTable[(i+1)]) - - return sortedTable,geneToEnhancerTable - - - - - -#================================================================== -#===========MAPPING GENES TO ENHANCERS WITH BAM RANKING============ -#================================================================== - - -def makeSignalDict(mappedGFFFile, controlMappedGFFFile=''): - ''' - makes a signal dict - ''' - print('\t called makeSignalDict on %s (ctrl: %s)' % (mappedGFFFile, controlMappedGFFFile)) - signalDict = defaultdict(float) - - mappedGFF = utils.parseTable(mappedGFFFile, '\t') - if len(controlMappedGFFFile) > 0: - controlGFF = utils.parseTable(controlMappedGFFFile, '\t') - - for i in range(1, len(mappedGFF)): - - signal = float(mappedGFF[i][2]) - float(controlGFF[i][2]) - if signal < 0: - signal = 0.0 - signalDict[mappedGFF[i][0]] = signal - else: - for i in range(1, len(mappedGFF)): - signal = float(mappedGFF[i][2]) - signalDict[mappedGFF[i][0]] = signal - - return signalDict - -#makeSignalDict('../sshfs/x_rose/mm9_TSS_ENHANCER_GENES_-5000_+5000_CONV3_CD4.nomito.rmdup.bam.gff') - -def mapEnhancerToGeneTop(rankByBamFile, controlBamFile, genome, annotFile, enhancerFile, transcribedFile='', uniqueGenes=True, searchWindow=50000, noFormatTable=False): - ''' - maps genes to enhancers. if uniqueGenes, reduces to gene name only. Otherwise, gives for each refseq - ''' - startDict = utils.makeStartDict(annotFile) - enhancerName = enhancerFile.split('/')[-1].split('.')[0] - enhancerTable = utils.parseTable(enhancerFile, '\t') - - # internal parameter for debugging - byRefseq = False - - if len(transcribedFile) > 0: - transcribedTable = utils.parseTable(transcribedFile, '\t') - transcribedGenes = [line[1] for line in transcribedTable] - else: - transcribedGenes = startDict.keys() - - print('MAKING TRANSCRIPT COLLECTION') - transcribedCollection = utils.makeTranscriptCollection( - annotFile, 0, 0, 500, transcribedGenes) - - print('MAKING TSS COLLECTION') - tssLoci = [] - for geneID in transcribedGenes: - tssLoci.append(utils.makeTSSLocus(geneID, startDict, 0, 0)) - - # this turns the tssLoci list into a LocusCollection - # 50 is the internal parameter for LocusCollection and doesn't really - # matter - tssCollection = utils.LocusCollection(tssLoci, 50) - - geneDict = {'overlapping': defaultdict( - list), 'proximal': defaultdict(list)} - - # dictionaries to hold ranks and superstatus of gene nearby enhancers - rankDict = defaultdict(list) - superDict = defaultdict(list) - - # list of all genes that appear in this analysis - overallGeneList = [] - - # find the damn header - for line in enhancerTable: - if line[0][0] == '#': - continue - else: - header = line - break - - if noFormatTable: - # set up the output tables - # first by enhancer - enhancerToGeneTable = [ - header + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE']] - - else: - # set up the output tables - # first by enhancer - enhancerToGeneTable = [ - header[0:9] + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE'] + header[-2:]] - - # next by gene - geneToEnhancerTable = [ - ['GENE_NAME', 'REFSEQ_ID', 'PROXIMAL_ENHANCERS']] - - # next make the gene to enhancer table - geneToEnhancerTable = [ - ['GENE_NAME', 'REFSEQ_ID', 'PROXIMAL_ENHANCERS', 'ENHANCER_RANKS', 'IS_SUPER', 'ENHANCER_SIGNAL']] - - for line in enhancerTable: - if line[0][0] == '#' or line[0][0] == 'R': - continue - - enhancerString = '%s:%s-%s' % (line[1], line[2], line[3]) - - enhancerLocus = utils.Locus(line[1], line[2], line[3], '.', line[0]) - - # overlapping genes are transcribed genes whose transcript is directly - # in the stitchedLocus - overlappingLoci = transcribedCollection.getOverlap( - enhancerLocus, 'both') - overlappingGenes = [] - for overlapLocus in overlappingLoci: - overlappingGenes.append(overlapLocus.ID()) - - # proximalGenes are transcribed genes where the tss is within 50kb of - # the boundary of the stitched loci - proximalLoci = tssCollection.getOverlap( - utils.makeSearchLocus(enhancerLocus, searchWindow, searchWindow), 'both') - proximalGenes = [] - for proxLocus in proximalLoci: - proximalGenes.append(proxLocus.ID()) - - distalLoci = tssCollection.getOverlap( - utils.makeSearchLocus(enhancerLocus, 1000000, 1000000), 'both') - distalGenes = [] - for proxLocus in distalLoci: - distalGenes.append(proxLocus.ID()) - - overlappingGenes = utils.uniquify(overlappingGenes) - proximalGenes = utils.uniquify(proximalGenes) - distalGenes = utils.uniquify(distalGenes) - allEnhancerGenes = overlappingGenes + proximalGenes + distalGenes - # these checks make sure each gene list is unique. - # technically it is possible for a gene to be overlapping, but not proximal since the - # gene could be longer than the 50kb window, but we'll let that slide - # here - for refID in overlappingGenes: - if proximalGenes.count(refID) == 1: - proximalGenes.remove(refID) - - for refID in proximalGenes: - if distalGenes.count(refID) == 1: - distalGenes.remove(refID) - - # Now find the closest gene - if len(allEnhancerGenes) == 0: - closestGene = '' - else: - # get enhancerCenter - enhancerCenter = (int(line[2]) + int(line[3])) / 2 - - # get absolute distance to enhancer center - distList = [abs(enhancerCenter - startDict[geneID]['start'][0]) - for geneID in allEnhancerGenes] - # get the ID and convert to name - closestGene = startDict[ - allEnhancerGenes[distList.index(min(distList))]]['name'] - - # NOW WRITE THE ROW FOR THE ENHANCER TABLE - if noFormatTable: - - newEnhancerLine = list(line) - newEnhancerLine.append( - join(utils.uniquify([startDict[x]['name'] for x in overlappingGenes]), ',')) - newEnhancerLine.append( - join(utils.uniquify([startDict[x]['name'] for x in proximalGenes]), ',')) - newEnhancerLine.append(closestGene) - - else: - newEnhancerLine = line[0:9] - newEnhancerLine.append( - join(utils.uniquify([startDict[x]['name'] for x in overlappingGenes]), ',')) - newEnhancerLine.append( - join(utils.uniquify([startDict[x]['name'] for x in proximalGenes]), ',')) - newEnhancerLine.append(closestGene) - newEnhancerLine += line[-2:] - - enhancerToGeneTable.append(newEnhancerLine) - # Now grab all overlapping and proximal genes for the gene ordered - # table - - overallGeneList += overlappingGenes - for refID in overlappingGenes: - geneDict['overlapping'][refID].append(enhancerString) - rankDict[refID].append(int(line[-2])) - superDict[refID].append(int(line[-1])) - - overallGeneList += proximalGenes - for refID in proximalGenes: - geneDict['proximal'][refID].append(enhancerString) - rankDict[refID].append(int(line[-2])) - superDict[refID].append(int(line[-1])) - - # End loop through - # Make table by gene - print('MAKING ENHANCER ASSOCIATED GENE TSS COLLECTION') - overallGeneList = utils.uniquify(overallGeneList) - - #get the chromLists from the various bams here - cmd = 'samtools idxstats %s' % (rankByBamFile) - idxStats = subprocess.Popen(cmd,stdout=subprocess.PIPE,shell=True) - idxStats= idxStats.communicate() - bamChromList = [line.split('\t')[0] for line in idxStats[0].split('\n')[0:-2]] - - if len(controlBamFile) > 0: - cmd = 'samtools idxstats %s' % (controlBamFile) - idxStats = subprocess.Popen(cmd,stdout=subprocess.PIPE,shell=True) - idxStats= idxStats.communicate() - bamChromListControl = [line.split('\t')[0] for line in idxStats[0].split('\n')[0:-2]] - bamChromList = [chrom for chrom in bamChromList if bamChromListControl.count(chrom) != 0] - - - - #now make sure no genes have a bad chrom - overallGeneList = [gene for gene in overallGeneList if bamChromList.count(startDict[gene]['chr']) != 0] - - - #now make an enhancer collection of all transcripts - enhancerGeneCollection = utils.makeTranscriptCollection( - annotFile, 5000, 5000, 500, overallGeneList) - - enhancerGeneGFF = utils.locusCollectionToGFF(enhancerGeneCollection) - - # dump the gff to file - enhancerFolder = utils.getParentFolder(enhancerFile) - gffRootName = "%s_TSS_ENHANCER_GENES_-5000_+5000" % (genome) - enhancerGeneGFFFile = "%s%s_%s.gff" % (enhancerFolder, enhancerName,gffRootName) - utils.unParseTable(enhancerGeneGFF, enhancerGeneGFFFile, '\t') - - # now we need to run bamToGFF - - # Try to use the bamliquidatior_path.py script on cluster, otherwise, failover to local (in path), otherwise fail. - bamliquidator_path = 'bamliquidator_batch' - - - print('MAPPING SIGNAL AT ENHANCER ASSOCIATED GENE TSS') - # map density at genes in the +/- 5kb tss region - # first on the rankBy bam - bamName = rankByBamFile.split('/')[-1] - mappedRankByFolder = "%s%s_%s_%s/" % (enhancerFolder, enhancerName,gffRootName, bamName) - mappedRankByFile = "%s%s_%s_%s/matrix.txt" % (enhancerFolder,enhancerName, gffRootName, bamName) - cmd = bamliquidator_path + ' --sense . -e 200 --match_bamToGFF -r %s -o %s %s' % (enhancerGeneGFFFile, mappedRankByFolder,rankByBamFile) - print("Mapping rankby bam %s" % (rankByBamFile)) - print(cmd) - os.system(cmd) - - #check for completion - if utils.checkOutput(mappedRankByFile,0.2,5): - print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (enhancerGeneGFFFile, rankByBamFile)) - else: - print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (enhancerGeneGFFFile, rankByBamFile)) - sys.exit() - - # next on the control bam if it exists - if len(controlBamFile) > 0: - controlName = controlBamFile.split('/')[-1] - mappedControlFolder = "%s%s_%s_%s/" % ( - enhancerFolder, enhancerName,gffRootName, controlName) - mappedControlFile = "%s%s_%s_%s/matrix.txt" % ( - enhancerFolder, enhancerName,gffRootName, controlName) - cmd = bamliquidator_path + ' --sense . -e 200 --match_bamToGFF -r %s -o %s %s' % (enhancerGeneGFFFile, mappedControlFolder,controlBamFile) - print("Mapping control bam %s" % (controlBamFile)) - print(cmd) - os.system(cmd) - - #check for completion - if utils.checkOutput(mappedControlFile,0.2,5): - print("SUCCESSFULLY MAPPED TO %s FROM BAM: %s" % (enhancerGeneGFFFile, controlBamFile)) - else: - print("ERROR: FAILED TO MAP %s FROM BAM: %s" % (enhancerGeneGFFFile, controlBamFile)) - sys.exit() - - # now get the appropriate output files - if len(controlBamFile) > 0: - print("CHECKING FOR MAPPED OUTPUT AT %s AND %s" % - (mappedRankByFile, mappedControlFile)) - if utils.checkOutput(mappedRankByFile, 1, 1) and utils.checkOutput(mappedControlFile, 1, 1): - print('MAKING ENHANCER ASSOCIATED GENE TSS SIGNAL DICTIONARIES') - signalDict = makeSignalDict(mappedRankByFile, mappedControlFile) - else: - print("NO MAPPING OUTPUT DETECTED") - sys.exit() - else: - print("CHECKING FOR MAPPED OUTPUT AT %s" % (mappedRankByFile)) - if utils.checkOutput(mappedRankByFile, 1, 30): - print('MAKING ENHANCER ASSOCIATED GENE TSS SIGNAL DICTIONARIES') - signalDict = makeSignalDict(mappedRankByFile) - else: - print("NO MAPPING OUTPUT DETECTED") - sys.exit() - - # use enhancer rank to order - - rankOrder = utils.order([min(rankDict[x]) for x in overallGeneList]) - - usedNames = [] - - # make a new dict to hold TSS signal by max per geneName - geneNameSigDict = defaultdict(list) - print('MAKING GENE TABLE') - for i in rankOrder: - refID = overallGeneList[i] - geneName = startDict[refID]['name'] - if usedNames.count(geneName) > 0 and uniqueGenes == True: - continue - else: - usedNames.append(geneName) - - proxEnhancers = geneDict['overlapping'][ - refID] + geneDict['proximal'][refID] - - superStatus = max(superDict[refID]) - enhancerRanks = join([str(x) for x in rankDict[refID]], ',') - - enhancerSignal = signalDict[refID] - geneNameSigDict[geneName].append(enhancerSignal) - - newLine = [geneName, refID, join( - proxEnhancers, ','), enhancerRanks, superStatus, enhancerSignal] - geneToEnhancerTable.append(newLine) - #utils.unParseTable(geneToEnhancerTable,'/grail/projects/newRose/geneMapper/foo.txt','\t') - print('MAKING ENHANCER TO TOP GENE TABLE') - - if noFormatTable: - enhancerToTopGeneTable = [ - enhancerToGeneTable[0] + ['TOP_GENE', 'TSS_SIGNAL']] - else: - enhancerToTopGeneTable = [enhancerToGeneTable[0][0:12] + [ - 'TOP_GENE', 'TSS_SIGNAL'] + enhancerToGeneTable[0][-2:]] - - for line in enhancerToGeneTable[1:]: - - geneList = [] - if noFormatTable: - geneList += line[-3].split(',') - geneList += line[-2].split(',') - - else: - geneList += line[10].split(',') - geneList += line[11].split(',') - - geneList = utils.uniquify([x for x in geneList if len(x) > 0]) - if len(geneList) > 0: - try: - sigVector = [max(geneNameSigDict[x]) for x in geneList] - maxIndex = sigVector.index(max(sigVector)) - maxGene = geneList[maxIndex] - maxSig = sigVector[maxIndex] - if maxSig == 0.0: - maxGene = 'NONE' - maxSig = 'NONE' - except ValueError: - if len(geneList) == 1: - maxGene = geneList[0] - maxSig = 'NONE' - else: - maxGene = 'NONE' - maxSig = 'NONE' - else: - maxGene = 'NONE' - maxSig = 'NONE' - if noFormatTable: - newLine = line + [maxGene, maxSig] - else: - newLine = line[0:12] + [maxGene, maxSig] + line[-2:] - enhancerToTopGeneTable.append(newLine) - - # resort enhancerToGeneTable - if noFormatTable: - return enhancerToGeneTable, enhancerToTopGeneTable, geneToEnhancerTable - else: - enhancerOrder = utils.order([int(line[-2]) - for line in enhancerToGeneTable[1:]]) - sortedTable = [enhancerToGeneTable[0]] - sortedTopGeneTable = [enhancerToTopGeneTable[0]] - for i in enhancerOrder: - sortedTable.append(enhancerToGeneTable[(i + 1)]) - sortedTopGeneTable.append(enhancerToTopGeneTable[(i + 1)]) - - return sortedTable, sortedTopGeneTable, geneToEnhancerTable - - -#================================================================== -#=========================MAIN METHOD============================== -#================================================================== -def map(infile, genome, rankByBamFile, controlBamFile, geneList=None, out=None, window=None, formatTable=None): - ''' - main run call - ''' - - print(infile) - print(genome) - print(rankByBamFile) - print(controlBamFile) - print(geneList) - print(out) - print(window) - print(formatTable) - - window = 50000 - print('THE WINDOW IS REALLY AN INTEGER I SWEAR') - print(window) - - # from optparse import OptionParser - # usage = "usage: %prog [options] -g [GENOME] -i [INPUT_ENHANCER_FILE]" - # parser = OptionParser(usage=usage) - # # required flags - # parser.add_option("-i", "--i", dest="input", nargs=1, default=None, - # help="Enter a ROSE ranked enhancer or super-enhancer file") - # parser.add_option("-g", "--genome", dest="genome", nargs=1, default=None, - # help="Enter the genome build (MM9,MM8,HG18,HG19)") - - # # optional flags - # parser.add_option("-r", "--rankby", dest="rankby", nargs=1, default=None, - # help="Enter the bam used to rank enhancers") - # parser.add_option("-c", "--control", dest="control", nargs=1, default='', - # help="Enter a background bam for background correction") - - # parser.add_option("-l", "--list", dest="geneList", nargs=1, default=None, - # help="Enter a gene list to filter through") - # parser.add_option("-o", "--out", dest="out", nargs=1, default=None, - # help="Enter an output folder. Default will be same folder as input file") - # parser.add_option( - # "-w", "--window", dest="window", nargs=1, default=50000, - # help="Enter a search distance for genes. Default is 50,000bp") - # parser.add_option( - # "-f", "--format", dest="formatTable", action="store_true", default=False, - # help="If flagged, maintains original formatting of input table") - - # # RETRIEVING FLAGS - # (options, args) = parser.parse_args() - - # GETTING THE GENOME - - print('USING %s AS THE GENOME' % genome) - - # GETTING THE CORRECT ANNOT FILE - - genomeDict = { - 'HG18': '%shg18_refseq.ucsc' % (annotationDirectory), - 'MM9': '%smm9_refseq.ucsc' % (annotationDirectory), - 'HG19': '%shg19_refseq.ucsc' % (annotationDirectory), - 'MM8': '%smm8_refseq.ucsc' % (annotationDirectory), - 'MM10': '%smm10_refseq.ucsc' % (annotationDirectory), - 'RN4': '%srn4_refseq.ucsc' % (annotationDirectory), - 'RN6': '%srn6_refseq.ucsc' % (annotationDirectory), - } - - annotFile = genomeDict[genome.upper()] - - # GETTING THE INPUT - enhancerFile = infile - - # making the out folder if it doesn't exist - if out: - outFolder = utils.formatFolder(out, True) - else: - outFolder = join(enhancerFile.split('/')[0:-1], '/') + '/' - - # CHECK FORMATTING FLAG - if formatTable: - noFormatTable = True - else: - noFormatTable = False - - # GETTING THE TRANSCRIBED LIST - transcribedFile = geneList if geneList else '' - - if rankByBamFile: - print("MAPPING GENES TO ENHANCERS USING CLOSEST ACTIVE GENE") - enhancerToGeneTable, enhancerToTopGeneTable, geneToEnhancerTable = mapEnhancerToGeneTop( - rankByBamFile, controlBamFile, genome, annotFile, enhancerFile, transcribedFile, True, window, noFormatTable) - - # Writing enhancer output - enhancerFileName = enhancerFile.split('/')[-1].split('.')[0] - - if window != 50000: - # writing the enhancer table - - out1 = '%s%s_ENHANCER_TO_GENE_%sKB.txt' % ( - outFolder, enhancerFileName, window / 1000) - print("writing output to %s" % (out1)) - utils.unParseTable(enhancerToGeneTable, out1, '\t') - - # writing enhancer top gene table - out2 = '%s%s_ENHANCER_TO_TOP_GENE_%sKB.txt' % ( - outFolder, enhancerFileName, window / 1000) - utils.unParseTable(enhancerToTopGeneTable, out2, '\t') - - # writing the gene table - out3 = '%s%s_GENE_TO_ENHANCER_%sKB.txt' % ( - outFolder, enhancerFileName, window / 1000) - utils.unParseTable(geneToEnhancerTable, out3, '\t') - else: - # writing the enhancer table - out1 = '%s%s_ENHANCER_TO_GENE.txt' % (outFolder, enhancerFileName) - utils.unParseTable(enhancerToGeneTable, out1, '\t') - - # writing the enhancer table - out2 = '%s%s_ENHANCER_TO_TOP_GENE.txt' % (outFolder, enhancerFileName) - utils.unParseTable(enhancerToTopGeneTable, out2, '\t') - - # writing the gene table - out3 = '%s%s_GENE_TO_ENHANCER.txt' % (outFolder, enhancerFileName) - utils.unParseTable(geneToEnhancerTable, out3, '\t') - else: - #do traditional mapping - print("MAPPING GENES TO ENHANCERS USING PROXIMITY RULE") - enhancerToGeneTable,geneToEnhancerTable = mapEnhancerToGene(annotFile,enhancerFile,transcribedFile,True,window,noFormatTable) - - #Writing enhancer output - enhancerFileName = enhancerFile.split('/')[-1].split('.')[0] - - if window != 50000: - #writing the enhancer table - out1 = '%s%s_ENHANCER_TO_GENE_%sKB.txt' % (outFolder,enhancerFileName,window/1000) - utils.unParseTable(enhancerToGeneTable,out1,'\t') - - #writing the gene table - out2 = '%s%s_GENE_TO_ENHANCER_%sKB.txt' % (outFolder,enhancerFileName,window/1000) - utils.unParseTable(geneToEnhancerTable,out2,'\t') - else: - #writing the enhancer table - out1 = '%s%s_ENHANCER_TO_GENE.txt' % (outFolder,enhancerFileName) - utils.unParseTable(enhancerToGeneTable,out1,'\t') - - #writing the gene table - out2 = '%s%s_GENE_TO_ENHANCER.txt' % (outFolder,enhancerFileName) - utils.unParseTable(geneToEnhancerTable,out2,'\t') - diff --git a/rose2/resources/__init__.py b/rose2/resources/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/rose2/resources/genemapper.py b/rose2/resources/genemapper.py new file mode 100644 index 0000000..960ad80 --- /dev/null +++ b/rose2/resources/genemapper.py @@ -0,0 +1,852 @@ +"""GENEMAPPER FUNCTIONS.""" + +# Functions require bamliquidator and samtools + +# Main method wrapped script to take the enhancer region table output of ROSE_Main and map genes +# to it +# Will create two outputs a gene mapped region table where each row is an enhancer and a gene +# table where each row is a gene +# Does this by default for super-enhancers only +# Update to the gene mapper that finds nearest gene w/ highest signal +# Also switching to using the pipeline utils module as opposed to the stripped down ROSE_utils +# module + +import os +import subprocess +import sys + +from collections import defaultdict + +from rose2.definitions import ROOT_DIR +from rose2.resources import utils + +# ================================================================== +# ===========MAPPING GENES TO ENHANCERS WITHOUT BAM RANKING========= +# ================================================================== + + +# This is the traditional way of running gene mapper +def map_enhancer_to_gene(annot_file, enhancer_file, transcribed_file=None, unique_genes=True, + search_window=50000, no_format_table=False): + """Maps genes to enhancers. + + If unique_genes, reduces to gene name only. Otherwise, gives for each refseq. + + """ + start_dict = utils.make_start_dict(annot_file) + enhancer_table = utils.parse_table(enhancer_file, '\t') + + if transcribed_file: + transcribed_table = utils.parse_table(transcribed_file, '\t') + transcribed_genes = [line[1] for line in transcribed_table] + else: + transcribed_genes = start_dict.keys() + + print('MAKING TRANSCRIPT COLLECTION') + transcribed_collection = utils.make_transcript_collection( + annot_file, + 0, + 0, + 500, + transcribed_genes, + ) + + print('MAKING TSS COLLECTION') + tss_loci = [] + for gene_id in transcribed_genes: + tss_loci.append(utils.make_tss_locus(gene_id, start_dict, 0, 0)) + + # This turns the tss_loci list into a LocusCollection + # 50 is the internal parameter for LocusCollection and doesn't really matter + tss_collection = utils.LocusCollection(tss_loci, 50) + + gene_dict = {'overlapping': defaultdict(list), 'proximal': defaultdict(list)} + + # Dictionaries to hold ranks and super_status of gene nearby enhancers + rank_dict = defaultdict(list) + super_dict = defaultdict(list) + + # List of all genes that appear in this analysis + overall_gene_list = [] + + # Find the header + for line in enhancer_table: + if line[0][0] != '#': + header = line + print('This is the header') + print(header) + break + + if no_format_table: + # Set up the output tables + # First by enhancer + enhancer_to_gene_table = [ + header + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE'] + ] + + else: + # Set up the output tables + # First by enhancer + enhancer_to_gene_table = [ + header[0:9] + + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE'] + + header[-2:] + ] + + # Next by gene + gene_to_enhancer_table = [['GENE_NAME', 'REFSEQ_ID', 'PROXIMAL_ENHANCERS']] + + # Next make the gene to enhancer table + gene_to_enhancer_table = [ + [ + 'GENE_NAME', + 'REFSEQ_ID', + 'PROXIMAL_ENHANCERS', + 'ENHANCER_RANKS', + 'IS_SUPER', + 'TSS_SIGNAL', + ] + ] + + for line in enhancer_table: + if line[0][0] == '#' or line[0][0] == 'R': + continue + + enhancer_string = '{}:{}-{}'.format(line[1], line[2], line[3]) + + enhancer_locus = utils.Locus(line[1], line[2], line[3], '.', line[0]) + + # Overlapping genes are transcribed genes whose transcript is directly in the + # stitched locus + overlapping_loci = transcribed_collection.get_overlap(enhancer_locus, 'both') + overlapping_genes = [] + for overlap_locus in overlapping_loci: + overlapping_genes.append(overlap_locus.id) + + # Proximal_genes are transcribed genes where the tss is within 50kb of the boundary of the + # stitched loci + proximal_loci = tss_collection.get_overlap( + utils.make_search_locus(enhancer_locus, search_window, search_window), + 'both', + ) + proximal_genes = [] + for prox_locus in proximal_loci: + proximal_genes.append(prox_locus.id) + + distal_loci = tss_collection.get_overlap( + utils.make_search_locus(enhancer_locus, 1000000, 1000000), + 'both', + ) + distal_genes = [] + for prox_locus in distal_loci: + distal_genes.append(prox_locus.id) + + overlapping_genes = utils.uniquify(overlapping_genes) + proximal_genes = utils.uniquify(proximal_genes) + distal_genes = utils.uniquify(distal_genes) + all_enhancer_genes = overlapping_genes + proximal_genes + distal_genes + + # These checks make sure each gene list is unique + # Technically it is possible for a gene to be overlapping, but not proximal since the gene + # could be longer than the 50kb window, but we'll let that slide here + for ref_id in overlapping_genes: + if proximal_genes.count(ref_id) == 1: + proximal_genes.remove(ref_id) + + for ref_id in proximal_genes: + if distal_genes.count(ref_id) == 1: + distal_genes.remove(ref_id) + + # Now find the closest gene + if not all_enhancer_genes: + closest_gene = '' + else: + # Get enhancer_center + enhancer_center = (int(line[2]) + int(line[3])) / 2 + + # Get absolute distance to enhancer center + dist_list = [ + abs( + enhancer_center - start_dict[gene_id]['start'][0] + ) for gene_id in all_enhancer_genes + ] + # Get the ID and convert to name + closest_gene = start_dict[ + all_enhancer_genes[dist_list.index(min(dist_list))] + ]['name'] + + # Now write the row for the enhancer table + if no_format_table: + new_enhancer_line = list(line) + new_enhancer_line.append( + ','.join(utils.uniquify([start_dict[x]['name'] for x in overlapping_genes])) + ) + new_enhancer_line.append( + ','.join(utils.uniquify([start_dict[x]['name'] for x in proximal_genes])) + ) + new_enhancer_line.append(closest_gene) + + else: + new_enhancer_line = line[0:9] + new_enhancer_line.append( + ','.join(utils.uniquify([start_dict[x]['name'] for x in overlapping_genes])) + ) + new_enhancer_line.append( + ','.join(utils.uniquify([start_dict[x]['name'] for x in proximal_genes])) + ) + new_enhancer_line.append(closest_gene) + new_enhancer_line += line[-2:] + + enhancer_to_gene_table.append(new_enhancer_line) + + # Now grab all overlapping and proximal genes for the gene ordered table + overall_gene_list += overlapping_genes + for ref_id in overlapping_genes: + gene_dict['overlapping'][ref_id].append(enhancer_string) + rank_dict[ref_id].append(int(line[-2])) + super_dict[ref_id].append(int(line[-1])) + + overall_gene_list += proximal_genes + for ref_id in proximal_genes: + gene_dict['proximal'][ref_id].append(enhancer_string) + rank_dict[ref_id].append(int(line[-2])) + super_dict[ref_id].append(int(line[-1])) + + # End loop through + + # Make table by gene + overall_gene_list = utils.uniquify(overall_gene_list) + + # Use enhancer rank to order + rank_order = utils.order([min(rank_dict[x]) for x in overall_gene_list]) + + used_names = [] + for i in rank_order: + ref_id = overall_gene_list[i] + gene_name = start_dict[ref_id]['name'] + if used_names.count(gene_name) and unique_genes: + continue + else: + used_names.append(gene_name) + + prox_enhancers = gene_dict['overlapping'][ref_id] + gene_dict['proximal'][ref_id] + + super_status = max(super_dict[ref_id]) + enhancer_ranks = ','.join([str(x) for x in rank_dict[ref_id]]) + + new_line = [gene_name, ref_id, ','.join(prox_enhancers), enhancer_ranks, super_status] + gene_to_enhancer_table.append(new_line) + + # Resort enhancer_to_gene_table + if no_format_table: + return enhancer_to_gene_table, gene_to_enhancer_table + else: + enhancer_order = utils.order([int(line[-2]) for line in enhancer_to_gene_table[1:]]) + sorted_table = [enhancer_to_gene_table[0]] + for i in enhancer_order: + sorted_table.append(enhancer_to_gene_table[(i+1)]) + + return sorted_table, gene_to_enhancer_table + + +# ================================================================== +# ===========MAPPING GENES TO ENHANCERS WITH BAM RANKING============ +# ================================================================== + +def make_signal_dict(mapped_gff_file, control_mapped_gff_file=''): + """Makes a signal dict.""" + print( + '\t called make_signal_dict on {} (ctrl: {})' + ''.format(mapped_gff_file, control_mapped_gff_file) + ) + signal_dict = defaultdict(float) + + mapped_gff = utils.parse_table(mapped_gff_file, '\t') + if control_mapped_gff_file: + control_gff = utils.parse_table(control_mapped_gff_file, '\t') + + for i in range(1, len(mapped_gff)): + signal = float(mapped_gff[i][2]) - float(control_gff[i][2]) + if signal < 0: + signal = 0.0 + signal_dict[mapped_gff[i][0]] = signal + else: + for i in range(1, len(mapped_gff)): + signal = float(mapped_gff[i][2]) + signal_dict[mapped_gff[i][0]] = signal + + return signal_dict + + +def map_enhancer_to_gene_top(rank_by_bam_file, control_bam_file, genome, annot_file, + enhancer_file, transcribed_file=None, unique_genes=True, + search_window=50000, no_format_table=False): + """Maps genes to enhancers. + + If unique_genes, reduces to gene name only. Otherwise, gives for each refseq. + + """ + start_dict = utils.make_start_dict(annot_file) + enhancer_name = enhancer_file.split('/')[-1].split('.')[0] + enhancer_table = utils.parse_table(enhancer_file, '\t') + + if transcribed_file: + transcribed_table = utils.parse_table(transcribed_file, '\t') + transcribed_genes = [line[1] for line in transcribed_table] + else: + transcribed_genes = start_dict.keys() + + print('MAKING TRANSCRIPT COLLECTION') + transcribed_collection = utils.make_transcript_collection( + annot_file, 0, 0, 500, transcribed_genes) + + print('MAKING TSS COLLECTION') + tss_loci = [] + for gene_id in transcribed_genes: + tss_loci.append(utils.make_tss_locus(gene_id, start_dict, 0, 0)) + + # This turns the tss_loci list into a LocusCollection + # 50 is the internal parameter for LocusCollection and doesn't really matter + tss_collection = utils.LocusCollection(tss_loci, 50) + + gene_dict = {'overlapping': defaultdict(list), 'proximal': defaultdict(list)} + + # Dictionaries to hold ranks and super_status of gene nearby enhancers + rank_dict = defaultdict(list) + super_dict = defaultdict(list) + + # List of all genes that appear in this analysis + overall_gene_list = [] + + # Find the header + for line in enhancer_table: + if line[0][0] != '#': + header = line + print('this is the header') + print(header) + break + + if no_format_table: + # Set up the output tables + # First by enhancer + enhancer_to_gene_table = [ + header + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE']] + + else: + # Set up the output tables + # First by enhancer + enhancer_to_gene_table = [ + header[0:9] + ['OVERLAP_GENES', 'PROXIMAL_GENES', 'CLOSEST_GENE'] + header[-2:]] + + # Next make the gene to enhancer table + gene_to_enhancer_table = [ + [ + 'GENE_NAME', + 'REFSEQ_ID', + 'PROXIMAL_ENHANCERS', + 'ENHANCER_RANKS', + 'IS_SUPER', + 'ENHANCER_SIGNAL', + ] + ] + + for line in enhancer_table: + if line[0][0] == '#' or line[0][0] == 'R': + continue + + enhancer_string = '{}:{}-{}'.format(line[1], line[2], line[3]) + + enhancer_locus = utils.Locus(line[1], line[2], line[3], '.', line[0]) + + # Overlapping genes are transcribed genes whose transcript is directly in the + # stitched_locus + overlapping_loci = transcribed_collection.get_overlap(enhancer_locus, 'both') + overlapping_genes = [] + for overlap_locus in overlapping_loci: + overlapping_genes.append(overlap_locus.id) + + # Proximal_genes are transcribed genes where the tss is within 50kb of the boundary of the + # stitched loci + proximal_loci = tss_collection.get_overlap( + utils.make_search_locus(enhancer_locus, search_window, search_window), + 'both', + ) + proximal_genes = [] + for prox_locus in proximal_loci: + proximal_genes.append(prox_locus.id) + + distal_loci = tss_collection.get_overlap( + utils.make_search_locus(enhancer_locus, 1000000, 1000000), + 'both', + ) + distal_genes = [] + for prox_locus in distal_loci: + distal_genes.append(prox_locus.id) + + overlapping_genes = utils.uniquify(overlapping_genes) + proximal_genes = utils.uniquify(proximal_genes) + distal_genes = utils.uniquify(distal_genes) + all_enhancer_genes = overlapping_genes + proximal_genes + distal_genes + # These checks make sure each gene list is unique + # Technically it is possible for a gene to be overlapping, but not proximal since the gene + # could be longer than the 50kb window, but we'll let that slide here + for ref_id in overlapping_genes: + if proximal_genes.count(ref_id) == 1: + proximal_genes.remove(ref_id) + + for ref_id in proximal_genes: + if distal_genes.count(ref_id) == 1: + distal_genes.remove(ref_id) + + # Now find the closest gene + if not all_enhancer_genes: + closest_gene = '' + else: + # Get enhancer_center + enhancer_center = (int(line[2]) + int(line[3])) / 2 + + # Get absolute distance to enhancer center + dist_list = [abs(enhancer_center - start_dict[gene_id]['start'][0]) + for gene_id in all_enhancer_genes] + # Get the ID and convert to name + closest_gene = start_dict[all_enhancer_genes[dist_list.index(min(dist_list))]]['name'] + + # Now write the row for the enhancer table + if no_format_table: + new_enhancer_line = list(line) + new_enhancer_line.append( + ','.join(utils.uniquify([start_dict[x]['name'] for x in overlapping_genes])) + ) + new_enhancer_line.append( + ','.join(utils.uniquify([start_dict[x]['name'] for x in proximal_genes])) + ) + new_enhancer_line.append(closest_gene) + + else: + new_enhancer_line = line[0:9] + new_enhancer_line.append( + ','.join(utils.uniquify([start_dict[x]['name'] for x in overlapping_genes])) + ) + new_enhancer_line.append( + ','.join(utils.uniquify([start_dict[x]['name'] for x in proximal_genes])) + ) + new_enhancer_line.append(closest_gene) + new_enhancer_line += line[-2:] + + enhancer_to_gene_table.append(new_enhancer_line) + + # Now grab all overlapping and proximal genes for the gene ordered table + overall_gene_list += overlapping_genes + for ref_id in overlapping_genes: + gene_dict['overlapping'][ref_id].append(enhancer_string) + rank_dict[ref_id].append(int(line[-2])) + super_dict[ref_id].append(int(line[-1])) + + overall_gene_list += proximal_genes + for ref_id in proximal_genes: + gene_dict['proximal'][ref_id].append(enhancer_string) + rank_dict[ref_id].append(int(line[-2])) + super_dict[ref_id].append(int(line[-1])) + + # End loop through + # Make table by gene + print('MAKING ENHANCER ASSOCIATED GENE TSS COLLECTION') + overall_gene_list = utils.uniquify(overall_gene_list) + + # Get the chrom_lists from the various bams here + cmd = 'samtools idxstats {}'.format(rank_by_bam_file) + idx_stats = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True) + idx_stats = idx_stats.communicate() + bam_chrom_list = [ + line.split('\t')[0] for line in idx_stats[0].decode('utf-8').split('\n')[0:-2] + ] + + if control_bam_file: + cmd = 'samtools idxstats {}'.format(control_bam_file) + idx_stats = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True) + idx_stats = idx_stats.communicate() + bam_chrom_list_control = [ + line.split('\t')[0] for line in idx_stats[0].decode('utf-8').split('\n')[0:-2] + ] + bam_chrom_list = [ + chrom for chrom in bam_chrom_list if bam_chrom_list_control.count(chrom) != 0 + ] + + # Now make sure no genes have a bad chrom + overall_gene_list = [ + gene for gene in overall_gene_list if bam_chrom_list.count(start_dict[gene]['chr']) != 0 + ] + + # Now make an enhancer collection of all transcripts + enhancer_gene_collection = utils.make_transcript_collection( + annot_file, + 5000, + 5000, + 500, + overall_gene_list, + ) + + enhancer_gene_gff = utils.locus_collection_to_gff(enhancer_gene_collection) + + # Dump the gff to file + enhancer_folder = utils.get_parent_folder(enhancer_file) + gff_root_name = "{}_TSS_ENHANCER_GENES_-5000_+5000".format(genome) + enhancer_gene_gff_file = "{}{}_{}.gff".format(enhancer_folder, enhancer_name, gff_root_name) + utils.unparse_table(enhancer_gene_gff, enhancer_gene_gff_file, '\t') + + # Now we need to run bam_to_gff + + # Try to use the bamliquidatior_path.py script on cluster, otherwise, failover to local + # (in path), otherwise fail + bamliquidator_path = 'bamliquidator_batch' + + print('MAPPING SIGNAL AT ENHANCER ASSOCIATED GENE TSS') + # Map density at genes in the +/- 5kb tss region + # First on the rank_by bam + bam_name = rank_by_bam_file.split('/')[-1] + mapped_rank_by_folder = "{}{}_{}_{}/".format( + enhancer_folder, + enhancer_name, + gff_root_name, + bam_name, + ) + mapped_rank_by_file = "{}{}_{}_{}/matrix.txt".format( + enhancer_folder, + enhancer_name, + gff_root_name, + bam_name, + ) + cmd = '{} --sense . -e 200 --match_bamToGFF -r {} -o {} {}'.format( + bamliquidator_path, + enhancer_gene_gff_file, + mapped_rank_by_folder, + rank_by_bam_file, + ) + print("Mapping rankby bam {}".format(rank_by_bam_file)) + print(cmd) + os.system(cmd) + + # Check for completion + if utils.check_output(mapped_rank_by_file, 0.2, 5): + print( + "SUCCESSFULLY MAPPED TO {} FROM BAM: {}" + "".format(enhancer_gene_gff_file, rank_by_bam_file) + ) + else: + print( + "ERROR: FAILED TO MAP {} FROM BAM: {}" + "".format(enhancer_gene_gff_file, rank_by_bam_file) + ) + sys.exit() + + # Next on the control bam if it exists + if control_bam_file: + control_name = control_bam_file.split('/')[-1] + mapped_control_folder = "{}{}_{}_{}/".format( + enhancer_folder, + enhancer_name, + gff_root_name, + control_name, + ) + mapped_control_file = "{}{}_{}_{}/matrix.txt".format( + enhancer_folder, + enhancer_name, + gff_root_name, + control_name, + ) + cmd = '{} --sense . -e 200 --match_bamToGFF -r {} -o {} {}'.format( + bamliquidator_path, + enhancer_gene_gff_file, + mapped_control_folder, + control_bam_file, + ) + print("Mapping control bam {}".format(control_bam_file)) + print(cmd) + os.system(cmd) + + # Check for completion + if utils.check_output(mapped_control_file, 0.2, 5): + print( + "SUCCESSFULLY MAPPED TO {} FROM BAM: {}" + "".format(enhancer_gene_gff_file, control_bam_file) + ) + else: + print( + "ERROR: FAILED TO MAP {} FROM BAM: {}" + .format(enhancer_gene_gff_file, control_bam_file) + ) + sys.exit() + + # Now get the appropriate output files + if control_bam_file: + print( + "CHECKING FOR MAPPED OUTPUT AT {} AND {}" + "".format(mapped_rank_by_file, mapped_control_file) + ) + if (utils.check_output(mapped_rank_by_file, 1, 1) and + utils.check_output(mapped_control_file, 1, 1)): + print('MAKING ENHANCER ASSOCIATED GENE TSS SIGNAL DICTIONARIES') + signal_dict = make_signal_dict(mapped_rank_by_file, mapped_control_file) + else: + print("NO MAPPING OUTPUT DETECTED") + sys.exit() + else: + print("CHECKING FOR MAPPED OUTPUT AT {}".format(mapped_rank_by_file)) + if utils.check_output(mapped_rank_by_file, 1, 30): + print('MAKING ENHANCER ASSOCIATED GENE TSS SIGNAL DICTIONARIES') + signal_dict = make_signal_dict(mapped_rank_by_file) + else: + print("NO MAPPING OUTPUT DETECTED") + sys.exit() + + # Use enhancer rank to order + rank_order = utils.order([min(rank_dict[x]) for x in overall_gene_list]) + + used_names = [] + + # Make a new dict to hold TSS signal by max per gene_name + gene_name_sig_dict = defaultdict(list) + print('MAKING GENE TABLE') + for i in rank_order: + ref_id = overall_gene_list[i] + gene_name = start_dict[ref_id]['name'] + if used_names.count(gene_name) and unique_genes: + continue + else: + used_names.append(gene_name) + + prox_enhancers = gene_dict['overlapping'][ref_id] + gene_dict['proximal'][ref_id] + + super_status = max(super_dict[ref_id]) + enhancer_ranks = ','.join([str(x) for x in rank_dict[ref_id]]) + + enhancer_signal = signal_dict[ref_id] + gene_name_sig_dict[gene_name].append(enhancer_signal) + + new_line = [ + gene_name, + ref_id, + ','.join(prox_enhancers), + enhancer_ranks, + super_status, + enhancer_signal + ] + gene_to_enhancer_table.append(new_line) + + print('MAKING ENHANCER TO TOP GENE TABLE') + + if no_format_table: + enhancer_to_top_gene_table = [ + enhancer_to_gene_table[0] + ['TOP_GENE', 'TSS_SIGNAL'] + ] + else: + enhancer_to_top_gene_table = [ + enhancer_to_gene_table[0][0:12] + + ['TOP_GENE', 'TSS_SIGNAL'] + + enhancer_to_gene_table[0][-2:] + ] + + for line in enhancer_to_gene_table[1:]: + gene_list = [] + if no_format_table: + gene_list += line[-3].split(',') + gene_list += line[-2].split(',') + + else: + gene_list += line[10].split(',') + gene_list += line[11].split(',') + + gene_list = utils.uniquify([x for x in gene_list if x]) + if gene_list: + try: + sig_vector = [max(gene_name_sig_dict[x]) for x in gene_list] + max_index = sig_vector.index(max(sig_vector)) + max_gene = gene_list[max_index] + max_sig = sig_vector[max_index] + if max_sig == 0.0: + max_gene = 'NONE' + max_sig = 'NONE' + except ValueError: + if len(gene_list) == 1: + max_gene = gene_list[0] + max_sig = 'NONE' + else: + max_gene = 'NONE' + max_sig = 'NONE' + else: + max_gene = 'NONE' + max_sig = 'NONE' + if no_format_table: + new_line = line + [max_gene, max_sig] + else: + new_line = line[0:12] + [max_gene, max_sig] + line[-2:] + enhancer_to_top_gene_table.append(new_line) + + # Resort enhancer_to_gene_table + if no_format_table: + return enhancer_to_gene_table, enhancer_to_top_gene_table, gene_to_enhancer_table + else: + enhancer_order = utils.order( + [int(line[-2]) for line in enhancer_to_gene_table[1:]] + ) + sorted_table = [enhancer_to_gene_table[0]] + sorted_top_gene_table = [enhancer_to_top_gene_table[0]] + for i in enhancer_order: + sorted_table.append(enhancer_to_gene_table[(i + 1)]) + sorted_top_gene_table.append(enhancer_to_top_gene_table[(i + 1)]) + + return sorted_table, sorted_top_gene_table, gene_to_enhancer_table + + +# ================================================================= +# =========================MAP METHOD============================== +# ================================================================= + +def map(infile, genome, rank_by_bam_file, control_bam_file, gene_list=None, out=None, window=None, + format_table=None): + """Map method.""" + + print(infile) + print(genome) + print(rank_by_bam_file) + print(control_bam_file) + print(gene_list) + print(out) + print(window) + print(format_table) + + window = 50000 + print('THE WINDOW IS REALLY AN INTEGER I SWEAR') + print(window) + + # Getting the genome + print('USING {} AS THE GENOME'.format(genome)) + + # Getting the correct annot file + annotation_path = '{}/annotation'.format(ROOT_DIR) + genome_dict = { + 'HG18': '{}/hg18_refseq.ucsc'.format(annotation_path), + 'MM9': '{}/mm9_refseq.ucsc'.format(annotation_path), + 'HG19': '{}/hg19_refseq.ucsc'.format(annotation_path), + 'MM8': '{}/mm8_refseq.ucsc'.format(annotation_path), + 'MM10': '{}/mm10_refseq.ucsc'.format(annotation_path), + 'RN4': '{}/rn4_refseq.ucsc'.format(annotation_path), + 'RN6': '{}/rn6_refseq.ucsc'.format(annotation_path), + } + + annot_file = genome_dict[genome.upper()] + + # Getting the input + enhancer_file = infile + + # Making the out folder if it doesn't exist + if out: + out_folder = utils.format_folder(out, True) + else: + out_folder = '/'.join(enhancer_file.split('/')[0:-1]) + '/' + + # Check formatting flag + if format_table: + no_format_table = True + else: + no_format_table = False + + # Getting the transcribed list + transcribed_file = gene_list if gene_list else '' + + if rank_by_bam_file: + print("MAPPING GENES TO ENHANCERS USING CLOSEST ACTIVE GENE") + ( + enhancer_to_gene_table, + enhancer_to_top_gene_table, + gene_to_enhancer_table, + ) = map_enhancer_to_gene_top( + rank_by_bam_file, + control_bam_file, + genome, annot_file, + enhancer_file, + transcribed_file, + True, + window, + no_format_table, + ) + + # Writing enhancer output + enhancer_file_name = enhancer_file.split('/')[-1].split('.')[0] + + if window != 50000: + # Writing the enhancer table + out1 = '{}{}_ENHANCER_TO_GENE_{}KB.txt'.format( + out_folder, + enhancer_file_name, + window / 1000, + ) + print("writing output to {}".format(out1)) + utils.unparse_table(enhancer_to_gene_table, out1, '\t') + + # Writing enhancer top gene table + out2 = '{}{}_ENHANCER_TO_TOP_GENE_{}KB.txt'.format( + out_folder, + enhancer_file_name, + window / 1000, + ) + utils.unparse_table(enhancer_to_top_gene_table, out2, '\t') + + # Writing the gene table + out3 = '{}{}_GENE_TO_ENHANCER_{}KB.txt'.format( + out_folder, + enhancer_file_name, + window / 1000, + ) + utils.unparse_table(gene_to_enhancer_table, out3, '\t') + else: + # Writing the enhancer table + out1 = '{}{}_ENHANCER_TO_GENE.txt'.format(out_folder, enhancer_file_name) + utils.unparse_table(enhancer_to_gene_table, out1, '\t') + + # Writing the enhancer table + out2 = '{}{}_ENHANCER_TO_TOP_GENE.txt'.format(out_folder, enhancer_file_name) + utils.unparse_table(enhancer_to_top_gene_table, out2, '\t') + + # Writing the gene table + out3 = '{}{}_GENE_TO_ENHANCER.txt'.format(out_folder, enhancer_file_name) + utils.unparse_table(gene_to_enhancer_table, out3, '\t') + else: + # Do traditional mapping + print("MAPPING GENES TO ENHANCERS USING PROXIMITY RULE") + enhancer_to_gene_table, gene_to_enhancer_table = map_enhancer_to_gene( + annot_file, + enhancer_file, + transcribed_file, + True, + window, + no_format_table, + ) + + # Writing enhancer output + enhancer_file_name = enhancer_file.split('/')[-1].split('.')[0] + + if window != 50000: + # Writing the enhancer table + out1 = '{}{}_ENHANCER_TO_GENE_{}KB.txt'.format( + out_folder, + enhancer_file_name, + window / 1000, + ) + utils.unparse_table(enhancer_to_gene_table, out1, '\t') + + # Writing the gene table + out2 = '{}{}_GENE_TO_ENHANCER_{}KB.txt'.format( + out_folder, + enhancer_file_name, + window / 1000, + ) + utils.unparse_table(gene_to_enhancer_table, out2, '\t') + else: + # Writing the enhancer table + out1 = '{}{}_ENHANCER_TO_GENE.txt'.format(out_folder, enhancer_file_name) + utils.unparse_table(enhancer_to_gene_table, out1, '\t') + + # Writing the gene table + out2 = '{}{}_GENE_TO_ENHANCER.txt'.format(out_folder, enhancer_file_name) + utils.unparse_table(gene_to_enhancer_table, out2, '\t') diff --git a/rose2/resources/rose2_utils.py b/rose2/resources/rose2_utils.py new file mode 100644 index 0000000..0f65024 --- /dev/null +++ b/rose2/resources/rose2_utils.py @@ -0,0 +1,348 @@ +"""ROSE2 UTILITY FUNCTIONS.""" + +# Functions require samtools + +import copy +import numpy +import subprocess +import sys + +from collections import defaultdict +from rose2.resources import utils + +# ================================================================== +# =====================HELPER FUNCTIONS============================= +# ================================================================== + + +def check_ref_collection(reference_collection): + """Makes sure the names of all loci in the reference collection are unique.""" + names_list = [locus.id for locus in reference_collection.get_loci()] + + if len(names_list) != len(utils.uniquify(names_list)): + print("ERROR: REGIONS HAVE NON-UNIQUE IDENTIFIERS") + sys.exit() + else: + print("REFERENCE COLLECTION PASSES QC") + return + + +def get_bam_chrom_list(bam_file_list): + """Gets the consensus list of chromosomes mapped by the bams.""" + # Start w/ the first bam + cmd = 'samtools idxstats {}'.format(bam_file_list[0]) + idx_stats = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True) + idx_stats = idx_stats.communicate() + final_chrom_list = [ + line.split('\t')[0] for line in idx_stats[0].decode('utf-8').split('\n')[0:-2] + ] + + # Now go through each additional bam + for bam_file in bam_file_list: + cmd = 'samtools idxstats {}'.format(bam_file) + idx_stats = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True) + idx_stats = idx_stats.communicate() + chrom_list = [ + line.split('\t')[0] for line in idx_stats[0].decode('utf-8').split('\n')[0:-2] + ] + final_chrom_list = [chrom for chrom in final_chrom_list if chrom_list.count(chrom)] + + return utils.uniquify(final_chrom_list) + + +def filter_gff(gff_file, chrom_list): + """Takes in a gff and filters out all lines that don't belong to a chrom in the chrom_list.""" + gff = utils.parse_table(gff_file, '\t') + filtered_gff = [] + exclude_list = [] + for line in gff: + if chrom_list.count(line[0]) == 1: + filtered_gff.append(line) + else: + exclude_list.append(line[0]) + + exclude_list = utils.uniquify(exclude_list) + if exclude_list: + print("EXCLUDED GFF REGIONS FROM THE FALLING CHROMS: {}".format(','.join(exclude_list))) + + return filtered_gff + + +# ================================================================== +# =====================REGION STITCHING============================= +# ================================================================== + +def optimize_stitching(locus_collection, name, out_folder, step_size=500): + """Takes a locus collection and starts writing out stitching stats at step sized intervals.""" + max_stitch = 5000 # set a hard wired match stitching parameter + + stitch_table = [ + [ + 'STEP', + 'NUM_REGIONS', + 'TOTAL_CONSTIT', + 'TOTAL_REGION', + 'MEAN_CONSTIT', + 'MEDIAN_CONSTIT', + 'MEAN_REGION', + 'MEDIAN_REGION', + 'MEAN_STITCH_FRACTION', + 'MEDIAN_STITCH_FRACTION', + ] + ] + # First consolidate the collection + locus_collection = locus_collection.stitch_collection(stitch_window=0) + total_constit = sum([locus.len() for locus in locus_collection.get_loci()]) + step = 0 + while step <= max_stitch: + print("Getting stitch stats for {} (bp)".format(step)) + stitch_collection = locus_collection.stitch_collection(stitch_window=step) + num_regions = len(stitch_collection) + stitch_loci = stitch_collection.get_loci() + region_lengths = [locus.len() for locus in stitch_loci] + total_region = sum(region_lengths) + constit_lengths = [] + for locus in stitch_loci: + + constit_loci = locus_collection.get_overlap(locus) + constit_lengths.append(sum([locus.len() for locus in constit_loci])) + + mean_constit = round(numpy.mean(constit_lengths), 2) + median_constit = round(numpy.median(constit_lengths), 2) + + mean_region = round(numpy.mean(region_lengths), 2) + median_region = round(numpy.median(region_lengths), 2) + + stitch_fractions = [ + float(constit_length) / float(region_length) + for constit_length, region_length in zip(constit_lengths, region_lengths) + ] + mean_stitch_fraction = round(numpy.mean(stitch_fractions), 2) + median_stitch_fraction = round(numpy.median(stitch_fractions), 2) + + new_line = [ + step, + num_regions, + total_constit, + total_region, + mean_constit, + median_constit, + mean_region, + median_region, + mean_stitch_fraction, + median_stitch_fraction, + ] + + stitch_table.append(new_line) + + step += step_size + + # Write the stitch table to disk + stitch_param_file = '{}{}_stitch_params.tmp'.format(out_folder, name) + utils.unparse_table(stitch_table, stitch_param_file, '\t') + # Call the rscript + r_cmd = 'ROSE2_stitchOpt.R {} {} {}'.format(stitch_param_file, out_folder, name) + print(r_cmd) + # Get back the stitch parameter + r_output = subprocess.Popen(r_cmd, stdout=subprocess.PIPE, shell=True) + r_output_test = r_output.communicate() + + print(r_output_test) + + stitch_param = r_output_test[0].decode('utf-8').split('\n')[2] + try: + stitch_param = int(stitch_param) + except ValueError: + print("INVALID STITCHING PARAMETER. STITCHING OPTIMIZATION FAILED") + sys.exit() + + return stitch_param + + +def region_stitching(reference_collection, name, out_folder, stitch_window, tss_window, + annot_file, remove_tss=True): + """Region stitching.""" + print('PERFORMING REGION STITCHING') + # First have to turn bound region file into a locus collection + + # Need to make sure this names correctly... Each region should have a unique name + # reference_collection + + debug_output = [] + # Filter out all bound regions that overlap the TSS of an ACTIVE GENE + if remove_tss: + print('REMOVING TSS FROM REGIONS USING AN EXCLUSION WINDOW OF {}BP'.format(tss_window)) + + # First make a locus collection of TSS + start_dict = utils.make_start_dict(annot_file) + + # Now make_TSS loci for active genes + remove_ticker = 0 + # This loop makes a locus centered around +/- tss_window of transcribed genes then adds it + # to the list tss_loci + tss_loci = [] + for gene_id in start_dict.keys(): + tss_loci.append(utils.make_tss_locus(gene_id, start_dict, tss_window, tss_window)) + + # This turns the tss_loci list into a LocusCollection + # 50 is the internal parameter for LocusCollection and doesn't really matter + tss_collection = utils.LocusCollection(tss_loci, 50) + + # Gives all the loci in reference_collection + bound_loci = reference_collection.get_loci() + + # This loop will check if each bound region is contained by the TSS exclusion zone + # This will drop out a lot of the promoter only regions that are tiny + # Typical exclusion window is around 2kb + for locus in bound_loci: + if tss_collection.get_containers(locus, 'both'): + # If true, the bound locus overlaps an active gene + reference_collection.remove(locus) + debug_output.append([str(locus), locus.id, 'CONTAINED']) + remove_ticker += 1 + print('REMOVED {} LOCI BECAUSE THEY WERE CONTAINED BY A TSS'.format(remove_ticker)) + + # Reference_collection is now all enriched region loci that don't overlap an active TSS + + if not stitch_window: + print('DETERMINING OPTIMUM STITCHING PARAMTER') + opt_collection = copy.deepcopy(reference_collection) + stitch_window = optimize_stitching(opt_collection, name, out_folder, step_size=500) + print('USING A STITCHING PARAMETER OF {}'.format(stitch_window)) + stitched_collection = reference_collection.stitch_collection(stitch_window, 'both') + + if remove_tss: + # Now replace any stitched region that overlap 2 distinct genes with the original loci + # that were there + fixed_loci = [] + tss_loci = [] + for gene_id in start_dict.keys(): + tss_loci.append(utils.make_tss_locus(gene_id, start_dict, 50, 50)) + + # This turns the tss_loci list into a LocusCollection + # 50 is the internal parameter for LocusCollection and doesn't really matter + tss_collection = utils.LocusCollection(tss_loci, 50) + remove_ticker = 0 + original_ticker = 0 + for stitched_locus in stitched_collection.get_loci(): + overlapping_tss_loci = tss_collection.get_overlap(stitched_locus, 'both') + tss_names = [start_dict[tss_locus.id]['name'] for tss_locus in overlapping_tss_loci] + tss_names = utils.uniquify(tss_names) + if len(tss_names) > 2: + # Stitched_collection.remove(stitched_locus) + original_loci = reference_collection.get_overlap(stitched_locus, 'both') + original_ticker += len(original_loci) + fixed_loci += original_loci + debug_output.append([str(stitched_locus), stitched_locus.id, 'MULTIPLE_TSS']) + remove_ticker += 1 + else: + fixed_loci.append(stitched_locus) + + print( + 'REMOVED {} STITCHED LOCI BECAUSE THEY OVERLAPPED MULTIPLE TSSs'.format(remove_ticker) + ) + print('ADDED BACK {} ORIGINAL LOCI'.format(original_ticker)) + fixed_collection = utils.LocusCollection(fixed_loci, 50) + return fixed_collection, debug_output, stitch_window + + else: + return stitched_collection, debug_output, stitch_window + + +# ================================================================== +# =====================REGION LINKING MAPPING======================= +# ================================================================== + +def map_collection(stitched_collection, reference_collection, bam_file_list, mapped_folder, + output, ref_name): + """Makes a table of factor density in a stitched locus. + + Also ranks table by number of loci stitched together. + + """ + print('FORMATTING TABLE') + loci = stitched_collection.get_loci() + + locus_table = [['REGION_ID', 'CHROM', 'START', 'STOP', 'NUM_LOCI', 'CONSTITUENT_SIZE']] + + loci_len_list = [] + + # Strip out any that are in chrY + for locus in list(loci): + if locus.chr == 'chrY': + loci.remove(locus) + + for locus in loci: + loci_len_list.append(locus.len()) + + len_order = utils.order(loci_len_list, decreasing=True) + ticker = 0 + for i in len_order: + ticker += 1 + if not ticker % 1000: + print(ticker) + locus = loci[i] + + # First get the size of the enriched regions within the stitched locus + ref_enrich_size = 0 + ref_overlapping_loci = reference_collection.get_overlap(locus, 'both') + for ref_locus in ref_overlapping_loci: + ref_enrich_size += ref_locus.len() + + try: + stitch_count = int(locus.id.split('_')[0]) + except ValueError: + stitch_count = 1 + coords = [int(x) for x in locus.coords()] + + locus_table.append( + [locus.id, locus.chr, min(coords), max(coords), stitch_count, ref_enrich_size] + ) + + print('GETTING MAPPED DATA') + print("USING A BAM FILE LIST:") + print(bam_file_list) + for bam_file in bam_file_list: + + bam_file_name = bam_file.split('/')[-1] + + print('GETTING MAPPING DATA FOR {}'.format(bam_file)) + # Assumes standard convention for naming enriched region gffs + + # Opening up the mapped GFF + print('OPENING {}{}_{}_MAPPED/matrix.txt'.format(mapped_folder, ref_name, bam_file_name)) + + mapped_gff = utils.parse_table( + '{}{}_{}_MAPPED/matrix.txt'.format(mapped_folder, ref_name, bam_file_name), + '\t', + ) + + signal_dict = defaultdict(float) + print('MAKING SIGNAL DICT FOR {}'.format(bam_file)) + mapped_loci = [] + for line in mapped_gff[1:]: + + chrom = line[1].split('(')[0] + start = int(line[1].split(':')[-1].split('-')[0]) + end = int(line[1].split(':')[-1].split('-')[1]) + mapped_loci.append(utils.Locus(chrom, start, end, '.', line[0])) + try: + signal_dict[line[0]] = float(line[2]) * (abs(end - start)) + except ValueError: + print('WARNING NO SIGNAL FOR LINE:') + print(line) + continue + + mapped_collection = utils.LocusCollection(mapped_loci, 500) + locus_table[0].append(bam_file_name) + + for i in range(1, len(locus_table)): + signal = 0.0 + line = locus_table[i] + line_locus = utils.Locus(line[1], line[2], line[3], '.') + overlapping_regions = mapped_collection.get_overlap(line_locus, sense='both') + for region in overlapping_regions: + signal += signal_dict[region.id] + locus_table[i].append(signal) + + utils.unparse_table(locus_table, output, '\t') diff --git a/rose2/resources/utils.py b/rose2/resources/utils.py new file mode 100755 index 0000000..bbd29e9 --- /dev/null +++ b/rose2/resources/utils.py @@ -0,0 +1,775 @@ +"""SET OF GENERAL UTILITY FUNCTIONS FOR SEQ DATA.""" + +# Locus and LocusCollection classes were generously provided by Graham Ruby and Charles Lin +# Additional functions are found from online sources and are noted in the comments + +import os +import gzip +import sys +import time + +# ================================================================== +# ==========================I/O FUNCTIONS=========================== +# ================================================================== + + +def open_plus(file_name, mode='r'): + """Open function that can handle gzip files.""" + if file_name.split('.')[-1] == 'gz': + return gzip.open(file_name, mode + 'b') + return open(file_name, mode) + + +def parse_table(file_name, sep, header=False, excel=False): + """Opens standard delimited files and outputs a nested list.""" + with open_plus(file_name) as infile: + line = infile.readline() + + # First figure out if this table came from excel and has \r for line breaks + if line.count('\r') > 0: + excel = True + + with open_plus(file_name) as infile: + if header: + header = infile.readline() # disposes of the header + + table = [] + if excel: + lines_raw = infile.readlines() + for line in lines_raw: + line = line.replace('\r', '') + line = line.rstrip() + table.append(line.split(sep)) + else: + for line in infile: + line = line.rstrip().split(sep) + table.append(line) + + return table + + +def unparse_table(table, output, sep): + """Transforms a 'parse_table' output into a text file.""" + with open_plus(output, 'w') as fh_out: + if not sep: + for i in table: + fh_out.write(str(i) + '\n') + else: + for line in table: + fh_out.write(sep.join([str(x) for x in line]) + '\n') + + +def format_bed(bed, output=''): + """Formats a bed file from UCSC or MACS into a WUSTL gateway compatible bed.""" + new_bed = [] + + if isinstance(bed, str): + bed = parse_table(bed, '\t') + + index_ticker = 1 + for line in bed: + new_line = line[0:4] + try: + strand = line[5] + except IndexError: + strand = '.' + new_line += [index_ticker, strand] + index_ticker += 1 + new_bed.append(new_line) + + if output: + unparse_table(new_bed, output, '\t') + else: + return new_bed + + +def bed_to_gff(bed, output=''): + """Turns a bed into a gff file.""" + if isinstance(bed, str): + bed = parse_table(bed, '\t') + + bed = format_bed(bed) + + # Determine if this is a long bed or a short bed + if len(bed[0]) == 6: # this is a full format bed + bed_style = 'long' + elif len(bed[0]) == 5: # this is the medium length bed with strand + bed_style = 'medium' + elif len(bed[0]) == 3: # this is the minimum length bed + bed_style = 'short' + else: + print('this is probably not actually a bed') + print(bed[0]) + print('exiting now because the bed is sad') + sys.exit() + print('this bed has {} columns and is a {} bed'.format(len(bed[0]), bed_style)) + + gff = [] + for line in bed: + if bed_style == 'long': + gff_line = [ + line[0], + line[3], + '', + line[1], + line[2], + line[4], + line[5], + '', + line[3], + ] + elif bed_style == 'medium': + gff_line = [line[0], '', '', line[1], line[2], '', line[4], '', ''] + else: + gff_line = [line[0], '', '', line[1], line[2], '', '.', '', ''] + gff.append(gff_line) + + if output: + unparse_table(gff, output, '\t') + else: + return gff + + +def format_folder(folder_name, create=False): + """Checks if a folder exists and returns a bool. + + If 'create=True' it creates a folder if it doesn't exist already. + + """ + if folder_name[-1] != '/': + folder_name += '/' + + try: + os.listdir(folder_name) + return folder_name + except OSError: + print('Folder {} does not exist'.format(folder_name)) + if create: + os.mkdir(folder_name) + return folder_name + return False + + +def check_output(file_name, wait_time=1, time_out=30): + """Checks for the presence of a file every N minutes. + + If it exists, returns True. + Default is 1 minute with a max timeOut of 30 minutes. + + """ + wait_time = int(wait_time * 60) + + time_out = int(time_out * 60) + + max_ticker = time_out / wait_time + ticker = 0 + + file_exists = False + while not file_exists: + try: + size1 = os.stat(file_name).st_size + time.sleep(.5) + size2 = os.stat(file_name).st_size + if size1 == size2: + file_exists = True + else: + time.sleep(wait_time) + ticker += 1 + except OSError: + time.sleep(wait_time) + ticker += 1 + if ticker == max_ticker: + break + + time.sleep(.1) + if file_exists: + return True + else: + print('OPERATION TIMED OUT. FILE {} NOT FOUND'.format(file_name)) + return False + + +def get_parent_folder(input_file): + """Returns the parent folder for any file.""" + parent_folder = '/'.join(input_file.split('/')[:-1]) + '/' + if parent_folder == '/': + return './' + else: + return parent_folder + + +# ================================================================== +# ===================ANNOTATION FUNCTIONS=========================== +# ================================================================== + + +def make_start_dict(annot_file, gene_list=[]): + """Makes a dictionary keyed by refseq ID. + + It contains information about chrom/start/stop/strand/common name. + + """ + if isinstance(gene_list, str): + gene_list = parse_table(gene_list, '\t') + gene_list = [line[0] for line in gene_list] + + if annot_file.upper().count('REFSEQ') == 1: + refseq_table, refseq_dict = import_refseq(annot_file) + if not gene_list: + gene_list = refseq_dict.keys() + + start_dict = {} + for gene in gene_list: + if gene not in refseq_dict: + continue + start_dict[gene] = {} + start_dict[gene]['sense'] = refseq_table[refseq_dict[gene][0]][3] + start_dict[gene]['chr'] = refseq_table[refseq_dict[gene][0]][2] + start_dict[gene]['start'] = get_tsss([gene], refseq_table, refseq_dict) + if start_dict[gene]['sense'] == '+': + start_dict[gene]['end'] = [int(refseq_table[refseq_dict[gene][0]][5])] + else: + start_dict[gene]['end'] = [int(refseq_table[refseq_dict[gene][0]][4])] + start_dict[gene]['name'] = refseq_table[refseq_dict[gene][0]][12] + + return start_dict + + +def get_tsss(gene_list, refseq_table, refseq_dict): + """Generic function to get the TSS of any gene.""" + if not gene_list: + refseq = refseq_table + else: + refseq = refseq_from_key(gene_list, refseq_dict, refseq_table) + tss = [] + for line in refseq: + if line[3] == '+': + tss.append(line[4]) + if line[3] == '-': + tss.append(line[5]) + tss = map(int, tss) + + return list(tss) + + +def import_refseq(refseq_file, return_multiples=False): + """Opens up a refseq file downloaded by UCSC. + + Takes in a refseq table and makes a refseq table and a refseq dictionary for keying the table. + + """ + refseq_table = parse_table(refseq_file, '\t') + refseq_dict = {} + ticker = 1 + for line in refseq_table[1:]: + if line[1] in refseq_dict: + refseq_dict[line[1]].append(ticker) + else: + refseq_dict[line[1]] = [ticker] + ticker = ticker + 1 + + multiples = [] + for i in refseq_dict: + if len(refseq_dict[i]) > 1: + multiples.append(i) + + if return_multiples: + return refseq_table, refseq_dict, multiples + else: + return refseq_table, refseq_dict + + +def refseq_from_key(refseq_key_list, refseq_dict, refseq_table): + """Function that grabs refseq lines from refseq IDs.""" + type_refseq = [] + for name in refseq_key_list: + if name in refseq_dict: + type_refseq.append(refseq_table[refseq_dict[name][0]]) + return type_refseq + + +def make_transcript_collection(annot_file, up_search, down_search, window=500, gene_list=[]): + """Makes a LocusCollection w/ each transcript as a locus. + + Takes in either a refseqfile or an ensemblGFF. + + """ + if annot_file.upper().count('REFSEQ') == 1: + refseq_table, refseq_dict = import_refseq(annot_file) + locus_list = [] + ticker = 0 + if not gene_list: + gene_list = refseq_dict.keys() + + for line in refseq_table[1:]: + if line[1] in gene_list: + if line[3] == '-': + locus = Locus( + line[2], + int(line[4]) - down_search, + int(line[5]) + up_search, + line[3], + line[1], + ) + else: + locus = Locus( + line[2], + int(line[4]) - up_search, + int(line[5]) + down_search, + line[3], + line[1], + ) + locus_list.append(locus) + ticker = ticker + 1 + if not ticker % 1000: + print(ticker) + + trans_collection = LocusCollection(locus_list, window) + return trans_collection + + +# ================================================================== +# ========================LOCUS INSTANCE============================ +# ================================================================== + +# Locus and LocusCollection instances courtesy of Graham Ruby + +class Locus(object): + """Standard locus class for tracking genomic loci.""" + __chr_dict = dict() + __sense_dict = {'+': '+', '-': '-', '.': '.'} + + def __init__(self, chr, start, end, sense, ID='', score=0): + """Initialize attributes.""" + coords = sorted([int(start), int(end)]) + # This method for assigning chromosome should help avoid storage of redundant strings. + if chr not in self.__chr_dict: + self.__chr_dict[chr] = chr + self.chr = self.__chr_dict[chr] + self.sense = self.__sense_dict[sense] + self.start = int(coords[0]) + self.end = int(coords[1]) + self.id = ID + self.score = score + + def len(self): + """get locus length.""" + return self.end - self.start + 1 + + def get_antisense_locus(self): + """Get antisense locus.""" + if self.sense == '.': + return self + switch = {'+': '-', '-': '+'} + return Locus(self.chr, self.start, self.end, switch[self.sense]) + + def coords(self): + """Returns a sorted list of the coordinates.""" + return [self.start, self.end] + + def overlaps(self, other_locus): + """Find if loci overlap. + + True if two loci share any coordinates in common. + + """ + if self.chr != other_locus.chr: + return False + if not(self.sense == '.' or + other_locus.sense == '.' or + self.sense == other_locus.sense): + return False + if self.start > other_locus.end or other_locus.start > self.end: + return False + return True + + def contains(self, other_locus): + """Find if locus contains other locus. + + True if all the nucleotides of the given locus overlap with the self locus. + + """ + if self.chr != other_locus.chr: + return False + if not(self.sense == '.' or + other_locus.sense == '.' or + self.sense == other_locus.sense): + return False + if self.start > other_locus.start or other_locus.end > self.end: + return False + return True + + def overlaps_antisense(self, other_locus): + """Same as overlaps, but considers the opposite strand.""" + return self.get_antisense_locus().overlaps(other_locus) + + def contains_antisense(self, other_locus): + """Same as contains, but considers the opposite strand.""" + return self.get_antisense_locus().contains(other_locus) + + def __hash__(self): + """Define hash.""" + return self.start + self.end + + def __eq__(self, other): + """Define Locus objects equality.""" + if self.__class__ != other.__class__: + return False + if self.chr != other.chr: + return False + if self.start != other.start: + return False + if self.end != other.end: + return False + if self.sense != other.sense: + return False + return True + + def __ne__(self, other): + """Opposite to equality.""" + return not self.__eq__(other) + + def __str__(self): + """Locus string representation.""" + return self.chr + '(' + self.sense + '):' + '-'.join(map(str, self.coords())) + + +class LocusCollection(object): + """A collection of locus objects used for querying large sets of loci.""" + def __init__(self, loci, window_size=50): + """Initialize attributes.""" + # Top-level keys are chr, then strand, no space + self.__chr_to_coord_to_loci = dict() + self.__loci = dict() + self.__win_size = window_size + for lcs in loci: + self.__add_locus(lcs) + + def __add_locus(self, lcs): + """Add locus to the collection.""" + if lcs not in self.__loci: + self.__loci[lcs] = None + if lcs.sense == '.': + chr_key_list = [lcs.chr + '+', lcs.chr + '-'] + else: + chr_key_list = [lcs.chr + lcs.sense] + for chr_key in chr_key_list: + if chr_key not in self.__chr_to_coord_to_loci: + self.__chr_to_coord_to_loci[chr_key] = dict() + for num in self.__get_key_range(lcs): + if num not in self.__chr_to_coord_to_loci[chr_key]: + self.__chr_to_coord_to_loci[chr_key][num] = [] + self.__chr_to_coord_to_loci[chr_key][num].append(lcs) + + def __get_key_range(self, locus): + """Get key range.""" + start = locus.start / self.__win_size + end = locus.end / self.__win_size + 1 # add 1 because of the range + return range(int(start), int(end)) + + def __len__(self): + """Define length.""" + return len(self.__loci) + + def append(self, new): + """Append locus.""" + self.__add_locus(new) + + def has_locus(self, locus): + """Check if locus in the loci dict""" + return locus in self.__loci + + def remove(self, old): + """Remove locus from the collection.""" + if old not in self.__loci: + raise ValueError("Requested locus isn't in collection.") + del self.__loci[old] + if old.sense == '.': + sense_list = ['+', '-'] + else: + sense_list = [old.sense] + for k in self.__get_key_range(old): + for sense in sense_list: + self.__chr_to_coord_to_loci[old.chr + sense][k].remove(old) + + def get_loci(self): + """Get loci.""" + return self.__loci.keys() + + def __subset_helper(self, locus, sense): + """""" + sense = sense.lower() + if ['sense', 'antisense', 'both'].count(sense) != 1: + raise ValueError("Sense command invalid: '{}'.".format(sense)) + matches = dict() + senses = ['+', '-'] + if locus.sense == '.' or sense == 'both': + def lamb(s): return True + elif sense == 'sense': + def lamb(s): return bool(s == locus.sense) + elif sense == 'antisense': + def lamb(s): return bool(s != locus.sense) + else: + raise ValueError("Sense value was inappropriate: '{}'.".format(sense)) + + for s in filter(lamb, senses): + chr_key = locus.chr + s + if chr_key in self.__chr_to_coord_to_loci: + for n in self.__get_key_range(locus): + if n in self.__chr_to_coord_to_loci[chr_key]: + for lcs in self.__chr_to_coord_to_loci[chr_key][n]: + matches[lcs] = None + + return matches.keys() + + def get_overlap(self, locus, sense='sense'): + """Returns all members of the collection that overlap the locus. + + :param sense: can be 'sense' (default), 'antisense', or 'both'. + + """ + matches = self.__subset_helper(locus, sense) + # Now, get rid of the ones that don't really overlap + real_matches = dict() + if sense == 'sense' or sense == 'both': + for i in filter(lambda lcs: lcs.overlaps(locus), matches): + real_matches[i] = None + if sense == 'antisense' or sense == 'both': + for i in filter(lambda lcs: lcs.overlaps_antisense(locus), matches): + real_matches[i] = None + return real_matches.keys() + + def get_containers(self, locus, sense='sense'): + """Returns all members of the collection that contain the locus. + + Sense can be 'sense' (default), 'antisense', or 'both'. + + """ + matches = self.__subset_helper(locus, sense) + # Now, get rid of the ones that don't really overlap + real_matches = dict() + if sense == 'sense' or sense == 'both': + for i in filter(lambda lcs: lcs.contains(locus), matches): + real_matches[i] = None + if sense == 'antisense' or sense == 'both': + for i in filter(lambda lcs: lcs.contains_antisense(locus), matches): + real_matches[i] = None + return real_matches.keys() + + def stitch_collection(self, stitch_window=1, sense='both'): + """Reduces the collection by stitching together overlapping loci. + + Returns a new collection. Initializing stichWindow to 1 helps collect directly adjacent + loci. + + :param sense: can be 'sense' (default), 'antisense', or 'both'. + + """ + locus_list = self.get_loci() + old_collection = LocusCollection(locus_list, 500) + + stitched_collection = LocusCollection([], 500) + + for locus in locus_list: + if old_collection.has_locus(locus): + old_collection.remove(locus) + overlapping_loci = old_collection.get_overlap( + Locus( + locus.chr, + locus.start - stitch_window, + locus.end + stitch_window, + locus.sense, + locus.id, + ), + sense, + ) + + stitch_ticker = 1 + while overlapping_loci: + stitch_ticker += len(overlapping_loci) + overlap_coords = locus.coords() + + for overlapping_locus in overlapping_loci: + overlap_coords += overlapping_locus.coords() + old_collection.remove(overlapping_locus) + overlap_coords = [int(x) for x in overlap_coords] + if sense == 'both': + locus = Locus( + locus.chr, + min(overlap_coords), + max(overlap_coords), + '.', + locus.id, + ) + else: + locus = Locus( + locus.chr, + min(overlap_coords), + max(overlap_coords), + locus.sense, + locus.id, + ) + overlapping_loci = old_collection.get_overlap( + Locus( + locus.chr, + locus.start - stitch_window, + locus.end + stitch_window, + locus.sense, + ), + sense + ) + locus.id = '{}_{}_lociStitched'.format(stitch_ticker, locus.id) + + stitched_collection.append(locus) + + else: + continue + + return stitched_collection + + +# ================================================================== +# ========================LOCUS FUNCTIONS=========================== +# ================================================================== + +def locus_collection_to_gff(locus_collection): + """Get a gff format list from a LocusCollection object.""" + loci_list = locus_collection.get_loci() + gff = [] + for locus in loci_list: + new_line = [ + locus.chr, + locus.id, + '', + locus.coords()[0], + locus.coords()[1], + '', + locus.sense, + '', + locus.id, + ] + gff.append(new_line) + return gff + + +def gff_to_locus_collection(gff, window=500): + """Opens up a gff file and turns it into a LocusCollection instance.""" + loci_list = [] + if isinstance(gff, str): + gff = parse_table(gff, '\t') + + for line in gff: + # Use line[1] as the locus ID. If that is empty use line[8] + if line[1]: + name = line[1] + elif line[8]: + name = line[8] + else: + name = '{}:{}:{}-{}'.format(line[0], line[6], line[3], line[4]) + + loci_list.append(Locus(line[0], line[3], line[4], line[6], name)) + return LocusCollection(loci_list, window) + + +def make_tss_locus(gene, start_dict, upstream, downstream): + """Given a start_dict, make a locus for any gene's TSS w/ upstream and downstream windows.""" + start = start_dict[gene]['start'][0] + if start_dict[gene]['sense'] == '-': + return Locus(start_dict[gene]['chr'], start - downstream, start + upstream, '-', gene) + else: + return Locus(start_dict[gene]['chr'], start - upstream, start + downstream, '+', gene) + + +def make_search_locus(locus, up_search, down_search): + """Takes a locus and expands it by a fixed upstream/downstream amount. + + Spits out the new larger locus. + + """ + if locus.sense == '-': + search_locus = Locus( + locus.chr, + locus.start - down_search, + locus.end + up_search, + locus.sense, + locus.id, + ) + else: + search_locus = Locus( + locus.chr, + locus.start - up_search, + locus.end + down_search, + locus.sense, + locus.id, + ) + return search_locus + + +# ================================================================== +# ========================MISC FUNCTIONS============================ +# ================================================================== + +# uniquify function +# by Peter Bengtsson +# Used under a creative commons license +# sourced from here: http://www.peterbe.com/plog/uniqifiers-benchmark + +def uniquify(seq, idfun=None): + """Makes a list unique. + + Order preserving. + + """ + seen = {} + result = [] + for item in seq: + if idfun is None: + marker = item + else: + marker = idfun(item) + if marker in seen: + continue + seen[marker] = 1 + result.append(item) + return result + + +# Taken from http://code.activestate.com/recipes/491268/ + +def order(x, none_is_last=True, decreasing=False): + """Returns the ordering of the elements of x. + + The list [ x[j] for j in order(x) ] is a sorted version of x. + + Missing values in x are indicated by None. If none_is_last is true, then missing values are + ordered to be at the end. Otherwise, they are ordered at the beginning. + + """ + omit_none = False + if none_is_last is None: + none_is_last = True + omit_none = True + + n = len(x) + ix = range(n) + if None not in x: + ix.sort(reverse=decreasing, key=lambda j: x[j]) + else: + # Handle None values properly. + def key(i, x=x): + elem = x[i] + # Valid values are True or False only. + if decreasing == none_is_last: + return not(elem is None), elem + else: + return elem is None, elem + ix = range(n) + ix.sort(key=key, reverse=decreasing) + + if omit_none: + n = len(x) + for i in range(n-1, -1, -1): + if x[ix[i]] is None: + n -= 1 + return ix[:n] + return ix diff --git a/rose2/rose2_main.py b/rose2/rose2_main.py new file mode 100644 index 0000000..304f1e8 --- /dev/null +++ b/rose2/rose2_main.py @@ -0,0 +1,352 @@ +#!/usr/bin/env python + +# PROGRAM TO STITCH TOGETHER REGIONS TO FORM ENHANCERS, MAP READ DENSITY TO STITCHED REGIONS, AND +# RANK ENHANCERS BY READ DENSITY TO DISCOVER SUPER-ENHANCERS + +import argparse +import sys +import time +import os + +from rose2.definitions import ROOT_DIR +from rose2.resources import genemapper, rose2_utils, utils + + +def parse_args(args=None): + """Argument parser.""" + parser = argparse.ArgumentParser( + usage=( + "rose2 [options]" + " -g [GENOME]" + " -i [INPUT_REGION_GFF]" + " -r [RANKBY_BAM_FILE]" + " -o [OUTPUT_FOLDER]" + " [OPTIONAL_FLAGS]" + ) + ) + + # Required flags + parser.add_argument("-i", "--i", dest="input", default=None, type=str, + help=( + "Enter a .gff or .bed file of binding sites used to make enhancers" + ), required=True) + parser.add_argument("-r", "--rankby", dest="rankby", default=None, type=str, + help="bamfile to rank enhancer by", required=True) + parser.add_argument("-o", "--out", dest="out", default=None, type=str, + help="Enter an output folder", required=True) + parser.add_argument("-g", "--genome", dest="genome", default=None, type=str, + help="Enter the genome build (MM9,MM8,HG18,HG19)", required=True) + + # Optional flags + parser.add_argument("-b", "--bams", dest="bams", default=None, type=str, + help="Enter a comma separated list of additional bam files to map to", + required=False) + parser.add_argument("-c", "--control", dest="control", default='', type=str, + help="bamfile to rank enhancer by", required=False) + parser.add_argument("-s", "--stitch", dest="stitch", default=None, type=str, + help=( + "Enter a max linking distance for stitching. Default will determine " + "optimal stitching parameter" + ), required=False) + parser.add_argument("-t", "--tss", dest="tss", default=0, type=int, + help="Enter a distance from TSS to exclude. 0 = no TSS exclusion", + required=False) + parser.add_argument("--mask", dest="mask", default=None, type=str, + help=( + "Mask a set of regions from analysis. Provide a .bed or .gff of " + "masking regions" + ), required=False) + + return parser.parse_args(args) + + +def rose(input_file, rankby, output_folder, genome, bams=None, control='', stitch=None, tss=0, + mask_file=None): + """ROSE2 main function.""" + debug = False + + # Making the out folder if it doesn't exist + out_folder = utils.format_folder(output_folder, True) + + # Figuring out folder schema + gff_folder = utils.format_folder(out_folder + 'gff/', True) + mapped_folder = utils.format_folder(out_folder + 'mappedGFF/', True) + + # Getting input file + if input_file.split('.')[-1] == 'bed': + # Converting a BED file + input_gff_name = input_file.split('/')[-1][0:-4] + input_gff_file = '{}{}.gff'.format(gff_folder, input_gff_name) + utils.bed_to_gff(input_file, input_gff_file) + elif input_file.split('.')[-1] == 'gff': + # Copy the input GFF to the GFF folder + input_gff_file = input_file + os.system('cp {} {}'.format(input_gff_file, gff_folder)) + else: + print('WARNING: INPUT FILE DOES NOT END IN .gff or .bed. ASSUMING .gff FILE FORMAT') + # Copy the input GFF to the GFF folder + input_gff_file = input_file + os.system('cp {} {}'.format(input_gff_file, gff_folder)) + + # Getting the list of BAM files to process + bam_file_list = [rankby] + if control: + bam_file_list.append(control) + + if bams: + bam_file_list += bams.split(',') + + for bam in bam_file_list: + if not os.path.isfile('{}.bai'.format(bam)): + print('INDEX FILE FOR {} IS MISSING'.format(bam)) + sys.exit() + + # Optional args + + # Stitch parameter + stitch_window = '' if not stitch else int(stitch) + + # TSS options + tss_window = int(tss) + remove_tss = True if not tss_window else False + + # Getting the Bound region file used to define enhancers + print('USING {} AS THE INPUT GFF'.format(input_gff_file)) + input_name = input_gff_file.split('/')[-1].split('.')[0] + + # Getting the genome + print('USING {} AS THE GENOME'.format(genome)) + + # Getting the correct annot file + annotation_path = '{}/annotation'.format(ROOT_DIR) + genome_dict = { + 'HG18': '{}/hg18_refseq.ucsc'.format(annotation_path), + 'MM9': '{}/mm9_refseq.ucsc'.format(annotation_path), + 'HG19': '{}/hg19_refseq.ucsc'.format(annotation_path), + 'MM8': '{}/mm8_refseq.ucsc'.format(annotation_path), + 'MM10': '{}/mm10_refseq.ucsc'.format(annotation_path), + 'RN4': '{}/rn4_refseq.ucsc'.format(annotation_path), + 'RN6': '{}/rn6_refseq.ucsc'.format(annotation_path), + } + + annot_file = genome_dict[genome.upper()] + + # Get chroms found in the bams + print('GETTING CHROMS IN BAMFILES') + bam_chrom_list = rose2_utils.get_bam_chrom_list(bam_file_list) + print("USING THE FOLLOWING CHROMS") + print(bam_chrom_list) + + # Loading in the GFF and filtering by chrom + print('LOADING AND FILTERING THE GFF') + input_gff = rose2_utils.filter_gff(input_gff_file, bam_chrom_list) + + # Loading in the bound region reference collection + print('LOADING IN GFF REGIONS') + reference_collection = utils.gff_to_locus_collection(input_gff) + + print('STARTING WITH {} INPUT REGIONS'.format(len(reference_collection))) + print('CHECKING REFERENCE COLLECTION:') + rose2_utils.check_ref_collection(reference_collection) + + # Masking reference collection + # See if there's a mask + if mask_file: + print('USING MASK FILE {}'.format(mask_file)) + # if it's a bed file + if mask_file.split('.')[-1].upper() == 'BED': + mask_gff = utils.bed_to_gff(mask_file) + elif mask_file.split('.')[-1].upper() == 'GFF': + mask_gff = utils.parse_table(mask_file, '\t') + else: + print("MASK MUST BE A .gff or .bed FILE") + + mask_collection = utils.gff_to_locus_collection(mask_gff) + print('LOADING {} MASK REGIONS'.format(len(mask_collection))) + + # Now mask the reference loci + reference_loci = reference_collection.get_loci() + filtered_loci = [ + locus for locus in reference_loci if not mask_collection.get_overlap(locus, 'both') + ] + print( + "FILTERED OUT {} LOCI THAT WERE MASKED IN {}" + "".format(len(reference_loci) - len(filtered_loci), mask_file) + ) + reference_collection = utils.LocusCollection(filtered_loci, 50) + + # Now stitch regions + print('STITCHING REGIONS TOGETHER') + stitched_collection, debug_output, stitch_window = rose2_utils.region_stitching( + reference_collection, + input_name, + out_folder, + stitch_window, + tss_window, + annot_file, + remove_tss, + ) + + # Now make a stitched collection GFF + print('MAKING GFF FROM STITCHED COLLECTION') + stitched_gff = utils.locus_collection_to_gff(stitched_collection) + # Making sure start/stop ordering are correct + for line in stitched_gff: + start = int(line[3]) + stop = int(line[4]) + if start > stop: + line[3] = stop + line[4] = start + + print(stitch_window) + print(type(stitch_window)) + if not remove_tss: + stitched_gff_file = '{}{}_{}KB_STITCHED.gff'.format( + gff_folder, + input_name, + str(stitch_window / 1000), + ) + stitched_gff_name = '{}_{}KB_STITCHED'.format(input_name, str(stitch_window / 1000)) + debug_out_file = '{}{}_{}KB_STITCHED.debug'.format( + gff_folder, + input_name, + str(stitch_window / 1000), + ) + else: + stitched_gff_file = '{}{}_{}KB_STITCHED_TSS_DISTAL.gff'.format( + gff_folder, + input_name, + str(stitch_window / 1000), + ) + stitched_gff_name = '{}_{}KB_STITCHED_TSS_DISTAL'.format( + input_name, + str(stitch_window / 1000), + ) + debug_out_file = '{}{}_{}KB_STITCHED_TSS_DISTAL.debug'.format( + gff_folder, + input_name, + str(stitch_window / 1000), + ) + + # Writing debug output to disk + if debug: + print('WRITING DEBUG OUTPUT TO DISK AS {}'.format(debug_out_file)) + utils.unparse_table(debug_output, debug_out_file, '\t') + + # Write the GFF to disk + print('WRITING STITCHED GFF TO DISK AS {}'.format(stitched_gff_file)) + utils.unparse_table(stitched_gff, stitched_gff_file, '\t') + + # Setting up the overall output file + output_file_1 = out_folder + stitched_gff_name + '_ENHANCER_REGION_MAP.txt' + print('OUTPUT WILL BE WRITTEN TO {}'.format(output_file_1)) + + # Try to use the bamliquidatior_path.py script on cluster, otherwise, failover to local + # (in path), otherwise fail + bamliquidator_path = 'bamliquidator_batch' + + bam_file_list_unique = list(bam_file_list) + bam_file_list_unique = utils.uniquify(bam_file_list_unique) + # Prevent redundant mapping + print("MAPPING TO THE FOLLOWING BAMS:") + print(bam_file_list_unique) + for bam_file in bam_file_list_unique: + + bam_file_name = bam_file.split('/')[-1] + + # Mapping to the stitched GFF + mapped_out_1_folder = '{}{}_{}_MAPPED'.format( + mapped_folder, + stitched_gff_name, + bam_file_name, + ) + mapped_out_1_file = '{}{}_{}_MAPPED/matrix.txt'.format( + mapped_folder, + stitched_gff_name, + bam_file_name, + ) + if utils.check_output(mapped_out_1_file, 0.2, 0.2): + print("FOUND {} MAPPING DATA FOR BAM: {}".format(stitched_gff_file, mapped_out_1_file)) + else: + cmd1 = "{} --sense . -e 200 --match_bamToGFF -r {} -o {} {}".format( + bamliquidator_path, + stitched_gff_file, + mapped_out_1_folder, + bam_file, + ) + print(cmd1) + + os.system(cmd1) + if utils.check_output(mapped_out_1_file, 0.2, 5): + print( + "SUCCESSFULLY MAPPED TO {} FROM BAM: {}" + "".format(stitched_gff_file, bam_file_name) + ) + else: + print( + "ERROR: FAILED TO MAP {} FROM BAM: {}" + "".format(stitched_gff_file, bam_file_name) + ) + sys.exit() + + print('BAM MAPPING COMPLETED NOW MAPPING DATA TO REGIONS') + # Calculate density by region + # TODO: Need to fix this function to account for different outputs of liquidator + rose2_utils.map_collection( + stitched_collection, + reference_collection, + bam_file_list, + mapped_folder, + output_file_1, + ref_name=stitched_gff_name, + ) + + print('CALLING AND PLOTTING SUPER-ENHANCERS') + + control_name = control.split('/')[-1] if control else 'NONE' + + cmd = 'ROSE2_callSuper.R {} {} {} {}'.format( + out_folder, + output_file_1, + input_name, + control_name, + ) + + print(cmd) + + os.system(cmd) + + # Calling the gene mapper + time.sleep(5) + tables = [ + "_SuperEnhancers.table.txt", + "_StretchEnhancers.table.txt", + "_SuperStretchEnhancers.table.txt", + ] + for table in tables: + table_file = "{}{}".format(input_name, table) + genemapper.map( + os.path.join(out_folder, table_file), + genome, + rankby, + control, + ) + + +def main(args=None): + """Main run call.""" + + # Retrieving flags + args = parse_args(args) + + rose( + args.input, + args.rankby, + args.out, + args.genome, + bams=args.bams, + control=args.control, + stitch=args.stitch, + tss=args.tss, + mask_file=args.mask, + ) diff --git a/rose2/utils.py b/rose2/utils.py deleted file mode 100755 index a40f705..0000000 --- a/rose2/utils.py +++ /dev/null @@ -1,1415 +0,0 @@ -#!/usr/bin/python -#SET OF GENERAL UTILITY FUNCTIONS FOR SEQ DATA -#last modified 141217 - -#please edit this to the location of the samtools program -samtoolsString ='samtools' - -''' -The MIT License (MIT) - -Copyright (c) 2013 Charles Lin - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -''' - - -#Locus, LocusCollection, and Gene classes were generously provided by Graham Ruby and Charles Lin -#Additional functions are found from online sources and are noted in the comments - - - -#================================================================== -#===========================DEPENDENCIES=========================== -#================================================================== - -import os -import gzip -import time -import re -import sys -import math - -# Very pretty error reporting, where available -try: - from IPython.core import ultratb - sys.excepthook = ultratb.FormattedTB(mode='Context', color_scheme='Linux') -except ImportError: - pass - -from string import join, maketrans - -import subprocess -import datetime - -from collections import defaultdict - -#================================================================== -#======================TABLE OF CONTENTS=========================== -#================================================================== - -#1. Input/Output and file handling functions - -#def open(file,mode='r'): <- replaces open with a version that can handle gzipped files -#def parseTable(fn, sep, header = False,excel = False): <- opens standard delimited files -#def unParseTable(table, output, sep): <- writes standard delimited files, opposite of parseTable -#def formatBed(bed,output=''): -#def bedToGFF(bed,output=''): -#def gffToBed(gff,output= ''): <- converts standard UCSC gff format files to UCSC bed format files -#def formatFolder(folderName,create=False): <- checks for the presence of any folder and makes it if create =True - -#2. Gene annotation functions -#def makeStartDict(annotFile,geneList = []): <- takes a standard UCSC refseq table and creates a dictionary keyed by refseq ID with info about each transcript -#def getTSSs(geneList,refseqTable,refseqDict): <- returns the TSS location of any gene -#def importRefseq(refseqFile, returnMultiples = False): <- imports a standard UCSC refseq annotation file into a dictionary -#def makeGenes(annotFile,geneList=[],asDict = False): <- takes a UCSC refseq annotation file and a gene list and makes a list or dictionary of Gene class objects -#def makeTranscriptCollection(annotFile,upSearch,downSearch,window = 500,geneList = []): <- takes a UCSC refseq annotation file and makes a LocusCollection where each locus is a full transcript -#def importBoundRegion(boundRegionFile,name): <- imports a bound region file (a standard bed or macs output bed) - -#3. Locus class -#class Locus(chr,start,end,sense,ID) <- standard locus class for tracking genomic loci -#class LocusCollection(lociList,windowSize=500) <- a collection of locus objects used for querying large sets of loci - -#4. Gene class -#class Gene(name,chr,sense,txCoords,cdCoords,exStarts,exEnds,commonName=''): <- gene class object that contains all annotation information about a given transcript - -#5. Locus functions -#def locusCollectionToGFF(locusCollection): <- turns a locus collection into a gff -#def gffToLocusCollection(gff,window =500): <- turns a gff into a locus collection (reverse of gff) -#def makeTSSLocus(gene,startDict,upstream,downstream): <- from a start dict makes a locus surrounding the tss -#def makeSearchLocus(locus,upSearch,downSearch): <- takes an existing locus and makes a larger flanking locus -#def makeSECollection(enhancerFile,name,top=0): - - -#6. Bam class -#class Bam(bamFile) <- a class for handling and manipulating bam objects. requires samtools - -#7. Misc. functions -#def uniquify(seq, idfun=None): <- makes a list unique -#def order(x, NoneIsLast = True, decreasing = False): <- returns the ascending or descending order of a list - - -#8 Sequence functions -#def fetchSeq(directory,chrom,start,end,UCSC=False,lineBreaks=True,header = True): <- grabs sequence from a region -#def gffToFasta(species,directory,gff,UCSC = True): <- converts a gff to a fasta -#def revComp(seq,rev = True, RNA=False): <- is awesome - - -#================================================================== -#==========================I/O FUNCTIONS=========================== -#================================================================== - -# TODO: Overriding internal functions is evil! -bopen=open -def open(fileName,mode='r'): - if fileName.split('.')[-1] == 'gz': - return gzip.open(fileName, mode + 'b') - else: - return bopen(fileName, mode) - -#parseTable 4/14/08 -#takes in a table where columns are separated by a given symbol and outputs -#a nested list such that list[row][col] -#example call: -#table = parseTable('file.txt','\t') -def parseTable(fn, sep, header = False,excel = False): - fh = open(fn) - if header == True: - header = fh.readline() #disposes of the header - - table = [] - for line in fh: - line = line.rstrip().split(sep) - table.append(line) - - fh.close() - - return table - - - -#unParseTable 4/14/08 -#takes in a table generated by parseTable and writes it to an output file -#takes as parameters (table, output, sep), where sep is how the file is delimited -#example call unParseTable(table, 'table.txt', '\t') for a tab del file - -def unParseTable(table, output, sep): - fh_out = open(output, 'w') - if len(sep) == 0: - for i in table: - fh_out.write(str(i) + '\n') - else: - for line in table: - fh_out.write(sep.join([str(x) for x in line]) + '\n') - - fh_out.close() - - - - -def formatBed(bed,output=''): - - ''' - formats a bed file from UCSC or MACS into a WUSTL gateway compatible bed - ''' - - - newBed = [] - - if type(bed) == str: - bed = parseTable(bed,'\t') - - indexTicker = 1 - for line in bed: - - newLine = line[0:4] - try: - strand = line[5] - except IndexError: - strand = '.' - newLine+= [indexTicker,strand] - indexTicker +=1 - newBed.append(newLine) - - if len(output) > 0: - unParseTable(newBed,output,'\t') - else: - return newBed - -def bedToGFF(bed, output=''): - - ''' - turns a bed into a gff file - ''' - if isinstance(bed, str): - bed = parseTable(bed, '\t') - - bed = formatBed(bed) - - gff = [] - - for line in bed: - - gffLine = [line[0],line[3],'',line[1],line[2],line[4],line[5],'',line[3]] - gff.append(gffLine) - - - if len(output) > 0: - unParseTable(gff,output,'\t') - else: - return gff - - - -#100912 -#gffToBed - -def gffToBed(gff,output= ''): - ''' - turns a gff to a bed file - ''' - bed = [] - for line in gff: - newLine = [line[0],line[3],line[4],line[1],0,line[6]] - bed.append(newLine) - if len(output) == 0: - return bed - else: - unParseTable(bed,output,'\t') - -def formatFolder(folderName, create=False): - - ''' - makes sure a folder exists and if not makes it - returns a bool for folder - ''' - - if folderName[-1] != '/': - folderName +='/' - - try: - os.listdir(folderName) - return folderName - except OSError: - print('folder %s does not exist' % (folderName)) - if create: - os.mkdir(folderName) - return folderName - return False - - -def checkOutput(fileName, waitTime = 1, timeOut = 30): - - ''' - checks for the presence of a file every N minutes - if it exists, returns True - default is 1 minute with a max timeOut of 30 minutes - ''' - waitTime = int(waitTime*60) - - timeOut = int(timeOut*60) - - maxTicker = timeOut/waitTime - ticker = 0 - - fileExists = False - while not fileExists: - try: - size1 = os.stat(fileName).st_size - time.sleep(.5) - size2 = os.stat(fileName).st_size - if size1 == size2: - fileExists = True - else: - time.sleep(waitTime) - ticker+=1 - except OSError: - time.sleep(waitTime) - ticker+=1 - if ticker == maxTicker: - break - - - time.sleep(.1) - if fileExists: - return True - else: - print('OPERATION TIMED OUT. FILE %s NOT FOUND' % (fileName)) - return False - -def getParentFolder(inputFile): - - ''' - returns the parent folder for any file - ''' - - parentFolder = join(inputFile.split('/')[:-1],'/') +'/' - if parentFolder =='': - return './' - else: - return parentFolder - - -#================================================================== -#===================ANNOTATION FUNCTIONS=========================== -#================================================================== - - -def makeStartDict(annotFile, geneList=[]): - ''' - makes a dictionary keyed by refseq ID that contains information about - chrom/start/stop/strand/common name - ''' - - if type(geneList) == str: - geneList = parseTable(geneList, '\t') - geneList = [line[0] for line in geneList] - - if annotFile.upper().count('REFSEQ') == 1: - refseqTable,refseqDict = importRefseq(annotFile) - if len(geneList) == 0: - geneList = refseqDict.keys() - startDict = {} - for gene in geneList: - if refseqDict.has_key(gene) == False: - continue - startDict[gene]={} - startDict[gene]['sense'] = refseqTable[refseqDict[gene][0]][3] - startDict[gene]['chr'] = refseqTable[refseqDict[gene][0]][2] - startDict[gene]['start'] = getTSSs([gene],refseqTable,refseqDict) - if startDict[gene]['sense'] == '+': - startDict[gene]['end'] =[int(refseqTable[refseqDict[gene][0]][5])] - else: - startDict[gene]['end'] = [int(refseqTable[refseqDict[gene][0]][4])] - startDict[gene]['name'] = refseqTable[refseqDict[gene][0]][12] - return startDict - - -#generic function to get the TSS of any gene -def getTSSs(geneList,refseqTable,refseqDict): - #refseqTable,refseqDict = importRefseq(refseqFile) - if len(geneList) == 0: - refseq = refseqTable - else: - refseq = refseqFromKey(geneList,refseqDict,refseqTable) - TSS = [] - for line in refseq: - if line[3] == '+': - TSS.append(line[4]) - if line[3] == '-': - TSS.append(line[5]) - TSS = map(int,TSS) - - return TSS - - - - -#10/13/08 -#importRefseq -#takes in a refseq table and makes a refseq table and a refseq dictionary for keying the table - -def importRefseq(refseqFile, returnMultiples = False): - - ''' - opens up a refseq file downloaded by UCSC - ''' - refseqTable = parseTable(refseqFile,'\t') - refseqDict = {} - ticker = 1 - for line in refseqTable[1:]: - if refseqDict.has_key(line[1]): - refseqDict[line[1]].append(ticker) - else: - refseqDict[line[1]] = [ticker] - ticker = ticker + 1 - - multiples = [] - for i in refseqDict: - if len(refseqDict[i]) > 1: - multiples.append(i) - - if returnMultiples == True: - return refseqTable,refseqDict,multiples - else: - return refseqTable,refseqDict - - -#12/29/08 -#refseqFromKey(refseqKeyList,refseqDict,refseqTable) -#function that grabs refseq lines from refseq IDs -def refseqFromKey(refseqKeyList,refseqDict,refseqTable): - typeRefseq = [] - for name in refseqKeyList: - if refseqDict.has_key(name): - typeRefseq.append(refseqTable[refseqDict[name][0]]) - return typeRefseq - - - - -#10/13/08 -#make genes -#turns a refseq ID into a Gene object from utility module -def makeGenes(annotFile,geneList=[],asDict = False): - ''' - takes in a refseq or ensembl annotation file and enters all identifiers in the geneList into a list as gene objects - ''' - if asDict: - genes = {} - else: - genes = [] - if type(geneList) == str: - print('importing gene list from %s' % (geneList)) - geneList = parseTable(geneList,'\t') - geneList = [line[0] for line in geneList] - - - if annotFile.upper().count('REFSEQ') == 1: - refTable,refDict = importRefseq(annotFile) - - if len(geneList) == 0: - geneList = refDict.keys() - - for refseqID in geneList: - if refseqID not in refDict: - #print('no such gene ' + str(refseqID)) - continue - - geneIndex = refDict[refseqID][0] - geneLine = refTable[int(geneIndex)] - exonStarts = map(int,geneLine[9].split(',')[:-1]) - exonEnds = map(int,geneLine[10].split(',')[:-1]) - - gene = Gene(refseqID,geneLine[2],geneLine[3],[int(geneLine[4]),int(geneLine[5])],[int(geneLine[6]),int(geneLine[7])],exonStarts,exonEnds,geneLine[12]) - - if asDict: - genes[refseqID] = gene - else: - genes.append(gene) - - return genes - - -#04/07/09 -#makes a LocusCollection w/ each transcript as a locus -#bob = makeTranscriptCollection('/Users/chazlin/genomes/mm8/mm8refseq.txt') -def makeTranscriptCollection(annotFile,upSearch,downSearch,window = 500,geneList = []): - ''' - makes a LocusCollection w/ each transcript as a locus - takes in either a refseqfile or an ensemblGFF - ''' - - if annotFile.upper().count('REFSEQ') == 1: - refseqTable,refseqDict = importRefseq(annotFile) - locusList = [] - ticker = 0 - if len(geneList) == 0: - geneList = refseqDict.keys() - - for line in refseqTable[1:]: - if line[1] in geneList: - if line[3] == '-': - locus = Locus(line[2],int(line[4])-downSearch,int(line[5])+upSearch,line[3],line[1]) - else: - locus = Locus(line[2],int(line[4])-upSearch,int(line[5])+downSearch,line[3],line[1]) - locusList.append(locus) - ticker = ticker + 1 - if ticker%1000 == 0: - print(ticker) - - transCollection = LocusCollection(locusList, window) - return transCollection - - -#140213 - -def nameToRefseq(geneNamesList,annotFile,unique=True): - - ''' - takes a list of names and gets you the refseqID - ''' - startDict= makeStartDict(annotFile) - nameDict = defaultdict(list) - for refID in startDict.keys(): - - name = startDict[refID]['name'] - nameDict[name].append(refID) - - newTable = [] - for name in geneNamesList: - refIDList = nameDict[name] - - #unique preserves the initial number of genes in geneNamesList - #by taking only 1 refID per geneName - #will take the first in the refList, which should usually be the lower - #refID and thus the more relevant, but no guarantees - if unique: - newTable.append([name,refIDList[0]]) - else: - for refID in refIDList: - newTable.append([name,refID]) - - return newTable - - -#06/11/09 -#import bound region -#imports a bound region file and turns it into a locus collection -#bound region files are output by my pipeline as Name_boundFile.txt files -def importBoundRegion(boundRegionFile,name): - ''' - imports bound regions in either bed format or in error model format - ''' - - bound = parseTable(boundRegionFile,'\t') - lociList = [] - ticker = 1 - # - if boundRegionFile.split('.')[-1] == 'bed': - bed = True - else: - bed = False - if bed: - for line in bound: - if ticker%1000 == 0: - print(ticker) - lociList.append(Locus(line[0],int(line[1]),int(line[2]),'.',ID = name + '_' + str(ticker))) - ticker = ticker + 1 - else: - for line in bound: - if ticker%1000 == 0: - print(ticker) - - lociList.append(Locus('chr'+line[0],int(line[1]),int(line[2]),'.',ID = name + '_' + str(ticker))) - ticker = ticker + 1 - return LocusCollection(lociList,500) - - - -#================================================================== -#========================LOCUS INSTANCE============================ -#================================================================== - -#Locus and LocusCollection instances courtesy of Graham Ruby - - -class Locus: - # this may save some space by reducing the number of chromosome strings - # that are associated with Locus instances (see __init__). - __chrDict = dict() - __senseDict = {'+':'+', '-':'-', '.':'.'} - # chr = chromosome name (string) - # sense = '+' or '-' (or '.' for an ambidexterous locus) - # start,end = ints of the start and end coords of the locus; - # end coord is the coord of the last nucleotide. - def __init__(self,chr,start,end,sense,ID='',score=0): - coords = sorted([int(start), int(end)]) - # this method for assigning chromosome should help avoid storage of - # redundant strings. - if not(self.__chrDict.has_key(chr)): self.__chrDict[chr] = chr - self._chr = self.__chrDict[chr] - self._sense = self.__senseDict[sense] - self._start = int(coords[0]) - self._end = int(coords[1]) - self._ID = ID - self._score = score - def ID(self): return self._ID - def chr(self): return self._chr - def start(self): return self._start ## returns the smallest coordinate - def end(self): return self._end ## returns the biggest coordinate - def len(self): return self._end - self._start + 1 - def score(self): return self._score - def getAntisenseLocus(self): - if self._sense=='.': return self - else: - switch = {'+':'-', '-':'+'} - return Locus(self._chr,self._start,self._end,switch[self._sense]) - def coords(self): return [self._start,self._end] ## returns a sorted list of the coordinates - def sense(self): return self._sense - # returns boolean; True if two loci share any coordinates in common - def overlaps(self,otherLocus): - if self.chr()!=otherLocus.chr(): return False - elif not(self._sense=='.' or \ - otherLocus.sense()=='.' or \ - self.sense()==otherLocus.sense()): return False - elif self.start() > otherLocus.end() or otherLocus.start() > self.end(): return False - else: return True - - # returns boolean; True if all the nucleotides of the given locus overlap - # with the self locus - def contains(self,otherLocus): - if self.chr()!=otherLocus.chr(): return False - elif not(self._sense=='.' or \ - otherLocus.sense()=='.' or \ - self.sense()==otherLocus.sense()): return False - elif self.start() > otherLocus.start() or otherLocus.end() > self.end(): return False - else: return True - - # same as overlaps, but considers the opposite strand - def overlapsAntisense(self,otherLocus): - return self.getAntisenseLocus().overlaps(otherLocus) - # same as contains, but considers the opposite strand - def containsAntisense(self,otherLocus): - return self.getAntisenseLocus().contains(otherLocus) - def __hash__(self): return self._start + self._end - def __eq__(self,other): - if self.__class__ != other.__class__: return False - if self.chr()!=other.chr(): return False - if self.start()!=other.start(): return False - if self.end()!=other.end(): return False - if self.sense()!=other.sense(): return False - return True - def __ne__(self,other): return not(self.__eq__(other)) - def __str__(self): return self.chr()+'('+self.sense()+'):'+'-'.join(map(str,self.coords())) - def plotStr(self): return self.chr() + ':' + self.sense() + ':' + '-'.join(map(str,self.coords())) - def checkRep(self): - pass - def gffLine(self): return [self.chr(),self.ID(),'',self.start(),self.end(),'',self.sense(),'',self.ID()] - def getConservation(self,phastConFolder): - ''' - uses tabix to get a per base conservation score from an indexed conservation bedgraph - ''' - tabixString = 'tabix' #set the path/location of tabix - - #figure out which file is the correct one - chrom = self.chr() - - phastFile = [phastConFolder + x for x in os.listdir(phastConFolder) if x.count('bg.gz') == 1 and x.count('tbi') == 0 and x.split('.')[0] == chrom][0] - - tabixCmd = '%s %s %s:%s-%s' % (tabixString,phastFile,self.chr(),self.start(),self.end()) - - phast = subprocess.Popen(tabixCmd,stdin = subprocess.PIPE,stderr = subprocess.PIPE,stdout = subprocess.PIPE,shell = True) - - phastLines = phast.stdout.readlines() - phast.stdout.close() - - phastTable = [x.rstrip().split('\t') for x in phastLines] - - #print phastTable - #set up a conservation sum - phastSum = 0.0 - #and number of bases w/ conservation data - phastBases = 0 - - for line in phastTable: - if int(line[1]) < self.start(): - lineLen = min(int(line[2]),self.end()) - self.start() +1 - phastSum += lineLen*float(line[3]) - phastBases += lineLen - elif int(line[2]) > self.end(): - lineLen = self.end() - max(int(line[1]),self.start()) - phastSum += lineLen*float(line[3]) - phastBases += lineLen - else: - lineLen = int(line[2]) - int(line[1]) - phastSum += lineLen*float(line[3]) - phastBases += lineLen - - if phastBases > self.len(): - print "this locus is sad %s. please debug me" % (self.__str__()) - print "locus length is %s" % (self.len()) - print "phastBases are %s" % (phastBases) - - - return phastSum/self.len() - - -class LocusCollection: - def __init__(self,loci,windowSize=50): - ### top-level keys are chr, then strand, no space - self.__chrToCoordToLoci = dict() - self.__loci = dict() - self.__winSize = windowSize - for lcs in loci: self.__addLocus(lcs) - - def __addLocus(self,lcs): - if not(self.__loci.has_key(lcs)): - self.__loci[lcs] = None - if lcs.sense()=='.': chrKeyList = [lcs.chr()+'+', lcs.chr()+'-'] - else: chrKeyList = [lcs.chr()+lcs.sense()] - for chrKey in chrKeyList: - if not(self.__chrToCoordToLoci.has_key(chrKey)): self.__chrToCoordToLoci[chrKey] = dict() - for n in self.__getKeyRange(lcs): - if not(self.__chrToCoordToLoci[chrKey].has_key(n)): self.__chrToCoordToLoci[chrKey][n] = [] - self.__chrToCoordToLoci[chrKey][n].append(lcs) - - def __getKeyRange(self,locus): - start = locus.start() / self.__winSize - end = locus.end() / self.__winSize + 1 ## add 1 because of the range - return range(start,end) - - def __len__(self): return len(self.__loci) - - def append(self,new): self.__addLocus(new) - def extend(self,newList): - for lcs in newList: self.__addLocus(lcs) - def hasLocus(self,locus): - return self.__loci.has_key(locus) - def remove(self,old): - if not(self.__loci.has_key(old)): raise ValueError("requested locus isn't in collection") - del self.__loci[old] - if old.sense()=='.': senseList = ['+','-'] - else: senseList = [old.sense()] - for k in self.__getKeyRange(old): - for sense in senseList: - self.__chrToCoordToLoci[old.chr()+sense][k].remove(old) - - def getWindowSize(self): return self.__winSize - def getLoci(self): return self.__loci.keys() - - def getSize(self): - size = 0 - for locus in self.__loci: - newsize = int(locus.end())-int(locus.start()) - size += newsize - return size - - def getChrList(self): - # i need to remove the strand info from the chromosome keys and make - # them non-redundant. - tempKeys = dict() - for k in self.__chrToCoordToLoci.keys(): tempKeys[k[:-1]] = None - return tempKeys.keys() - - def __subsetHelper(self,locus,sense): - sense = sense.lower() - if ['sense','antisense','both'].count(sense)!=1: - raise ValueError("sense command invalid: '"+sense+"'.") - matches = dict() - senses = ['+','-'] - if locus.sense()=='.' or sense=='both': lamb = lambda s: True - elif sense=='sense': lamb = lambda s: s==locus.sense() - elif sense=='antisense': lamb = lambda s: s!=locus.sense() - else: raise ValueError("sense value was inappropriate: '"+sense+"'.") - for s in filter(lamb, senses): - chrKey = locus.chr()+s - if self.__chrToCoordToLoci.has_key(chrKey): - for n in self.__getKeyRange(locus): - if self.__chrToCoordToLoci[chrKey].has_key(n): - for lcs in self.__chrToCoordToLoci[chrKey][n]: - matches[lcs] = None - return matches.keys() - - # sense can be 'sense' (default), 'antisense', or 'both' - # returns all members of the collection that overlap the locus - def getOverlap(self,locus,sense='sense'): - matches = self.__subsetHelper(locus,sense) - ### now, get rid of the ones that don't really overlap - realMatches = dict() - if sense=='sense' or sense=='both': - for i in filter(lambda lcs: lcs.overlaps(locus), matches): - realMatches[i] = None - if sense=='antisense' or sense=='both': - for i in filter(lambda lcs: lcs.overlapsAntisense(locus), matches): - realMatches[i] = None - return realMatches.keys() - - # sense can be 'sense' (default), 'antisense', or 'both' - # returns all members of the collection that are contained by the locus - def getContained(self,locus,sense='sense'): - matches = self.__subsetHelper(locus,sense) - ### now, get rid of the ones that don't really overlap - realMatches = dict() - if sense=='sense' or sense=='both': - for i in filter(lambda lcs: locus.contains(lcs), matches): - realMatches[i] = None - if sense=='antisense' or sense=='both': - for i in filter(lambda lcs: locus.containsAntisense(lcs), matches): - realMatches[i] = None - return realMatches.keys() - - # sense can be 'sense' (default), 'antisense', or 'both' - # returns all members of the collection that contain the locus - def getContainers(self,locus,sense='sense'): - matches = self.__subsetHelper(locus,sense) - ### now, get rid of the ones that don't really overlap - realMatches = dict() - if sense=='sense' or sense=='both': - for i in filter(lambda lcs: lcs.contains(locus), matches): - realMatches[i] = None - if sense=='antisense' or sense=='both': - for i in filter(lambda lcs: lcs.containsAntisense(locus), matches): - realMatches[i] = None - return realMatches.keys() - - def stitchCollection(self,stitchWindow=1,sense='both'): - - ''' - reduces the collection by stitching together overlapping loci - returns a new collection - ''' - - #initializing stitchWindow to 1 - #this helps collect directly adjacent loci - - - - locusList = self.getLoci() - oldCollection = LocusCollection(locusList,500) - - stitchedCollection = LocusCollection([],500) - - for locus in locusList: - #print(locus.coords()) - if oldCollection.hasLocus(locus): - oldCollection.remove(locus) - overlappingLoci = oldCollection.getOverlap(Locus(locus.chr(),locus.start()-stitchWindow,locus.end()+stitchWindow,locus.sense(),locus.ID()),sense) - - stitchTicker = 1 - while len(overlappingLoci) > 0: - stitchTicker+=len(overlappingLoci) - overlapCoords = locus.coords() - - - for overlappingLocus in overlappingLoci: - overlapCoords+=overlappingLocus.coords() - oldCollection.remove(overlappingLocus) - overlapCoords = [int(x) for x in overlapCoords] - if sense == 'both': - locus = Locus(locus.chr(),min(overlapCoords),max(overlapCoords),'.',locus.ID()) - else: - locus = Locus(locus.chr(),min(overlapCoords),max(overlapCoords),locus.sense(),locus.ID()) - overlappingLoci = oldCollection.getOverlap(Locus(locus.chr(),locus.start()-stitchWindow,locus.end()+stitchWindow,locus.sense()),sense) - locus._ID = '%s_%s_lociStitched' % (stitchTicker,locus.ID()) - - stitchedCollection.append(locus) - - else: - continue - return stitchedCollection - - - -#================================================================== -#========================GENE INSTANCE============================ -#================================================================== -# this is a gene object. unlike the previous gene_object, this actually represents -# a gene, as opposed to a whole set of genes, which was a poor design in the first place. -class Gene: - # name = name of the gene (string) - # txCoords = list of coords defining the boundaries of the transcipt - # cdCoords = list of coords defining the beginning and end of the coding region - # exStarts = list of coords marking the beginning of each exon - # exEnds = list of coords marking the end of each exon - # IF THIS IS A NON-CODING GENE, cdCoords => [0,0] - # def __init__(self,name,chr,sense,txCoords,cdCoords,exStarts,exEnds): - # self._name = name - # self._txLocus = Locus(chr,min(txCoords),max(txCoords),sense) - # self._cdLocus = Locus(chr,min(cdCoords),max(cdCoords),sense) - # - # exStarts = map(lambda i: i, exStarts) - # exEnds = map(lambda i: i, exEnds) - # exStarts.sort() - # exEnds.sort() - # - # self._txExons = [] - # self._cdExons = [] - # self._introns = [] - # - # for n in range(len(exStarts)): - # if n==0: - # self._txExons.append(Locus(chr,txCoords[0],exEnds[n]-1,sense)) - # self._cdExons.append(Locus(chr,cdCoords[0],exEnds[n]-1,sense)) - # elif n==len(exStarts)-1: - # self._txExons.append(Locus(chr,txCoords[0],txCoords[1],sense)) - # self._cdExons.append(Locus(chr,cdCoords[0],cdCoords[1],sense)) - # else: - # newExon = Locus(chr,exStarts[n],exEnds[n]-1,sense) - # self._txExons.append(newExon) - # self._cdExons.append(newExon) - # if n < len(exStarts)-1: self._introns.append(Locus(chr,exEnds[n],exStarts[n+1]-1,sense)) - # - # if sense=='+': - # self._fpUtr = Locus(chr,txCoords[0],cdCoords[0]-1,sense) - # self._tpUtr = Locus(chr,cdCoords[1]+1,txCoords[1],sense) - # elif sense=='-': - # self._fpUtr = Locus(chr,cdCoords[1]+1,txCoords[1],sense) - # self._tpUtr = Locus(chr,txCoords[0],cdCoords[0]-1,sense) - def __init__(self,name,chr,sense,txCoords,cdCoords,exStarts,exEnds,commonName=''): - self._name = name - self._commonName = commonName - self._txLocus = Locus(chr,min(txCoords),max(txCoords),sense,self._name) - if cdCoords == None: - self._cdLocus = None - else: - self._cdLocus = Locus(chr,min(cdCoords),max(cdCoords),sense) - - exStarts = sorted(map(lambda i: i, exStarts)) - exEnds = sorted(map(lambda i: i, exEnds)) - - self._txExons = [] - self._cdExons = [] - self._introns = [] - - cd_exon_count = 0 - - for n in range(len(exStarts)): - first_locus = Locus(chr,exStarts[n],exStarts[n],sense) - second_locus = Locus(chr,exEnds[n],exEnds[n],sense) - - # Add the transcription unit exon - tx_exon = Locus(chr,exStarts[n],exEnds[n],sense) - - self._txExons.append(tx_exon) - - # Add Coding Exons - # Need to make sure that the current exon is actually in the coding region of the gene first - if self.isCoding() and tx_exon.overlaps(self._cdLocus): - if not first_locus.overlaps(self._cdLocus): - first_coord = min(cdCoords) - else: - first_coord = exStarts[n] - - if not second_locus.overlaps(self._cdLocus): - second_coord = max(cdCoords) - else: - second_coord = exEnds[n] - - new_cd_exon = Locus(chr,first_coord,second_coord,sense) - self._cdExons.append(new_cd_exon) - - # Add Introns - if n < len(exStarts)-1: - self._introns.append(Locus(chr,exEnds[n]+1,exStarts[n +1]-1,sense)) - - if self.isCoding(): - if sense=='+': - self._fpUTR = Locus(chr,min(txCoords),min(cdCoords)-1,sense) - self._tpUTR = Locus(chr,max(cdCoords)+1,max(txCoords),sense) - elif sense=='-': - self._fpUTR = Locus(chr,max(cdCoords)+1,max(txCoords),sense) - self._tpUTR = Locus(chr,min(txCoords),min(cdCoords)-1,sense) - else: - self._fpUTR = None - self._tpUTR = None - - def commonName(self): return self._commonName - def name(self): return self._name - def chr(self): return self._txLocus.chr() - def sense(self): return self._txLocus.sense() - def txLocus(self): return self._txLocus ## locus of full transcript - def cdLocus(self): return self._cdLocus ## locus from start codon to end codon - def txExons(self): return map(lambda i: i, self._txExons) ## list of loci - def cdExons(self): return map(lambda i: i, self._cdExons) ## list of loci - def introns(self): return map(lambda i: i, self._introns) ## list of loci - def fpUtr(self): return self._fpUTR ## locus - def tpUtr(self): return self._tpUTR ## locus - def isCoding(self): return not(self._cdLocus.start()==0 and self._cdLocus.end()==0) # boolean; is this gene protein-coding? - def tss(self,upstream = 0,downstream = 0): - if self._txLocus.sense() == '-': - return Locus(self._txLocus.chr(),self._txLocus.end()-downstream,self._txLocus.end()+upstream,self._txLocus.sense(),self._name) - else: - return Locus(self._txLocus.chr(),self._txLocus.start()-upstream,self._txLocus.start()+downstream,self._txLocus.sense(),self._name) - def __hash__(self): return self._txLocus.__hash__() - - - -#================================================================== -#========================LOCUS FUNCTIONS=========================== -#================================================================== -#06/11/09 -#turns a locusCollection into a gff -#does not write to disk though - - -def locusCollectionToBed(locusCollection): - - lociList = locusCollection.getLoci() - bed = [] - for locus in lociList: - coords = locus.coords() - coords.sort() - newLine = [locus.chr(),coords[0],coords[1],locus.sense(),locus.ID()] - bed.append(newLine) - return bed - - - -def locusCollectionToGFF(locusCollection): - lociList = locusCollection.getLoci() - gff = [] - for locus in lociList: - newLine = [locus.chr(),locus.ID(),'',locus.coords()[0],locus.coords()[1],'',locus.sense(),'',locus.ID()] - gff.append(newLine) - return gff - - - -def bedToLocusCollection(bedfile): - - table = parseTable(bedfile, '\t') - loci = [Locus(x[0], x[1], x[2], '.', ID=x[3]) for x in table] - collection = LocusCollection(loci, 50) - - return collection - -def gffToLocusCollection(gff,window =500): - - ''' - opens up a gff file and turns it into a LocusCollection instance - ''' - - lociList = [] - if type(gff) == str: - gff = parseTable(gff,'\t') - - for line in gff: - #USE line[1] as the locus ID. If that is empty use line[8] - if len(line[1]) > 0: - name = line[1] - elif len(line[8]) >0: - name = line[8] - else: - name = '%s:%s:%s-%s' % (line[0],line[6],line[3],line[4]) - - lociList.append(Locus(line[0],line[3],line[4],line[6],name)) - return LocusCollection(lociList,window) - - - -def makeTSSLocus(gene,startDict,upstream,downstream): - ''' - given a startDict, make a locus for any gene's TSS w/ upstream and downstream windows - ''' - - start = startDict[gene]['start'][0] - if startDict[gene]['sense'] == '-': - return Locus(startDict[gene]['chr'],start-downstream,start+upstream,'-',gene) - else: - return Locus(startDict[gene]['chr'],start-upstream,start+downstream,'+',gene) - - -#06/11/09 -#takes a locus and expands it by a fixed upstream/downstream amount. spits out the new larger locus -def makeSearchLocus(locus,upSearch,downSearch): - if locus.sense() == '-': - searchLocus = Locus(locus.chr(),locus.start()-downSearch,locus.end()+upSearch,locus.sense(),locus.ID()) - else: - searchLocus = Locus(locus.chr(),locus.start()-upSearch,locus.end()+downSearch,locus.sense(),locus.ID()) - return searchLocus - - -def makeSECollection(enhancerFile,name,top=0): - ''' - returns a locus collection from a super table - top gives the number of rows - ''' - enhancerTable = parseTable(enhancerFile,'\t') - superLoci = [] - - ticker = 0 - for line in enhancerTable: - if line[0][0] == '#' or line[0][0] == 'R': - continue - else: - ticker+=1 - - superLoci.append(Locus(line[1],line[2],line[3],'.',name+'_'+line[0])) - - if ticker == top: - break - return LocusCollection(superLoci,50) - - -#================================================================== -#==========================BAM CLASS=============================== -#================================================================== - -#11/11/10 -#makes a new class Bam for dealing with bam files and integrating them into the SolexaRun class - -def convertBitwiseFlag(flag): - if int(flag) & 16: - return "-"; - else: - return "+"; - -class Bam: - '''A class for a sorted and indexed bam file that allows easy analysis of reads''' - def __init__(self,bamFile): - self._bam = bamFile - - def getTotalReads(self,readType = 'mapped'): - command = '%s flagstat %s' % (samtoolsString,self._bam) - stats = subprocess.Popen(command,stdin = subprocess.PIPE,stderr = subprocess.PIPE,stdout = subprocess.PIPE,shell = True) - statLines = stats.stdout.readlines() - stats.stdout.close() - if readType == 'mapped': - for line in statLines: - if line.count('mapped (') == 1: - - return int(line.split(' ')[0]) - if readType == 'total': - return int(statLines[0].split(' ')[0]) - - def convertBitwiseFlag(self,flag): - if flag & 16: - return "-"; - else: - return "+"; - - def getRawReads(self,locus,sense,unique = False,includeJxnReads = False,printCommand = False): - ''' - gets raw reads from the bam using samtools view. - can enforce uniqueness and strandedness - ''' - locusLine = locus.chr()+':'+str(locus.start())+'-'+str(locus.end()) - - command = '%s view %s %s' % (samtoolsString,self._bam,locusLine) - if printCommand: - print(command) - getReads = subprocess.Popen(command,stdin = subprocess.PIPE,stderr = subprocess.PIPE,stdout = subprocess.PIPE,shell = True) - reads = getReads.communicate() - reads = reads[0].split('\n')[:-1] - reads = [read.split('\t') for read in reads] - if includeJxnReads == False: - reads = filter(lambda x: x[5].count('N') < 1,reads) - - #convertDict = {'16':'-','0':'+','64':'+','65':'+','80':'-','81':'-','129':'+','145':'-'} - #convertDict = {'16':'-','0':'+','64':'+','65':'+','80':'-','81':'-','129':'+','145':'-','256':'+','272':'-','99':'+','147':'-'} - - - #BJA added 256 and 272, which correspond to 0 and 16 for multi-mapped reads respectively: - #http://onetipperday.blogspot.com/2012/04/understand-flag-code-of-sam-format.html - #convert = string.maketrans('160','--+') - keptReads = [] - seqDict = defaultdict(int) - if sense == '-': - strand = ['+','-'] - strand.remove(locus.sense()) - strand = strand[0] - else: - strand = locus.sense() - for read in reads: - #readStrand = read[1].translate(convert)[0] - #print read[1], read[0] - #readStrand = convertDict[read[1]] - readStrand = convertBitwiseFlag(read[1]) - - if sense == 'both' or sense == '.' or readStrand == strand: - - if unique and seqDict[read[9]] == 0: - keptReads.append(read) - elif not unique: - keptReads.append(read) - seqDict[read[9]]+=1 - - return keptReads - - def readsToLoci(self,reads,IDtag = 'sequence,seqID,none'): - ''' - takes raw read lines from the bam and converts them into loci - ''' - loci = [] - ID = '' - if IDtag == 'sequence,seqID,none': - print('please specify one of the three options: sequence, seqID, none') - return - #convert = string.maketrans('160','--+') - #convertDict = {'16':'-','0':'+','64':'+','65':'+','80':'-','81':'-','129':'+','145':'-'} - #convertDict = {'16':'-','0':'+','64':'+','65':'+','80':'-','81':'-','129':'+','145':'-','256':'+','272':'-'} - - #BJA added 256 and 272, which correspond to 0 and 16 for multi-mapped reads respectively: - #http://onetipperday.blogspot.com/2012/04/understand-flag-code-of-sam-format.html - #convert = string.maketrans('160','--+') - numPattern = re.compile('\d*') - for read in reads: - chrom = read[2] - #strand = read[1].translate(convert)[0] - #strand = convertDict[read[1]] - strand = convertBitwiseFlag(read[1]) - if IDtag == 'sequence': - ID = read[9] - elif IDtag == 'seqID': - ID = read[0] - else: - ID = '' - - length = len(read[9]) - start = int(read[3]) - if read[5].count('N') == 1: - #this awful oneliner first finds all of the numbers in the read string - #then it filters out the '' and converts them to integers - #only works for reads that span one junction - - [first,gap,second] = [int(x) for x in filter(lambda x: len(x) > 0, re.findall(numPattern,read[5]))][0:3] - if IDtag == 'sequence': - loci.append(Locus(chrom,start,start+first,strand,ID[0:first])) - loci.append(Locus(chrom,start+first+gap,start+first+gap+second,strand,ID[first:])) - else: - loci.append(Locus(chrom,start,start+first,strand,ID)) - loci.append(Locus(chrom,start+first+gap,start+first+gap+second,strand,ID)) - elif read[5].count('N') > 1: - continue - else: - loci.append(Locus(chrom,start,start+length,strand,ID)) - return loci - - def getReadsLocus(self,locus,sense = 'both',unique = True,IDtag = 'sequence,seqID,none',includeJxnReads = False): - ''' - gets all of the reads for a given locus - ''' - reads = self.getRawReads(locus,sense,unique,includeJxnReads) - - loci = self.readsToLoci(reads,IDtag) - - return loci - - def getReadSequences(self,locus,sense = 'both',unique = True,includeJxnReads = False): - - reads = self.getRawReads(locus,sense,unique,includeJxnReads) - - return [read[9] for read in reads] - - def getReadStarts(self,locus,sense = 'both',unique = False,includeJxnReads = False): - reads = self.getRawReads(locus,sense,unique,includeJxnReads) - - return [int(read[3]) for read in reads] - - - def getReadCount(self,locus,sense = 'both',unique = True,includeJxnReads = False): - reads = self.getRawReads(locus,sense,unique,includeJxnReads) - - return len(reads) - - def liquidateLocus(self,locus,sense='.'): - - bamliquidatorCmd = 'bamliquidator %s %s %s %s %s 1 200' % (self._bam, locus.chr(), - str(locus.start()), str(locus.end()), - sense) - - bamliquidatorOut = subprocess.Popen(bamliquidatorCmd, stdout = subprocess.PIPE, shell=True) - score = bamliquidatorOut.communicate()[0] - - return score - -#================================================================== -#========================MISC FUNCTIONS============================ -#================================================================== - - - - -#uniquify function -#by Peter Bengtsson -#Used under a creative commons license -#sourced from here: http://www.peterbe.com/plog/uniqifiers-benchmark - -def uniquify(seq, idfun=None): - # order preserving - if idfun is None: - def idfun(x): return x - seen = {} - result = [] - for item in seq: - marker = idfun(item) - # in old Python versions: - # if seen.has_key(marker) - # but in new ones: - if marker in seen: continue - seen[marker] = 1 - result.append(item) - return result - - -#082009 -#taken from http://code.activestate.com/recipes/491268/ - -def order(x, NoneIsLast = True, decreasing = False): - """ - Returns the ordering of the elements of x. The list - [ x[j] for j in order(x) ] is a sorted version of x. - - Missing values in x are indicated by None. If NoneIsLast is true, - then missing values are ordered to be at the end. - Otherwise, they are ordered at the beginning. - """ - omitNone = False - if NoneIsLast == None: - NoneIsLast = True - omitNone = True - - n = len(x) - ix = range(n) - if None not in x: - ix.sort(reverse = decreasing, key = lambda j : x[j]) - else: - # Handle None values properly. - def key(i, x = x): - elem = x[i] - # Valid values are True or False only. - if decreasing == NoneIsLast: - return not(elem is None), elem - else: - return elem is None, elem - ix = range(n) - ix.sort(key=key, reverse=decreasing) - - if omitNone: - n = len(x) - for i in range(n-1, -1, -1): - if x[ix[i]] == None: - n -= 1 - return ix[:n] - return ix - - - -#================================================================== -#=======================SEQUENCE FUNCTIONS========================= -#================================================================== - - - -#10/22/08 -#fetchSeq -#function that fetches a sequence from a genome directory -#directory that contains individual chrom fasta files - -def fetchSeq(directory,chrom,start,end,UCSC=False,lineBreaks=True,header = True): - fn = directory + chrom + '.fa' - fh = open(fn,'r') - headerOffset = 0 - nStart = 0 - nEnd = 0 - if header: - fh.seek(0) - headerOffset = len(fh.readline()) - if lineBreaks: - - nStart = (start-1)/50 - nEnd = (end-1)/50 - if UCSC: - fh.seek((start+nStart+headerOffset)) - else: - fh.seek((start-1+nStart+headerOffset)) - span = ((end+nEnd-1)-(start+nStart-1)) - # if UCSC: - # span+=1 - - read = fh.read(span) - if lineBreaks: - read = read.replace('\n','') - #print(headerOffset,nStart,nEnd) - return read - fh.close() - - - -#10/22/08 -#gffToFasta -#function that writes a fasta file from a gff file -#directory is the genome directory with the chromosome folders -def gffToFasta(genome,directory,gff,UCSC = True,useID=False): - fastaList = [] - - ticker = 0 - if type(gff) == str: - gff = parseTable(gff,'\t') - for line in gff: - try: - sequence = fetchSeq(directory,line[0],int(line[3]),int(line[4]),UCSC) - except: - continue - if ticker%1000 == 0: print(ticker) - - if useID: - name = '>' + line[1] - else: - name = '>'+ join([genome.lower(),line[0],str(line[3]),str(line[4]),line[6]],'|') - fastaList.append(name) - if line[6] == '-': - #print(line[3]) - #print(line[4]) - fastaList.append(revComp(sequence)) - else: - fastaList.append(sequence) - ticker += 1 - return fastaList - - - -#10/23/08 -#pair(nuc) -#returns the basepair of a nucleotide - -def pair(nuc): - pairDict = {'A':'T','C':'G','G':'C','T':'A','U':'A','a':'t','c':'g','g':'c','t':'a','u':'a'} - if pairDict.has_key(nuc): - return pairDict[nuc] - else: - return nuc - -#10/23/08 -#revComp(seq) -#takes in a sequence and spits out the reverse Complement - -def revComp(seq,rev = True, RNA=False): - if rev: - revComp = join(map(pair,seq[::-1]),'') - else: - revComp = join(map(pair,seq),'') - if RNA: - revComp = revComp.replace('T','U') # Not sure this is defined (!) - revComp = revComp.replace('t','u') - else: - revComp = revComp.replace('U','T') - revComp = revComp.replace('u','t') - return revComp diff --git a/setup.cfg b/setup.cfg index b88034e..3df5d37 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,2 +1,4 @@ [metadata] description-file = README.md +[flake8] +max-line-length = 99 diff --git a/setup.py b/setup.py index b9c5ac4..7c6e4ae 100644 --- a/setup.py +++ b/setup.py @@ -1,36 +1,34 @@ -import os - -from setuptools import setup, find_packages - - +from setuptools import find_packages, setup setup( - name='rose2', - version='1.0.0', - description='ROSE2 python package', - long_description='PROGRAM TO STITCH TOGETHER REGIONS TO FORM ENHANCERS, MAP READ DENSITY TO STITCHED REGIONS,AND RANK ENHANCERS BY READ DENSITY TO DISCOVER SUPER-ENHANCERS', - url='https://github.com/linlabbcm/rose2', - download_url = 'https://github.com/linlabbcm/rose2/tarball/v1.0.0', - - - classifiers=[], - - keywords=['bioinformatics','super-enhancers'], - - packages=find_packages(), - package_data={ - 'rose2': ['annotation/*'], - }, - - install_requires=['numpy>=1.8.2'], - extras_require={}, - - scripts=['scripts/ROSE2_callSuper.R', 'scripts/ROSE2_stitchOpt.R'], - - entry_points={ - 'console_scripts': [ - 'rose2=rose2:main', - ] - }, - -) \ No newline at end of file + name='rose2', + version='1.1.0', + description='ROSE2 python package', + long_description=( + 'PROGRAM TO STITCH TOGETHER REGIONS TO FORM ENHANCERS, MAP READ DENSITY TO STITCHED ' + 'REGIONS,AND RANK ENHANCERS BY READ DENSITY TO DISCOVER SUPER-ENHANCERS' + ), + url='https://github.com/linlabbcm/rose2', + download_url='https://github.com/linlabbcm/rose2/tarball/v1.1.0', + + classifiers=[], + + keywords=['bioinformatics', 'super-enhancers'], + + packages=find_packages(), + package_data={ + 'rose2': ['annotation/*'], + }, + + install_requires=['numpy>=1.8.2'], + extras_require={}, + + scripts=['scripts/ROSE2_callSuper.R', 'scripts/ROSE2_stitchOpt.R'], + + entry_points={ + 'console_scripts': [ + 'rose2=rose2.rose2_main:main', + ] + }, + include_package_data=True, +)