Merge branch 'release/2.5.1'

deeptools · May 12, 2017 · b5fc407 · b5fc407
2 parents 28cc0ac + 601d5de
commit b5fc407
Show file tree

Hide file tree

Showing 32 changed files with 355 additions and 336 deletions.
diff --git a/CHANGES.txt b/CHANGES.txt
@@ -1,3 +1,16 @@
+2.5.1
+
+ * Added universal newline support to deeptoolsintervals (issue #506).
+ * Fixed a few issues with correctGCBias under python 3.5 (thanks to @drakeeee)
+ * Setting `--minThreshold 0.0` or `--maxThreshold 0.0` now works properly. Previously, setting either of these to 0 was ignored. (issue #516)
+ * You can now specify the plot width and height in `plotPCA` and `plotCorrelation` (heatmap only) with the `--plotWidth` and `--plotHeight` parameters. (issue #507)
+ * plotCoverage no longer clips the top off of plots. Further, you can now set the plot width and height with `--plotWidth` and `--plotHeight`. (issue #508)
+ * In bamCoverage, specifying `--filterRNAstrand` no longer results in `--extendReads` being ignored. (issue #520)
+ * `plotFingerprint` and `plotEnrichment` no longer require producing a plot, which is useful if you only need QC metrics and are using a LOT of samples (such that matplotlib would crash anyway). This hasn't been implemented in Galaxy yet, but it can be if people would like it. (issues #519 and #526)
+ * `computeMatrix` now accepts a `--samplesLabel` option, which is useful in those cases when you aren't immediately running `plotHeatmap` and don't have terribly descriptive file names (issue #523)
+ * If you use `plotFingerprint` with the `--JSDsample` option and forget to list that file under `--bamfiles`, it will be added automatically and the file name will be added to the labels if needed (issue #527)
+ * Various Galaxy wrapper fixes
+
 2.5.0
 
  * Fix a bug where using regions with the same name in multiple BED files in computeMatrix caused downstream problems in plotHeatmap/plotProfile (issue #477).

diff --git a/deeptools/_version.py b/deeptools/_version.py
@@ -2,4 +2,4 @@
 # This file is originally generated from Git information by running 'setup.py
 # version'. Distribution tarballs contain a pre-generated copy of this file.
 
-__version__ = '2.5.0.1'
+__version__ = '2.5.1'
diff --git a/deeptools/bamCoverage.py b/deeptools/bamCoverage.py
@@ -148,6 +148,10 @@ def main(args=None):
 
     func_args = {'scaleFactor': scale_factor}
 
+    # This fixes issue #520, where --extendReads wasn't honored if --filterRNAstrand was used
+    if args.filterRNAstrand and not args.Offset:
+        args.Offset = [1, -1]
+
     if args.MNase:
         # check that library is paired end
         # using getFragmentAndReadSize
@@ -210,26 +214,6 @@ def main(args=None):
                             verbose=args.verbose)
         wr.filter_strand = args.filterRNAstrand
         wr.Offset = args.Offset
-
-    elif args.filterRNAstrand:
-        wr = filterRnaStrand([args.bam],
-                             binLength=args.binSize,
-                             stepSize=args.binSize,
-                             region=args.region,
-                             numberOfProcessors=args.numberOfProcessors,
-                             extendReads=args.extendReads,
-                             minMappingQuality=args.minMappingQuality,
-                             ignoreDuplicates=args.ignoreDuplicates,
-                             center_read=args.centerReads,
-                             zerosToNans=args.skipNonCoveredRegions,
-                             samFlag_include=args.samFlagInclude,
-                             samFlag_exclude=args.samFlagExclude,
-                             minFragmentLength=args.minFragmentLength,
-                             maxFragmentLength=args.maxFragmentLength,
-                             verbose=args.verbose,
-                             )
-
-        wr.filter_strand = args.filterRNAstrand
     else:
         wr = writeBedGraph.WriteBedGraph([args.bam],
                                          binLength=args.binSize,
@@ -399,58 +383,3 @@ def get_fragment_from_read(self, read):
                 fragment_end = fragment_start + 3
 
         return [(fragment_start, fragment_end)]
-
-
-class filterRnaStrand(writeBedGraph.WriteBedGraph):
-    """
-    Class to redefine the get_fragment_from_read for the --filterRNAstrand case
-
-    Only reads either forward or reverse are kept as follows:
-
-    For paired-end
-    --------------
-    reads forward:
-
-     1. alignments of the second in pair (128) if they map to the forward strand (~16)
-     2. alignments of the first in pair (64) if they map to the reverse  strand (~32)
-
-     1. include 128, exclude 16
-     or
-     2. include 64 exclude 32
-
-    reads reverse:
-    1. alignments of the second in pair (128) if it maps to the reverse strand (16) 128 & 16 = 144
-    2. alignments of the first in pair (64) if their mates map to the reverse strand (32) 64 & 32 = 96
-
-     1. include 144
-     or
-     2. include 96
-
-    For single-end
-    --------------
-    forward: include 16 (map forward strand)
-    reverse: exclude 16
-
-    """
-
-    def get_fragment_from_read(self, read):
-        """
-        Gets only reads for the given strand
-        """
-        fragment_start = fragment_end = None
-
-        # only paired forward reads are considered
-        if read.is_paired:
-            if self.filter_strand == 'forward':
-                if (read.flag & 128 == 128 and read.flag & 16 == 0) or (read.flag & 64 == 64 and read.flag & 32 == 0):
-                    return read.get_blocks()
-            else:
-                if read.flag & 144 == 144 or read.flag & 96 == 96:
-                    return read.get_blocks()
-        else:
-            if self.filter_strand == 'forward' and read.flag & 16 == 16:
-                return read.get_blocks()
-            elif self.filter_strand == 'reverse' and read.flag & 16 == 0:
-                return read.get_blocks()
-
-        return [(fragment_start, fragment_end)]
diff --git a/deeptools/bamPEFragmentSize.py b/deeptools/bamPEFragmentSize.py
@@ -83,14 +83,18 @@ def parse_arguments():
     return parser
 
 
-def getFragSize(bam, args):
+def getFragSize(bam, args, idx):
         fragment_len_dict, read_len_dict = get_read_and_fragment_length(bam, return_lengths=True,
                                                                         blackListFileName=args.blackListFileName,
                                                                         numberOfProcessors=args.numberOfProcessors,
                                                                         verbose=args.verbose,
                                                                         binSize=args.binSize,
                                                                         distanceBetweenBins=args.distanceBetweenBins)
-        print("\n\nBAM file : {}".format(bam))
+        if args.samplesLabel and idx < len(args.samplesLabel):
+            print("\n\nSample label: {}".format(args.samplesLabel[idx]))
+        else:
+            print("\n\nBAM file : {}".format(bam))
+
         if fragment_len_dict:
             if fragment_len_dict['mean'] == 0:
                 print("No pairs were found. Is the data from a paired-end sequencing experiment?")
@@ -125,8 +129,8 @@ def main(args=None):
     args = parse_arguments().parse_args(args)
 
     fraglengths = {}
-    for bam in args.bamfiles:
-        fraglengths[bam] = getFragSize(bam, args)
+    for idx, bam in enumerate(args.bamfiles):
+        fraglengths[bam] = getFragSize(bam, args, idx)
 
     if args.histogram:
         import matplotlib

diff --git a/deeptools/computeMatrix.py b/deeptools/computeMatrix.py
@@ -316,6 +316,14 @@ def computeMatrixOptArgs(case=['scale-regions', 'reference-point'][0]):
                           metavar="BED file",
                           required=False)
 
+    optional.add_argument('--samplesLabel',
+                          help='Labels for the samples. This will then be passed to plotHeatmap and plotProfile. The '
+                          'default is to use the file name of the '
+                          'sample. The sample labels should be separated '
+                          'by spaces and quoted if a label itself'
+                          'contains a space E.g. --samplesLabel label-1 "label 2"  ',
+                          nargs='+')
+
     # in contrast to other tools,
     # computeMatrix by default outputs
     # messages and the --quiet flag suppresses them

diff --git a/deeptools/correctGCBias.py b/deeptools/correctGCBias.py
@@ -237,12 +237,16 @@ def writeCorrected_worker(chrNameBam, chrNameBit, start, end, step):
 
         cvg_corr[vectorStart:vectorEnd] += float(1) / R_gc[gc]
         i += 1
-    if debug:
-        endTime = time.time()
-        print("{}, processing {} ({:.1f} per sec) ")
-        "reads @ {}:{}-{}".format(multiprocessing.current_process().name,
-                                  i, i / (endTime - startTime),
-                                  chrNameBit, start, end)
+
+    try:
+        if debug:
+            endTime = time.time()
+            print("{}, processing {} ({:.1f} per sec) ")
+            "reads @ {}:{}-{}".format(multiprocessing.current_process().name,
+                                      i, i / (endTime - startTime),
+                                      chrNameBit, start, end)
+    except NameError:
+        pass
 
     if i == 0:
         return None
@@ -661,7 +665,7 @@ def main(args=None):
             res = list(map(writeCorrected_wrapper, mp_args))
 
         # concatenate intermediary bedgraph files
-        _temp_bg_file = open(_temp_bg_file_name, 'w')
+        _temp_bg_file = open(_temp_bg_file_name, 'wb')
         for tempFileName in res:
             if tempFileName:
                 # concatenate all intermediate tempfiles into one

diff --git a/deeptools/correlation.py b/deeptools/correlation.py
@@ -227,7 +227,7 @@ def compute_correlation(self):
 
     def plot_correlation(self, plot_fiilename, plot_title='', vmax=None,
                          vmin=None, colormap='jet', image_format=None,
-                         plot_numbers=False):
+                         plot_numbers=False, plotWidth=11, plotHeight=9.5):
         """
         plots a correlation using a symmetric heatmap
         """
@@ -248,7 +248,7 @@ def plot_correlation(self, plot_fiilename, plot_title='', vmax=None,
             vmin = 0 if corr_matrix .min() >= 0 else -1
 
         # Compute and plot dendrogram.
-        fig = plt.figure(figsize=(11, 9.5))
+        fig = plt.figure(figsize=(plotWidth, plotHeight))
         plt.suptitle(plot_title)
 
         axdendro = fig.add_axes([0.02, 0.12, 0.1, 0.66])
@@ -431,12 +431,12 @@ def plot_scatter(self, plot_fiilename, plot_title='', image_format=None, log1p=F
         plt.savefig(plot_fiilename, format=image_format)
         plt.close()
 
-    def plot_pca(self, plot_filename, plot_title='', image_format=None, log1p=False):
+    def plot_pca(self, plot_filename, plot_title='', image_format=None, log1p=False, plotWidth=5, plotHeight=10):
         """
         Plot the PCA of a matrix
         """
 
-        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(5, 10))
+        fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(plotWidth, plotHeight))
         # PCA
         if self.rowCenter:
             _ = self.matrix.mean(axis=1)

diff --git a/deeptools/getFragmentAndReadSize.py b/deeptools/getFragmentAndReadSize.py
@@ -84,6 +84,19 @@ def get_read_and_fragment_length(bamFile, return_lengths=False, blackListFileNam
 
     distanceBetweenBins *= 2
     fl = []
+
+    # Fix issue #522, allow distanceBetweenBins == 0
+    if distanceBetweenBins == 0:
+        imap_res = mapReduce.mapReduce((bam_handle.filename, distanceBetweenBins),
+                                       getFragmentLength_wrapper,
+                                       chrom_sizes,
+                                       genomeChunkLength=binSize,
+                                       blackListFileName=blackListFileName,
+                                       numberOfProcessors=numberOfProcessors,
+                                       verbose=verbose)
+        fl = np.concatenate(imap_res)
+
+    # Try to ensure we have at least 1000 regions from which to compute statistics, halving the inter-bin distance as needed
     while len(fl) < 1000 and distanceBetweenBins > 1:
         distanceBetweenBins /= 2
         stepsize = binSize + distanceBetweenBins

diff --git a/deeptools/getScaleFactor.py b/deeptools/getScaleFactor.py
@@ -168,7 +168,7 @@ def get_scale_factor(args):
     bam_mapped, bam_mapped_total = get_num_kept_reads(args)
     if args.normalizeTo1x:
         # Print output, since normalization stuff isn't printed to stderr otherwise
-        sys.stderr.write("normalization: 1x\n")
+        sys.stderr.write("normalization: 1x (effective genome size {})\n".format(args.normalizeTo1x))
 
         # try to guess fragment length if the bam file contains paired end reads
         from deeptools.getFragmentAndReadSize import get_read_and_fragment_length

diff --git a/deeptools/heatmapper.py b/deeptools/heatmapper.py
@@ -271,9 +271,7 @@ def computeMatrix(self, score_file_list, regions_file, parameters, blackListFile
             "matrix length does not match regions length"
 
         if len(regions) == 0:
-            sys.stderr.write(
-                "\nERROR: BED file does not contain any valid regions. "
-                "Please check\n")
+            sys.stderr.write("\nERROR: Either the BED file does not contain any valid regions or there are none remaining after filtering.\n")
             exit(1)
         if regions_no_score == len(regions):
             exit("\nERROR: None of the BED regions could be found in the bigWig"
@@ -298,7 +296,10 @@ def computeMatrix(self, score_file_list, regions_file, parameters, blackListFile
         numcols = matrix.shape[1]
         num_ind_cols = self.get_num_individual_matrix_cols()
         sample_boundaries = list(range(0, numcols + num_ind_cols, num_ind_cols))
-        sample_labels = [splitext(basename(x))[0] for x in score_file_list]
+        if allArgs is not None and allArgs['samplesLabel'] is not None:
+            sample_labels = allArgs['samplesLabel']
+        else:
+            sample_labels = [splitext(basename(x))[0] for x in score_file_list]
 
         # Determine the group boundaries
         group_boundaries = []
@@ -555,9 +556,9 @@ def compute_sub_matrix_worker(self, chrom, start, end, score_file_list, paramete
                 if not parameters['missing data as zero']:
                     coverage[:] = np.nan
 
-            if parameters['min threshold'] and coverage.min() <= parameters['min threshold']:
+            if parameters['min threshold'] is not None and coverage.min() <= parameters['min threshold']:
                 continue
-            if parameters['max threshold'] and coverage.max() >= parameters['max threshold']:
+            if parameters['max threshold'] is not None and coverage.max() >= parameters['max threshold']:
                 continue
             if parameters['scale'] != 1:
                 coverage = parameters['scale'] * coverage

diff --git a/deeptools/plotCorrelation.py b/deeptools/plotCorrelation.py
@@ -134,6 +134,16 @@ def heatmap_options():
     parser = argparse.ArgumentParser(add_help=False)
     heatmap = parser.add_argument_group('Heatmap options')
 
+    heatmap.add_argument('--plotHeight',
+                         help='Plot height in cm.',
+                         type=float,
+                         default=9.5)
+
+    heatmap.add_argument('--plotWidth',
+                         help='Plot width in cm. The minimum value is 1 cm.',
+                         type=float,
+                         default=11)
+
     heatmap.add_argument('--zMin', '-min',
                          default=None,
                          help='Minimum value for the heatmap intensities. '
@@ -206,7 +216,9 @@ def main(args=None):
                               colormap=args.colorMap,
                               plot_title=args.plotTitle,
                               image_format=args.plotFileFormat,
-                              plot_numbers=args.plotNumbers)
+                              plot_numbers=args.plotNumbers,
+                              plotWidth=args.plotWidth,
+                              plotHeight=args.plotHeight)
 
     if args.outFileCorMatrix:
         corr.save_corr_matrix(args.outFileCorMatrix)
diff --git a/deeptools/plotCoverage.py b/deeptools/plotCoverage.py
@@ -40,7 +40,7 @@ def parse_arguments(args=None):
 
 """,
             epilog='example usages:\nplotCoverage '
-                   '--bamfiles file1.bam file2.bam -out results.png\n\n'
+                   '--bamfiles file1.bam file2.bam -o results.png\n\n'
                    ' \n\n',
             conflict_handler='resolve')
 
@@ -110,6 +110,16 @@ def required_args():
                           help='Save raw counts (coverages) to file.',
                           metavar='FILE')
 
+    optional.add_argument('--plotHeight',
+                          help='Plot height in cm.',
+                          type=float,
+                          default=5.0)
+
+    optional.add_argument('--plotWidth',
+                          help='Plot width in cm. The minimum value is 1 cm.',
+                          type=float,
+                          default=15.0)
+
     optional.add_argument('--plotFileFormat',
                           metavar='FILETYPE',
                           help='Image format type. If given, this option '
@@ -165,7 +175,7 @@ def main(args=None):
     if args.skipZeros:
         num_reads_per_bin = countR.remove_row_of_zeros(num_reads_per_bin)
 
-    fig, axs = plt.subplots(1, 2, figsize=(15, 5))
+    fig, axs = plt.subplots(1, 2, figsize=(args.plotWidth, args.plotHeight))
     plt.suptitle(args.plotTitle)
     # plot up to two std from mean
     num_reads_per_bin = num_reads_per_bin.astype(int)
@@ -212,9 +222,8 @@ def main(args=None):
                                                                   sample_max[idx],
                                                                   ))
 
-    # The 'good' x-axis is computed for each sample. The lower value is favored in which
-    # distributions with a wider x-range can better be seen.
-    y_max = min(y_max)
+    # Don't clip plots
+    y_max = max(y_max)
     axs[0].set_ylim(0, min(1, y_max + (y_max * 0.10)))
     axs[0].set_xlim(0, x_max)
     axs[0].set_xlabel('coverage (#reads per bp)')