Handle very large number of files using find cmd

cokelaer · cokelaer · commit 394f7ad3c59b · 2023-05-17T10:40:26.000+02:00
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -40,11 +40,6 @@ jobs:
         # $CONDA is an environment variable pointing to the root of the miniconda directory
         echo $CONDA/bin >> $GITHUB_PATH
 
-    #- name: conda
-    #  run: |
-    #    conda install -c conda-forge -c bioconda --quiet -y python=${{ matrix.python }} mamba
-    #    mamba install pycoqc
-
     - name: Install dependencies
       run: |
         pip install pycoQC
diff --git a/README.rst b/README.rst
@@ -147,6 +147,8 @@ Changelog
 ========= ====================================================================
 Version   Description
 ========= ====================================================================
+1.2.0     * handle large promethion runs by using find+cat instead of just
+            cat to cope with a very large number of input files.
 1.1.0     * add subsample option and set to 1,000,000 reads to handle large 
             runs such as promethion
 1.0.1     * CSV can now handle sample or samplename column name in samplesheet.
diff --git a/requirements.txt b/requirements.txt
@@ -1,2 +1,2 @@
-sequana_pipetools>=0.8.1
+sequana_pipetools>=0.12.2
 sequana
diff --git a/sequana_pipelines/nanomerge/nanomerge.rules b/sequana_pipelines/nanomerge/nanomerge.rules
@@ -96,6 +96,13 @@ def get_input_merge(wildcards):
         filenames = list((input_directory).glob(input_pattern))
     return filenames
 
+def get_input_directory(wildcards):
+    if samples.barcoded:
+        barcode = samples.get_barcode_from_sample(wildcards.sample)
+        return input_directory / barcode
+    else:
+        return input_directory
+
 
 if config["summary"]:
     rule pyco:
@@ -120,9 +127,11 @@ rule merge:
         get_input_merge
     output:
         "./{project}/{sample}.fastq.gz"
+    params:
+        indir=get_input_directory
     shell:
         """
-        cat {input} > {output}
+        find {params.indir} -type f -name "*fastq.gz" -exec cat {{}} + > {output}
         """
 
 
@@ -168,7 +177,7 @@ rule html_report:
                 pycodata = fout.read()
                 pycodata = '<div class="columns">' + pycodata.split('<div class="columns">')[-1].replace("</div>\n</body>\n</html>","")
 
-            s = SummaryModule2(data, f"Your data are available in {dirs} directories" + pycodata)
+            s = SummaryModule2(data, f"""Your data are available in {dirs} directories. Please see the summary plots here below (if sequence summary was provided), generated with <a href="https://github.com/a-slide/pycoQC">pycoQC</a> software.""" + pycodata)
         else:
             s = SummaryModule2(data, f"no summary found. Please checkout the sub directories {dirs}. They should contain your final fastq files for each project.")
 
diff --git a/setup.py b/setup.py
@@ -10,7 +10,7 @@
 
 
 _MAJOR               = 1
-_MINOR               = 1
+_MINOR               = 2
 _MICRO               = 0
 version = f"{_MAJOR}.{_MINOR}.{_MICRO}"
 release = f"{_MAJOR}.{_MINOR}"

Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,2 @@`
`1`		`-sequana_pipetools>=0.8.1`
	`1`	`+sequana_pipetools>=0.12.2`
`2`	`2`	`sequana`