From d47ad64166e8fdeabbf0085475d39accf391322a Mon Sep 17 00:00:00 2001
From: cellgeni <cellgeni@sanger.ac.uk>
Date: Thu, 19 Dec 2024 16:07:41 +0000
Subject: [PATCH 1/4] Made a separate script to parse SRA matadata

---
 scripts/collect_metadata.sh  | 45 ++++++++++++++++++++++++------------
 scripts/curl_ena_metadata.sh | 25 +++-----------------
 scripts/curl_sra_metadata.sh | 26 +++++++++++++++++++++
 3 files changed, 59 insertions(+), 37 deletions(-)
 create mode 100755 scripts/curl_sra_metadata.sh

diff --git a/scripts/collect_metadata.sh b/scripts/collect_metadata.sh
index 24f1c38..10b02c6 100755
--- a/scripts/collect_metadata.sh
+++ b/scripts/collect_metadata.sh
@@ -50,24 +50,30 @@ then
     >&2 echo "WARNING: No secondary run/experiment (SRP/SRX) IDs in the family.soft file; this often happens in datasets that are restricted access (dbGap, etc)."
   fi
 
-  ## curl info about each run (SRR/ERR/DRR) from ENA API; v2 pulls GSM data etc 
+  ## curl info about each run (SRR/ERR/DRR) from ENA and SRA APIs; v2 pulls GSM data etc 
   RET=1
+  RET_SRA=1
   TRIES=1
-  until (( $RET == 0 )) 
+  until (( $RET == 0)) 
   do
-    ./curl_ena_metadata.sh $SERIES
+    ./curl_ena_metadata.sh $SERIES.project.list > $SERIES.ena.tsv 
     RET=$?
 
+    if [[ $RET_SRA -eq 1 ]]
+    then
+      ./curl_sra_metadata.sh $SERIES
+      RET_SRA=$?
+    fi
+
     ## this either pulls sub-series data (and replaces $SERIES.project.list with useful PRJNA* IDs), or just quits after 5 tries
     TRIES=$((TRIES+1))
     if (( $TRIES > 5 )) 
     then
       >&2 echo "WARNING: No ENA records can be retrieved for GEO projects listed in $SERIES.project.list!"
       if [[ $SUBGSE == "" ]]
-      then 
+      then
         >&2 echo "ERROR: No GSE subseries were listed in ${SERIES}_family.soft - no alternative PRJNA* to be found, and no ENA entries can be retrieved!"
-        exit 1
-      else 
+      else
         >&2 echo "WARNING: replacing $SERIES.project.list with sub-series projects.."
         rm $SERIES.project.list 
         for i in $SUBGSE
@@ -84,22 +90,31 @@ then
         TRIES=1
         until (( $RET == 0 ))
         do
-          ./curl_ena_metadata.sh $SERIES
+          ./curl_ena_metadata.sh $SERIES.project.list > $SERIES.ena.tsv
           RET=$?
           TRIES=$((TRIES+1))
           if (( $TRIES > 5 ))
           then
-            >&2 echo "ERROR: Still no ENA records can be retrieved for the GEO SUBSERIES projects listed in $SERIES.project.list, I quit!"
-            exit 1
+            >&2 echo "ERROR: Still no ENA records can be retrieved for the GEO SUBSERIES projects listed in $SERIES.project.list!"
           fi
         done
       fi
+
+      ## ena metadata loading failed, now check if sra was loaded
+      if [[ $RET_SRA -eq 1 ]]
+      then
+        >&2 echo "ERROR: No SRA records can be retrieved for the $SERIES, I quit!"
+        exit 1
+      else
+        META="$SERIES.sra.tsv"
+        RET=0
+      fi
     fi 
     sleep 1
   done
 
-  # Checking which metadata file was loaded
-  if [[ -s $SERIES.sra.tsv ]]
+  # checking if ENA metadata file is empty
+  if [[ ! -s $SERIES.ena.tsv ]]
   then
     META="$SERIES.sra.tsv"
   fi
@@ -144,7 +159,7 @@ then
   until (( $RET == 0 )) 
   do
     ## for ArrayExpress, we query by sample ID because sdrf doesn't list the BioProject ID
-    ./curl_ena_metadata.sh $SERIES
+    ./curl_ena_metadata.sh $SERIES.sample.list > $SERIES.ena.tsv
     RET=$?
     TRIES=$((TRIES+1))
     if (( $TRIES > 5 )) 
@@ -182,7 +197,7 @@ then
   TRIES=1
   until (( $RET == 0 )) 
   do
-    ./curl_ena_metadata.sh $SERIES
+    ./curl_ena_metadata.sh $SERIES.project.list > $SERIES.ena.tsv
     RET=$?
     TRIES=$((TRIES+1))
     if (( $TRIES > 5 )) 
@@ -226,8 +241,8 @@ then
   >&2 cat $SUBSET 
   grep -f $SUBSET $SERIES.sample.list > $SERIES.sample.list.tmp
   mv $SERIES.sample.list.tmp $SERIES.sample.list
-  grep -f $SUBSET $SERIES.ena.tsv > $SERIES.ena.tsv.tmp 
-  mv $SERIES.ena.tsv.tmp $SERIES.ena.tsv 
+  grep -f $SUBSET $META > $META.tmp 
+  mv $META.tmp $META
   grep -f $SUBSET $SERIES.accessions.tsv > $SERIES.accessions.tsv.tmp 
   mv $SERIES.accessions.tsv.tmp $SERIES.accessions.tsv
 
diff --git a/scripts/curl_ena_metadata.sh b/scripts/curl_ena_metadata.sh
index 175b2e1..a2530aa 100755
--- a/scripts/curl_ena_metadata.sh
+++ b/scripts/curl_ena_metadata.sh
@@ -1,7 +1,6 @@
 #!/bin/bash -e
 
-SERIES=$1
-RUNS="$SERIES.project.list"
+RUNS=$1
 
 if [[ ! -f $RUNS ]]
 then
@@ -11,28 +10,10 @@ fi
 
 KK=`cat $RUNS`
 
-: > $SERIES.ena.tsv
-
 for i in $KK
 do
   >&2 echo "Processing run ID $i.."
-  curl "https://www.ebi.ac.uk/ena/portal/api/filereport?accession=$i&result=read_run&fields=study_accession,secondary_study_accession,sample_accession,secondary_sample_accession,experiment_accession,experiment_alias,run_accession,run_alias,tax_id,scientific_name,fastq_ftp,submitted_ftp,sra_ftp&format=tsv&download=true&limit=0" 2> /dev/null | sed '/study_accession/d' >> $SERIES.ena.tsv
+  curl "https://www.ebi.ac.uk/ena/portal/api/filereport?accession=$i&result=read_run&fields=study_accession,secondary_study_accession,sample_accession,secondary_sample_accession,experiment_accession,experiment_alias,run_accession,run_alias,tax_id,scientific_name,fastq_ftp,submitted_ftp,sra_ftp&format=tsv&download=true&limit=0" 2> /dev/null | grep -v study_accession
 done
 
-
-if [[ ! -s $SERIES.ena.tsv &&  $SERIES == GSE* ]]
-then
-  >&2 echo "WARNING: Was not able to load metadata from ENA. Loading it from SRA..."
-  rm $SERIES.ena.tsv
-  : > $SERIES.sra.tsv
-  for i in $KK
-  do
-    wget --quiet --output-document="$i.xml" "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=sra&term=${i}&usehistory=y"
-    WebEnv=$(grep -oP '<WebEnv>\K[^<]+' $i.xml)
-    QueryKey=$(grep -oP '<QueryKey>\K[^<]+' $i.xml)
-    rm -f $i.xml
-    curl "https://trace.ncbi.nlm.nih.gov/Traces/sra-db-be/sra-db-be.cgi?rettype=runinfo&WebEnv=${WebEnv}&query_key=${QueryKey}" 2> /dev/null | sed '/BioProject/d' | sed  's/,/\t/g' > $SERIES.sra.tsv
-  done
-fi
-
->&2 echo "CURL METADATA: ALL DONE!"
+>&2 echo "CURL ENA METADATA: ALL DONE!"
diff --git a/scripts/curl_sra_metadata.sh b/scripts/curl_sra_metadata.sh
new file mode 100755
index 0000000..97d9876
--- /dev/null
+++ b/scripts/curl_sra_metadata.sh
@@ -0,0 +1,26 @@
+#!/bin/bash -e
+
+SERIES=$1
+RUNS="$SERIES.project.list"
+
+if [[ ! -f $RUNS ]]
+then
+  >&2 echo "ERROR: file $RUNS not found!"
+  exit 1
+fi
+
+KK=`cat $RUNS`
+
+for i in $KK
+do
+  >&2 echo "Processing run ID $i.."
+  wget --quiet --output-document="$i.xml" "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=sra&term=${i}&usehistory=y"
+  WebEnv=$(grep -oP '<WebEnv>\K[^<]+' $i.xml)
+  QueryKey=$(grep -oP '<QueryKey>\K[^<]+' $i.xml)
+  rm -f $i.xml
+  curl "https://trace.ncbi.nlm.nih.gov/Traces/sra-db-be/sra-db-be.cgi?rettype=runinfo&WebEnv=${WebEnv}&query_key=${QueryKey}" 2> /dev/null | sed '/BioProject/d' | sed  's/,/\t/g' > $i.sra.tsv
+done
+
+cat *.sra.tsv > $SERIES.sra.tsv
+
+>&2 echo "CURL SRA METADATA: ALL DONE!"

From ec7566edd5885d666b5844fd299044d0202b8c74 Mon Sep 17 00:00:00 2001
From: cellgeni <cellgeni@sanger.ac.uk>
Date: Thu, 19 Dec 2024 16:08:31 +0000
Subject: [PATCH 2/4] Now metadata parsing script are looking for BAM files
 through SRA's API

---
 scripts/parse_ena_metadata.sh | 16 ++++++++++++++--
 scripts/parse_sra_metadata.sh | 17 +++++++++++++++--
 2 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/scripts/parse_ena_metadata.sh b/scripts/parse_ena_metadata.sh
index eedde6c..ffe60eb 100755
--- a/scripts/parse_ena_metadata.sh
+++ b/scripts/parse_ena_metadata.sh
@@ -21,8 +21,14 @@ do
   ENAGZ=`grep -w $i $SERIES.ena.tsv | cut -f11 | grep "_1\.fastq.gz" | grep "_2\.fastq.gz"`     ## ENA formatting is strict
   ORIFQ=`grep -w $i $SERIES.ena.tsv | cut -f12 | grep "f.*q"`                                   ## ppl name files *all kinds of random shiz*, really
   ORIBAM=`grep -w $i $SERIES.ena.tsv | cut -f12 | tr ';' '\n' | grep -v "\.bai" | grep "\.bam"` ## don't need the BAM index which is often there
-  SRA=`grep -w $i $SERIES.ena.tsv | cut -f13` 
-  
+  SRA=`grep -w $i $SERIES.ena.tsv | cut -f13`
+  SRABAM=`curl -s "https://locate.ncbi.nlm.nih.gov/sdl/2/retrieve?acc=$i&accept-alternate-locations=yes" | jq -r '
+          .result[].files[] | 
+          select(.name | contains("bam")) | 
+          .locations[] | 
+          select((.rehydrationRequired // false) == false and (.payRequired // false) == false) | 
+          .link
+        '`
 
   if [[ $AEGZ != "" ]]
   then
@@ -48,6 +54,12 @@ do
     LOC=$ORIBAM
     echo $ORIBAM >> $SERIES.urls.list
     >&2 echo "Sample $i is available via ENA as an original submitter's BAM file: $LOC"
+  elif [[ $SRABAM != "" ]]
+  then
+    TYPE="BAM"
+    LOC=$SRABAM
+    echo $SRABAM >> $SERIES.urls.list
+    >&2 echo "Sample $i is available via SRA as an original submitter's BAM file: $LOC"
   elif [[ $SRA != "" ]]
   then 
     TYPE="SRA"
diff --git a/scripts/parse_sra_metadata.sh b/scripts/parse_sra_metadata.sh
index f1bf334..2f1d9dc 100755
--- a/scripts/parse_sra_metadata.sh
+++ b/scripts/parse_sra_metadata.sh
@@ -15,8 +15,21 @@ do
 
   SPECIES=`grep -w $i $SERIES.sra.tsv | cut -f29`
   SRA=`grep -w $i $SERIES.sra.tsv | cut -f10` 
-  
-  if [[ $SRA != "" ]]
+  SRABAM=`curl -s "https://locate.ncbi.nlm.nih.gov/sdl/2/retrieve?acc=$i&accept-alternate-locations=yes" | jq -r '
+          .result[].files[] | 
+          select(.name | contains("bam")) | 
+          .locations[] | 
+          select((.rehydrationRequired // false) == false and (.payRequired // false) == false) | 
+          .link
+        '`
+
+  if [[ $SRABAM != "" ]]
+  then
+    TYPE="BAM"
+    LOC=$SRABAM
+    echo $SRABAM >> $SERIES.urls.list
+    >&2 echo "Sample $i is available via SRA as an original submitter's BAM file: $LOC"
+  elif [[ $SRA != "" ]]
   then 
     LOC=$SRA
     echo $SRA >> $SERIES.urls.list

From 486ee1f3e000467e89f56922304cb7977379ee2a Mon Sep 17 00:00:00 2001
From: cellgeni <cellgeni@sanger.ac.uk>
Date: Mon, 13 Jan 2025 15:30:43 +0000
Subject: [PATCH 3/4] Added memory sharing to starsolo runs

---
 scripts/starsolo_10x_auto.sh | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/scripts/starsolo_10x_auto.sh b/scripts/starsolo_10x_auto.sh
index 0259c96..dcec9dd 100755
--- a/scripts/starsolo_10x_auto.sh
+++ b/scripts/starsolo_10x_auto.sh
@@ -204,6 +204,10 @@ then
   >&2 echo "WARNING: Read 1 length ($R1LEN) is more than the sum of appropriate barcode and UMI ($BCUMI)."
 fi
 
+## Load genome intho shared memory for StarSolo use
+
+$CMD STAR --genomeDir $REF --genomeLoad LoadAndExit
+
 ## it's hard to come up with a universal rule to correctly infer strand-specificity of the experiment. 
 ## this is the best I could come up with: 1) check if fraction of test reads (200k random ones) maps to GeneFull forward strand 
 ## with higher than 50% probability; 2) if not, run the same quantification with "--soloStand Reverse" and calculate the same stat; 
@@ -213,14 +217,14 @@ STRAND=Forward
 
 $CMD STAR --runThreadN $CPUS --genomeDir $REF --readFilesIn test.R2.fastq test.R1.fastq --runDirPerm All_RWX --outSAMtype None \
      --soloType CB_UMI_Simple --soloCBwhitelist $BC --soloBarcodeReadLength 0 --soloCBlen $CBLEN --soloUMIstart $((CBLEN+1)) \
-     --soloUMIlen $UMILEN --soloStrand Forward \
+     --soloUMIlen $UMILEN --soloStrand Forward --genomeLoad LoadAndKeep \
      --soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts --soloUMIfiltering MultiGeneUMI_CR \
      --soloCellFilter EmptyDrops_CR --clipAdapterType CellRanger4 --outFilterScoreMin 30 \
      --soloFeatures Gene GeneFull --soloOutFileNames test_forward/ features.tsv barcodes.tsv matrix.mtx &> /dev/null 
 
 $CMD STAR --runThreadN $CPUS --genomeDir $REF --readFilesIn test.R2.fastq test.R1.fastq --runDirPerm All_RWX --outSAMtype None \
      --soloType CB_UMI_Simple --soloCBwhitelist $BC --soloBarcodeReadLength 0 --soloCBlen $CBLEN --soloUMIstart $((CBLEN+1)) \
-     --soloUMIlen $UMILEN --soloStrand Reverse \
+     --soloUMIlen $UMILEN --soloStrand Reverse --genomeLoad LoadAndKeep \
      --soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts --soloUMIfiltering MultiGeneUMI_CR \
      --soloCellFilter EmptyDrops_CR --clipAdapterType CellRanger4 --outFilterScoreMin 30 \
      --soloFeatures Gene GeneFull --soloOutFileNames test_reverse/ features.tsv barcodes.tsv matrix.mtx &> /dev/null
@@ -266,16 +270,20 @@ then
   $CMD STAR --runThreadN $CPUS --genomeDir $REF --readFilesIn $R1 $R2 --runDirPerm All_RWX $GZIP $BAM --soloBarcodeMate 1 --clip5pNbases 39 0 \
      --soloType CB_UMI_Simple --soloCBwhitelist $BC --soloCBstart 1 --soloCBlen $CBLEN --soloUMIstart $((CBLEN+1)) --soloUMIlen $UMILEN --soloStrand Forward \
      --soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts --soloUMIfiltering MultiGeneUMI_CR \
-     --soloCellFilter EmptyDrops_CR --outFilterScoreMin 30 \
+     --soloCellFilter EmptyDrops_CR --outFilterScoreMin 30 --genomeLoad LoadAndKeep \
      --soloFeatures Gene GeneFull Velocyto --soloOutFileNames output/ features.tsv barcodes.tsv matrix.mtx --soloMultiMappers EM --outReadsUnmapped Fastx
 else 
   $CMD STAR --runThreadN $CPUS --genomeDir $REF --readFilesIn $R2 $R1 --runDirPerm All_RWX $GZIP $BAM \
      --soloType CB_UMI_Simple --soloCBwhitelist $BC --soloBarcodeReadLength 0 --soloCBlen $CBLEN --soloUMIstart $((CBLEN+1)) --soloUMIlen $UMILEN --soloStrand $STRAND \
      --soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts --soloUMIfiltering MultiGeneUMI_CR \
-     --soloCellFilter EmptyDrops_CR --clipAdapterType CellRanger4 --outFilterScoreMin 30 \
+     --soloCellFilter EmptyDrops_CR --clipAdapterType CellRanger4 --outFilterScoreMin 30 --genomeLoad LoadAndKeep \
      --soloFeatures Gene GeneFull Velocyto --soloOutFileNames output/ features.tsv barcodes.tsv matrix.mtx --soloMultiMappers EM --outReadsUnmapped Fastx
 fi
 
+## remove genome from shared memory
+
+$CMD STAR --genomeDir $REF --genomeLoad Remove
+
 ## index the BAM file
 if [[ -s Aligned.sortedByCoord.out.bam ]]
 then

From 51ce855bf2f02e4f08d57edf9242c26ca9f6ef21 Mon Sep 17 00:00:00 2001
From: cellgeni <cellgeni@sanger.ac.uk>
Date: Tue, 14 Jan 2025 14:12:23 +0000
Subject: [PATCH 4/4] Removed explicit genome Load and Removal

---
 scripts/starsolo_10x_auto.sh | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/scripts/starsolo_10x_auto.sh b/scripts/starsolo_10x_auto.sh
index dcec9dd..1146fde 100755
--- a/scripts/starsolo_10x_auto.sh
+++ b/scripts/starsolo_10x_auto.sh
@@ -204,9 +204,6 @@ then
   >&2 echo "WARNING: Read 1 length ($R1LEN) is more than the sum of appropriate barcode and UMI ($BCUMI)."
 fi
 
-## Load genome intho shared memory for StarSolo use
-
-$CMD STAR --genomeDir $REF --genomeLoad LoadAndExit
 
 ## it's hard to come up with a universal rule to correctly infer strand-specificity of the experiment. 
 ## this is the best I could come up with: 1) check if fraction of test reads (200k random ones) maps to GeneFull forward strand 
@@ -270,20 +267,16 @@ then
   $CMD STAR --runThreadN $CPUS --genomeDir $REF --readFilesIn $R1 $R2 --runDirPerm All_RWX $GZIP $BAM --soloBarcodeMate 1 --clip5pNbases 39 0 \
      --soloType CB_UMI_Simple --soloCBwhitelist $BC --soloCBstart 1 --soloCBlen $CBLEN --soloUMIstart $((CBLEN+1)) --soloUMIlen $UMILEN --soloStrand Forward \
      --soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts --soloUMIfiltering MultiGeneUMI_CR \
-     --soloCellFilter EmptyDrops_CR --outFilterScoreMin 30 --genomeLoad LoadAndKeep \
+     --soloCellFilter EmptyDrops_CR --outFilterScoreMin 30 --genomeLoad LoadAndRemove \
      --soloFeatures Gene GeneFull Velocyto --soloOutFileNames output/ features.tsv barcodes.tsv matrix.mtx --soloMultiMappers EM --outReadsUnmapped Fastx
 else 
   $CMD STAR --runThreadN $CPUS --genomeDir $REF --readFilesIn $R2 $R1 --runDirPerm All_RWX $GZIP $BAM \
      --soloType CB_UMI_Simple --soloCBwhitelist $BC --soloBarcodeReadLength 0 --soloCBlen $CBLEN --soloUMIstart $((CBLEN+1)) --soloUMIlen $UMILEN --soloStrand $STRAND \
      --soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts --soloUMIfiltering MultiGeneUMI_CR \
-     --soloCellFilter EmptyDrops_CR --clipAdapterType CellRanger4 --outFilterScoreMin 30 --genomeLoad LoadAndKeep \
+     --soloCellFilter EmptyDrops_CR --clipAdapterType CellRanger4 --outFilterScoreMin 30 --genomeLoad LoadAndRemove \
      --soloFeatures Gene GeneFull Velocyto --soloOutFileNames output/ features.tsv barcodes.tsv matrix.mtx --soloMultiMappers EM --outReadsUnmapped Fastx
 fi
 
-## remove genome from shared memory
-
-$CMD STAR --genomeDir $REF --genomeLoad Remove
-
 ## index the BAM file
 if [[ -s Aligned.sortedByCoord.out.bam ]]
 then