Merge pull request #6 from cellgeni/bam_metadata

Let's help Dora find BAM files
cellgeni · Jan 14, 2025 · 41f1977 · 41f1977
2 parents 0bc7ec1 + 51ce855
commit 41f1977
Show file tree

Hide file tree

Showing 6 changed files with 89 additions and 41 deletions.
diff --git a/scripts/collect_metadata.sh b/scripts/collect_metadata.sh
@@ -50,24 +50,30 @@ then
     >&2 echo "WARNING: No secondary run/experiment (SRP/SRX) IDs in the family.soft file; this often happens in datasets that are restricted access (dbGap, etc)."
   fi
 
-  ## curl info about each run (SRR/ERR/DRR) from ENA API; v2 pulls GSM data etc 
+  ## curl info about each run (SRR/ERR/DRR) from ENA and SRA APIs; v2 pulls GSM data etc 
   RET=1
+  RET_SRA=1
   TRIES=1
-  until (( $RET == 0 )) 
+  until (( $RET == 0)) 
   do
-    ./curl_ena_metadata.sh $SERIES
+    ./curl_ena_metadata.sh $SERIES.project.list > $SERIES.ena.tsv 
     RET=$?
 
+    if [[ $RET_SRA -eq 1 ]]
+    then
+      ./curl_sra_metadata.sh $SERIES
+      RET_SRA=$?
+    fi
+
     ## this either pulls sub-series data (and replaces $SERIES.project.list with useful PRJNA* IDs), or just quits after 5 tries
     TRIES=$((TRIES+1))
     if (( $TRIES > 5 )) 
     then
       >&2 echo "WARNING: No ENA records can be retrieved for GEO projects listed in $SERIES.project.list!"
       if [[ $SUBGSE == "" ]]
-      then 
+      then
         >&2 echo "ERROR: No GSE subseries were listed in ${SERIES}_family.soft - no alternative PRJNA* to be found, and no ENA entries can be retrieved!"
-        exit 1
-      else 
+      else
         >&2 echo "WARNING: replacing $SERIES.project.list with sub-series projects.."
         rm $SERIES.project.list 
         for i in $SUBGSE
@@ -84,22 +90,31 @@ then
         TRIES=1
         until (( $RET == 0 ))
         do
-          ./curl_ena_metadata.sh $SERIES
+          ./curl_ena_metadata.sh $SERIES.project.list > $SERIES.ena.tsv
           RET=$?
           TRIES=$((TRIES+1))
           if (( $TRIES > 5 ))
           then
-            >&2 echo "ERROR: Still no ENA records can be retrieved for the GEO SUBSERIES projects listed in $SERIES.project.list, I quit!"
-            exit 1
+            >&2 echo "ERROR: Still no ENA records can be retrieved for the GEO SUBSERIES projects listed in $SERIES.project.list!"
           fi
         done
       fi
+
+      ## ena metadata loading failed, now check if sra was loaded
+      if [[ $RET_SRA -eq 1 ]]
+      then
+        >&2 echo "ERROR: No SRA records can be retrieved for the $SERIES, I quit!"
+        exit 1
+      else
+        META="$SERIES.sra.tsv"
+        RET=0
+      fi
     fi 
     sleep 1
   done
 
-  # Checking which metadata file was loaded
-  if [[ -s $SERIES.sra.tsv ]]
+  # checking if ENA metadata file is empty
+  if [[ ! -s $SERIES.ena.tsv ]]
   then
     META="$SERIES.sra.tsv"
   fi
@@ -144,7 +159,7 @@ then
   until (( $RET == 0 )) 
   do
     ## for ArrayExpress, we query by sample ID because sdrf doesn't list the BioProject ID
-    ./curl_ena_metadata.sh $SERIES
+    ./curl_ena_metadata.sh $SERIES.sample.list > $SERIES.ena.tsv
     RET=$?
     TRIES=$((TRIES+1))
     if (( $TRIES > 5 )) 
@@ -182,7 +197,7 @@ then
   TRIES=1
   until (( $RET == 0 )) 
   do
-    ./curl_ena_metadata.sh $SERIES
+    ./curl_ena_metadata.sh $SERIES.project.list > $SERIES.ena.tsv
     RET=$?
     TRIES=$((TRIES+1))
     if (( $TRIES > 5 )) 
@@ -226,8 +241,8 @@ then
   >&2 cat $SUBSET 
   grep -f $SUBSET $SERIES.sample.list > $SERIES.sample.list.tmp
   mv $SERIES.sample.list.tmp $SERIES.sample.list
-  grep -f $SUBSET $SERIES.ena.tsv > $SERIES.ena.tsv.tmp 
-  mv $SERIES.ena.tsv.tmp $SERIES.ena.tsv 
+  grep -f $SUBSET $META > $META.tmp 
+  mv $META.tmp $META
   grep -f $SUBSET $SERIES.accessions.tsv > $SERIES.accessions.tsv.tmp 
   mv $SERIES.accessions.tsv.tmp $SERIES.accessions.tsv
 

diff --git a/scripts/curl_ena_metadata.sh b/scripts/curl_ena_metadata.sh
@@ -1,7 +1,6 @@
 #!/bin/bash -e
 
-SERIES=$1
-RUNS="$SERIES.project.list"
+RUNS=$1
 
 if [[ ! -f $RUNS ]]
 then
@@ -11,28 +10,10 @@ fi
 
 KK=`cat $RUNS`
 
-: > $SERIES.ena.tsv
-
 for i in $KK
 do
   >&2 echo "Processing run ID $i.."
-  curl "https://www.ebi.ac.uk/ena/portal/api/filereport?accession=$i&result=read_run&fields=study_accession,secondary_study_accession,sample_accession,secondary_sample_accession,experiment_accession,experiment_alias,run_accession,run_alias,tax_id,scientific_name,fastq_ftp,submitted_ftp,sra_ftp&format=tsv&download=true&limit=0" 2> /dev/null | sed '/study_accession/d' >> $SERIES.ena.tsv
+  curl "https://www.ebi.ac.uk/ena/portal/api/filereport?accession=$i&result=read_run&fields=study_accession,secondary_study_accession,sample_accession,secondary_sample_accession,experiment_accession,experiment_alias,run_accession,run_alias,tax_id,scientific_name,fastq_ftp,submitted_ftp,sra_ftp&format=tsv&download=true&limit=0" 2> /dev/null | grep -v study_accession
 done
 
-
-if [[ ! -s $SERIES.ena.tsv &&  $SERIES == GSE* ]]
-then
-  >&2 echo "WARNING: Was not able to load metadata from ENA. Loading it from SRA..."
-  rm $SERIES.ena.tsv
-  : > $SERIES.sra.tsv
-  for i in $KK
-  do
-    wget --quiet --output-document="$i.xml" "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=sra&term=${i}&usehistory=y"
-    WebEnv=$(grep -oP '<WebEnv>\K[^<]+' $i.xml)
-    QueryKey=$(grep -oP '<QueryKey>\K[^<]+' $i.xml)
-    rm -f $i.xml
-    curl "https://trace.ncbi.nlm.nih.gov/Traces/sra-db-be/sra-db-be.cgi?rettype=runinfo&WebEnv=${WebEnv}&query_key=${QueryKey}" 2> /dev/null | sed '/BioProject/d' | sed  's/,/\t/g' > $SERIES.sra.tsv
-  done
-fi
-
->&2 echo "CURL METADATA: ALL DONE!"
+>&2 echo "CURL ENA METADATA: ALL DONE!"
diff --git a/scripts/curl_sra_metadata.sh b/scripts/curl_sra_metadata.sh
@@ -0,0 +1,26 @@
+#!/bin/bash -e
+
+SERIES=$1
+RUNS="$SERIES.project.list"
+
+if [[ ! -f $RUNS ]]
+then
+  >&2 echo "ERROR: file $RUNS not found!"
+  exit 1
+fi
+
+KK=`cat $RUNS`
+
+for i in $KK
+do
+  >&2 echo "Processing run ID $i.."
+  wget --quiet --output-document="$i.xml" "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=sra&term=${i}&usehistory=y"
+  WebEnv=$(grep -oP '<WebEnv>\K[^<]+' $i.xml)
+  QueryKey=$(grep -oP '<QueryKey>\K[^<]+' $i.xml)
+  rm -f $i.xml
+  curl "https://trace.ncbi.nlm.nih.gov/Traces/sra-db-be/sra-db-be.cgi?rettype=runinfo&WebEnv=${WebEnv}&query_key=${QueryKey}" 2> /dev/null | sed '/BioProject/d' | sed  's/,/\t/g' > $i.sra.tsv
+done
+
+cat *.sra.tsv > $SERIES.sra.tsv
+
+>&2 echo "CURL SRA METADATA: ALL DONE!"
diff --git a/scripts/parse_ena_metadata.sh b/scripts/parse_ena_metadata.sh
@@ -21,8 +21,14 @@ do
   ENAGZ=`grep -w $i $SERIES.ena.tsv | cut -f11 | grep "_1\.fastq.gz" | grep "_2\.fastq.gz"`     ## ENA formatting is strict
   ORIFQ=`grep -w $i $SERIES.ena.tsv | cut -f12 | grep "f.*q"`                                   ## ppl name files *all kinds of random shiz*, really
   ORIBAM=`grep -w $i $SERIES.ena.tsv | cut -f12 | tr ';' '\n' | grep -v "\.bai" | grep "\.bam"` ## don't need the BAM index which is often there
-  SRA=`grep -w $i $SERIES.ena.tsv | cut -f13` 
-
+  SRA=`grep -w $i $SERIES.ena.tsv | cut -f13`
+  SRABAM=`curl -s "https://locate.ncbi.nlm.nih.gov/sdl/2/retrieve?acc=$i&accept-alternate-locations=yes" | jq -r '
+          .result[].files[] | 
+          select(.name | contains("bam")) | 
+          .locations[] | 
+          select((.rehydrationRequired // false) == false and (.payRequired // false) == false) | 
+          .link
+        '`
 
   if [[ $AEGZ != "" ]]
   then
@@ -48,6 +54,12 @@ do
     LOC=$ORIBAM
     echo $ORIBAM >> $SERIES.urls.list
     >&2 echo "Sample $i is available via ENA as an original submitter's BAM file: $LOC"
+  elif [[ $SRABAM != "" ]]
+  then
+    TYPE="BAM"
+    LOC=$SRABAM
+    echo $SRABAM >> $SERIES.urls.list
+    >&2 echo "Sample $i is available via SRA as an original submitter's BAM file: $LOC"
   elif [[ $SRA != "" ]]
   then 
     TYPE="SRA"

diff --git a/scripts/parse_sra_metadata.sh b/scripts/parse_sra_metadata.sh
@@ -15,8 +15,21 @@ do
 
   SPECIES=`grep -w $i $SERIES.sra.tsv | cut -f29`
   SRA=`grep -w $i $SERIES.sra.tsv | cut -f10` 
-
-  if [[ $SRA != "" ]]
+  SRABAM=`curl -s "https://locate.ncbi.nlm.nih.gov/sdl/2/retrieve?acc=$i&accept-alternate-locations=yes" | jq -r '
+          .result[].files[] | 
+          select(.name | contains("bam")) | 
+          .locations[] | 
+          select((.rehydrationRequired // false) == false and (.payRequired // false) == false) | 
+          .link
+        '`
+
+  if [[ $SRABAM != "" ]]
+  then
+    TYPE="BAM"
+    LOC=$SRABAM
+    echo $SRABAM >> $SERIES.urls.list
+    >&2 echo "Sample $i is available via SRA as an original submitter's BAM file: $LOC"
+  elif [[ $SRA != "" ]]
   then 
     LOC=$SRA
     echo $SRA >> $SERIES.urls.list

diff --git a/scripts/starsolo_10x_auto.sh b/scripts/starsolo_10x_auto.sh
@@ -204,6 +204,7 @@ then
   >&2 echo "WARNING: Read 1 length ($R1LEN) is more than the sum of appropriate barcode and UMI ($BCUMI)."
 fi
 
+
 ## it's hard to come up with a universal rule to correctly infer strand-specificity of the experiment. 
 ## this is the best I could come up with: 1) check if fraction of test reads (200k random ones) maps to GeneFull forward strand 
 ## with higher than 50% probability; 2) if not, run the same quantification with "--soloStand Reverse" and calculate the same stat;