Skip to content

Commit

Permalink
Merge pull request #6 from cellgeni/bam_metadata
Browse files Browse the repository at this point in the history
Let's help Dora find BAM files
  • Loading branch information
Claptar authored Jan 14, 2025
2 parents 0bc7ec1 + 51ce855 commit 41f1977
Show file tree
Hide file tree
Showing 6 changed files with 89 additions and 41 deletions.
45 changes: 30 additions & 15 deletions scripts/collect_metadata.sh
Original file line number Diff line number Diff line change
Expand Up @@ -50,24 +50,30 @@ then
>&2 echo "WARNING: No secondary run/experiment (SRP/SRX) IDs in the family.soft file; this often happens in datasets that are restricted access (dbGap, etc)."
fi

## curl info about each run (SRR/ERR/DRR) from ENA API; v2 pulls GSM data etc
## curl info about each run (SRR/ERR/DRR) from ENA and SRA APIs; v2 pulls GSM data etc
RET=1
RET_SRA=1
TRIES=1
until (( $RET == 0 ))
until (( $RET == 0))
do
./curl_ena_metadata.sh $SERIES
./curl_ena_metadata.sh $SERIES.project.list > $SERIES.ena.tsv
RET=$?

if [[ $RET_SRA -eq 1 ]]
then
./curl_sra_metadata.sh $SERIES
RET_SRA=$?
fi

## this either pulls sub-series data (and replaces $SERIES.project.list with useful PRJNA* IDs), or just quits after 5 tries
TRIES=$((TRIES+1))
if (( $TRIES > 5 ))
then
>&2 echo "WARNING: No ENA records can be retrieved for GEO projects listed in $SERIES.project.list!"
if [[ $SUBGSE == "" ]]
then
then
>&2 echo "ERROR: No GSE subseries were listed in ${SERIES}_family.soft - no alternative PRJNA* to be found, and no ENA entries can be retrieved!"
exit 1
else
else
>&2 echo "WARNING: replacing $SERIES.project.list with sub-series projects.."
rm $SERIES.project.list
for i in $SUBGSE
Expand All @@ -84,22 +90,31 @@ then
TRIES=1
until (( $RET == 0 ))
do
./curl_ena_metadata.sh $SERIES
./curl_ena_metadata.sh $SERIES.project.list > $SERIES.ena.tsv
RET=$?
TRIES=$((TRIES+1))
if (( $TRIES > 5 ))
then
>&2 echo "ERROR: Still no ENA records can be retrieved for the GEO SUBSERIES projects listed in $SERIES.project.list, I quit!"
exit 1
>&2 echo "ERROR: Still no ENA records can be retrieved for the GEO SUBSERIES projects listed in $SERIES.project.list!"
fi
done
fi

## ena metadata loading failed, now check if sra was loaded
if [[ $RET_SRA -eq 1 ]]
then
>&2 echo "ERROR: No SRA records can be retrieved for the $SERIES, I quit!"
exit 1
else
META="$SERIES.sra.tsv"
RET=0
fi
fi
sleep 1
done

# Checking which metadata file was loaded
if [[ -s $SERIES.sra.tsv ]]
# checking if ENA metadata file is empty
if [[ ! -s $SERIES.ena.tsv ]]
then
META="$SERIES.sra.tsv"
fi
Expand Down Expand Up @@ -144,7 +159,7 @@ then
until (( $RET == 0 ))
do
## for ArrayExpress, we query by sample ID because sdrf doesn't list the BioProject ID
./curl_ena_metadata.sh $SERIES
./curl_ena_metadata.sh $SERIES.sample.list > $SERIES.ena.tsv
RET=$?
TRIES=$((TRIES+1))
if (( $TRIES > 5 ))
Expand Down Expand Up @@ -182,7 +197,7 @@ then
TRIES=1
until (( $RET == 0 ))
do
./curl_ena_metadata.sh $SERIES
./curl_ena_metadata.sh $SERIES.project.list > $SERIES.ena.tsv
RET=$?
TRIES=$((TRIES+1))
if (( $TRIES > 5 ))
Expand Down Expand Up @@ -226,8 +241,8 @@ then
>&2 cat $SUBSET
grep -f $SUBSET $SERIES.sample.list > $SERIES.sample.list.tmp
mv $SERIES.sample.list.tmp $SERIES.sample.list
grep -f $SUBSET $SERIES.ena.tsv > $SERIES.ena.tsv.tmp
mv $SERIES.ena.tsv.tmp $SERIES.ena.tsv
grep -f $SUBSET $META > $META.tmp
mv $META.tmp $META
grep -f $SUBSET $SERIES.accessions.tsv > $SERIES.accessions.tsv.tmp
mv $SERIES.accessions.tsv.tmp $SERIES.accessions.tsv

Expand Down
25 changes: 3 additions & 22 deletions scripts/curl_ena_metadata.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
#!/bin/bash -e

SERIES=$1
RUNS="$SERIES.project.list"
RUNS=$1

if [[ ! -f $RUNS ]]
then
Expand All @@ -11,28 +10,10 @@ fi

KK=`cat $RUNS`

: > $SERIES.ena.tsv

for i in $KK
do
>&2 echo "Processing run ID $i.."
curl "https://www.ebi.ac.uk/ena/portal/api/filereport?accession=$i&result=read_run&fields=study_accession,secondary_study_accession,sample_accession,secondary_sample_accession,experiment_accession,experiment_alias,run_accession,run_alias,tax_id,scientific_name,fastq_ftp,submitted_ftp,sra_ftp&format=tsv&download=true&limit=0" 2> /dev/null | sed '/study_accession/d' >> $SERIES.ena.tsv
curl "https://www.ebi.ac.uk/ena/portal/api/filereport?accession=$i&result=read_run&fields=study_accession,secondary_study_accession,sample_accession,secondary_sample_accession,experiment_accession,experiment_alias,run_accession,run_alias,tax_id,scientific_name,fastq_ftp,submitted_ftp,sra_ftp&format=tsv&download=true&limit=0" 2> /dev/null | grep -v study_accession
done


if [[ ! -s $SERIES.ena.tsv && $SERIES == GSE* ]]
then
>&2 echo "WARNING: Was not able to load metadata from ENA. Loading it from SRA..."
rm $SERIES.ena.tsv
: > $SERIES.sra.tsv
for i in $KK
do
wget --quiet --output-document="$i.xml" "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=sra&term=${i}&usehistory=y"
WebEnv=$(grep -oP '<WebEnv>\K[^<]+' $i.xml)
QueryKey=$(grep -oP '<QueryKey>\K[^<]+' $i.xml)
rm -f $i.xml
curl "https://trace.ncbi.nlm.nih.gov/Traces/sra-db-be/sra-db-be.cgi?rettype=runinfo&WebEnv=${WebEnv}&query_key=${QueryKey}" 2> /dev/null | sed '/BioProject/d' | sed 's/,/\t/g' > $SERIES.sra.tsv
done
fi

>&2 echo "CURL METADATA: ALL DONE!"
>&2 echo "CURL ENA METADATA: ALL DONE!"
26 changes: 26 additions & 0 deletions scripts/curl_sra_metadata.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
#!/bin/bash -e

SERIES=$1
RUNS="$SERIES.project.list"

if [[ ! -f $RUNS ]]
then
>&2 echo "ERROR: file $RUNS not found!"
exit 1
fi

KK=`cat $RUNS`

for i in $KK
do
>&2 echo "Processing run ID $i.."
wget --quiet --output-document="$i.xml" "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=sra&term=${i}&usehistory=y"
WebEnv=$(grep -oP '<WebEnv>\K[^<]+' $i.xml)
QueryKey=$(grep -oP '<QueryKey>\K[^<]+' $i.xml)
rm -f $i.xml
curl "https://trace.ncbi.nlm.nih.gov/Traces/sra-db-be/sra-db-be.cgi?rettype=runinfo&WebEnv=${WebEnv}&query_key=${QueryKey}" 2> /dev/null | sed '/BioProject/d' | sed 's/,/\t/g' > $i.sra.tsv
done

cat *.sra.tsv > $SERIES.sra.tsv

>&2 echo "CURL SRA METADATA: ALL DONE!"
16 changes: 14 additions & 2 deletions scripts/parse_ena_metadata.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,14 @@ do
ENAGZ=`grep -w $i $SERIES.ena.tsv | cut -f11 | grep "_1\.fastq.gz" | grep "_2\.fastq.gz"` ## ENA formatting is strict
ORIFQ=`grep -w $i $SERIES.ena.tsv | cut -f12 | grep "f.*q"` ## ppl name files *all kinds of random shiz*, really
ORIBAM=`grep -w $i $SERIES.ena.tsv | cut -f12 | tr ';' '\n' | grep -v "\.bai" | grep "\.bam"` ## don't need the BAM index which is often there
SRA=`grep -w $i $SERIES.ena.tsv | cut -f13`

SRA=`grep -w $i $SERIES.ena.tsv | cut -f13`
SRABAM=`curl -s "https://locate.ncbi.nlm.nih.gov/sdl/2/retrieve?acc=$i&accept-alternate-locations=yes" | jq -r '
.result[].files[] |
select(.name | contains("bam")) |
.locations[] |
select((.rehydrationRequired // false) == false and (.payRequired // false) == false) |
.link
'`

if [[ $AEGZ != "" ]]
then
Expand All @@ -48,6 +54,12 @@ do
LOC=$ORIBAM
echo $ORIBAM >> $SERIES.urls.list
>&2 echo "Sample $i is available via ENA as an original submitter's BAM file: $LOC"
elif [[ $SRABAM != "" ]]
then
TYPE="BAM"
LOC=$SRABAM
echo $SRABAM >> $SERIES.urls.list
>&2 echo "Sample $i is available via SRA as an original submitter's BAM file: $LOC"
elif [[ $SRA != "" ]]
then
TYPE="SRA"
Expand Down
17 changes: 15 additions & 2 deletions scripts/parse_sra_metadata.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,21 @@ do

SPECIES=`grep -w $i $SERIES.sra.tsv | cut -f29`
SRA=`grep -w $i $SERIES.sra.tsv | cut -f10`

if [[ $SRA != "" ]]
SRABAM=`curl -s "https://locate.ncbi.nlm.nih.gov/sdl/2/retrieve?acc=$i&accept-alternate-locations=yes" | jq -r '
.result[].files[] |
select(.name | contains("bam")) |
.locations[] |
select((.rehydrationRequired // false) == false and (.payRequired // false) == false) |
.link
'`

if [[ $SRABAM != "" ]]
then
TYPE="BAM"
LOC=$SRABAM
echo $SRABAM >> $SERIES.urls.list
>&2 echo "Sample $i is available via SRA as an original submitter's BAM file: $LOC"
elif [[ $SRA != "" ]]
then
LOC=$SRA
echo $SRA >> $SERIES.urls.list
Expand Down
1 change: 1 addition & 0 deletions scripts/starsolo_10x_auto.sh
Original file line number Diff line number Diff line change
Expand Up @@ -204,6 +204,7 @@ then
>&2 echo "WARNING: Read 1 length ($R1LEN) is more than the sum of appropriate barcode and UMI ($BCUMI)."
fi


## it's hard to come up with a universal rule to correctly infer strand-specificity of the experiment.
## this is the best I could come up with: 1) check if fraction of test reads (200k random ones) maps to GeneFull forward strand
## with higher than 50% probability; 2) if not, run the same quantification with "--soloStand Reverse" and calculate the same stat;
Expand Down

0 comments on commit 41f1977

Please sign in to comment.