From d47ad64166e8fdeabbf0085475d39accf391322a Mon Sep 17 00:00:00 2001 From: cellgeni Date: Thu, 19 Dec 2024 16:07:41 +0000 Subject: [PATCH 1/4] Made a separate script to parse SRA matadata --- scripts/collect_metadata.sh | 45 ++++++++++++++++++++++++------------ scripts/curl_ena_metadata.sh | 25 +++----------------- scripts/curl_sra_metadata.sh | 26 +++++++++++++++++++++ 3 files changed, 59 insertions(+), 37 deletions(-) create mode 100755 scripts/curl_sra_metadata.sh diff --git a/scripts/collect_metadata.sh b/scripts/collect_metadata.sh index 24f1c38..10b02c6 100755 --- a/scripts/collect_metadata.sh +++ b/scripts/collect_metadata.sh @@ -50,24 +50,30 @@ then >&2 echo "WARNING: No secondary run/experiment (SRP/SRX) IDs in the family.soft file; this often happens in datasets that are restricted access (dbGap, etc)." fi - ## curl info about each run (SRR/ERR/DRR) from ENA API; v2 pulls GSM data etc + ## curl info about each run (SRR/ERR/DRR) from ENA and SRA APIs; v2 pulls GSM data etc RET=1 + RET_SRA=1 TRIES=1 - until (( $RET == 0 )) + until (( $RET == 0)) do - ./curl_ena_metadata.sh $SERIES + ./curl_ena_metadata.sh $SERIES.project.list > $SERIES.ena.tsv RET=$? + if [[ $RET_SRA -eq 1 ]] + then + ./curl_sra_metadata.sh $SERIES + RET_SRA=$? + fi + ## this either pulls sub-series data (and replaces $SERIES.project.list with useful PRJNA* IDs), or just quits after 5 tries TRIES=$((TRIES+1)) if (( $TRIES > 5 )) then >&2 echo "WARNING: No ENA records can be retrieved for GEO projects listed in $SERIES.project.list!" if [[ $SUBGSE == "" ]] - then + then >&2 echo "ERROR: No GSE subseries were listed in ${SERIES}_family.soft - no alternative PRJNA* to be found, and no ENA entries can be retrieved!" - exit 1 - else + else >&2 echo "WARNING: replacing $SERIES.project.list with sub-series projects.." rm $SERIES.project.list for i in $SUBGSE @@ -84,22 +90,31 @@ then TRIES=1 until (( $RET == 0 )) do - ./curl_ena_metadata.sh $SERIES + ./curl_ena_metadata.sh $SERIES.project.list > $SERIES.ena.tsv RET=$? TRIES=$((TRIES+1)) if (( $TRIES > 5 )) then - >&2 echo "ERROR: Still no ENA records can be retrieved for the GEO SUBSERIES projects listed in $SERIES.project.list, I quit!" - exit 1 + >&2 echo "ERROR: Still no ENA records can be retrieved for the GEO SUBSERIES projects listed in $SERIES.project.list!" fi done fi + + ## ena metadata loading failed, now check if sra was loaded + if [[ $RET_SRA -eq 1 ]] + then + >&2 echo "ERROR: No SRA records can be retrieved for the $SERIES, I quit!" + exit 1 + else + META="$SERIES.sra.tsv" + RET=0 + fi fi sleep 1 done - # Checking which metadata file was loaded - if [[ -s $SERIES.sra.tsv ]] + # checking if ENA metadata file is empty + if [[ ! -s $SERIES.ena.tsv ]] then META="$SERIES.sra.tsv" fi @@ -144,7 +159,7 @@ then until (( $RET == 0 )) do ## for ArrayExpress, we query by sample ID because sdrf doesn't list the BioProject ID - ./curl_ena_metadata.sh $SERIES + ./curl_ena_metadata.sh $SERIES.sample.list > $SERIES.ena.tsv RET=$? TRIES=$((TRIES+1)) if (( $TRIES > 5 )) @@ -182,7 +197,7 @@ then TRIES=1 until (( $RET == 0 )) do - ./curl_ena_metadata.sh $SERIES + ./curl_ena_metadata.sh $SERIES.project.list > $SERIES.ena.tsv RET=$? TRIES=$((TRIES+1)) if (( $TRIES > 5 )) @@ -226,8 +241,8 @@ then >&2 cat $SUBSET grep -f $SUBSET $SERIES.sample.list > $SERIES.sample.list.tmp mv $SERIES.sample.list.tmp $SERIES.sample.list - grep -f $SUBSET $SERIES.ena.tsv > $SERIES.ena.tsv.tmp - mv $SERIES.ena.tsv.tmp $SERIES.ena.tsv + grep -f $SUBSET $META > $META.tmp + mv $META.tmp $META grep -f $SUBSET $SERIES.accessions.tsv > $SERIES.accessions.tsv.tmp mv $SERIES.accessions.tsv.tmp $SERIES.accessions.tsv diff --git a/scripts/curl_ena_metadata.sh b/scripts/curl_ena_metadata.sh index 175b2e1..a2530aa 100755 --- a/scripts/curl_ena_metadata.sh +++ b/scripts/curl_ena_metadata.sh @@ -1,7 +1,6 @@ #!/bin/bash -e -SERIES=$1 -RUNS="$SERIES.project.list" +RUNS=$1 if [[ ! -f $RUNS ]] then @@ -11,28 +10,10 @@ fi KK=`cat $RUNS` -: > $SERIES.ena.tsv - for i in $KK do >&2 echo "Processing run ID $i.." - curl "https://www.ebi.ac.uk/ena/portal/api/filereport?accession=$i&result=read_run&fields=study_accession,secondary_study_accession,sample_accession,secondary_sample_accession,experiment_accession,experiment_alias,run_accession,run_alias,tax_id,scientific_name,fastq_ftp,submitted_ftp,sra_ftp&format=tsv&download=true&limit=0" 2> /dev/null | sed '/study_accession/d' >> $SERIES.ena.tsv + curl "https://www.ebi.ac.uk/ena/portal/api/filereport?accession=$i&result=read_run&fields=study_accession,secondary_study_accession,sample_accession,secondary_sample_accession,experiment_accession,experiment_alias,run_accession,run_alias,tax_id,scientific_name,fastq_ftp,submitted_ftp,sra_ftp&format=tsv&download=true&limit=0" 2> /dev/null | grep -v study_accession done - -if [[ ! -s $SERIES.ena.tsv && $SERIES == GSE* ]] -then - >&2 echo "WARNING: Was not able to load metadata from ENA. Loading it from SRA..." - rm $SERIES.ena.tsv - : > $SERIES.sra.tsv - for i in $KK - do - wget --quiet --output-document="$i.xml" "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=sra&term=${i}&usehistory=y" - WebEnv=$(grep -oP '\K[^<]+' $i.xml) - QueryKey=$(grep -oP '\K[^<]+' $i.xml) - rm -f $i.xml - curl "https://trace.ncbi.nlm.nih.gov/Traces/sra-db-be/sra-db-be.cgi?rettype=runinfo&WebEnv=${WebEnv}&query_key=${QueryKey}" 2> /dev/null | sed '/BioProject/d' | sed 's/,/\t/g' > $SERIES.sra.tsv - done -fi - ->&2 echo "CURL METADATA: ALL DONE!" +>&2 echo "CURL ENA METADATA: ALL DONE!" diff --git a/scripts/curl_sra_metadata.sh b/scripts/curl_sra_metadata.sh new file mode 100755 index 0000000..97d9876 --- /dev/null +++ b/scripts/curl_sra_metadata.sh @@ -0,0 +1,26 @@ +#!/bin/bash -e + +SERIES=$1 +RUNS="$SERIES.project.list" + +if [[ ! -f $RUNS ]] +then + >&2 echo "ERROR: file $RUNS not found!" + exit 1 +fi + +KK=`cat $RUNS` + +for i in $KK +do + >&2 echo "Processing run ID $i.." + wget --quiet --output-document="$i.xml" "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=sra&term=${i}&usehistory=y" + WebEnv=$(grep -oP '\K[^<]+' $i.xml) + QueryKey=$(grep -oP '\K[^<]+' $i.xml) + rm -f $i.xml + curl "https://trace.ncbi.nlm.nih.gov/Traces/sra-db-be/sra-db-be.cgi?rettype=runinfo&WebEnv=${WebEnv}&query_key=${QueryKey}" 2> /dev/null | sed '/BioProject/d' | sed 's/,/\t/g' > $i.sra.tsv +done + +cat *.sra.tsv > $SERIES.sra.tsv + +>&2 echo "CURL SRA METADATA: ALL DONE!" From ec7566edd5885d666b5844fd299044d0202b8c74 Mon Sep 17 00:00:00 2001 From: cellgeni Date: Thu, 19 Dec 2024 16:08:31 +0000 Subject: [PATCH 2/4] Now metadata parsing script are looking for BAM files through SRA's API --- scripts/parse_ena_metadata.sh | 16 ++++++++++++++-- scripts/parse_sra_metadata.sh | 17 +++++++++++++++-- 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/scripts/parse_ena_metadata.sh b/scripts/parse_ena_metadata.sh index eedde6c..ffe60eb 100755 --- a/scripts/parse_ena_metadata.sh +++ b/scripts/parse_ena_metadata.sh @@ -21,8 +21,14 @@ do ENAGZ=`grep -w $i $SERIES.ena.tsv | cut -f11 | grep "_1\.fastq.gz" | grep "_2\.fastq.gz"` ## ENA formatting is strict ORIFQ=`grep -w $i $SERIES.ena.tsv | cut -f12 | grep "f.*q"` ## ppl name files *all kinds of random shiz*, really ORIBAM=`grep -w $i $SERIES.ena.tsv | cut -f12 | tr ';' '\n' | grep -v "\.bai" | grep "\.bam"` ## don't need the BAM index which is often there - SRA=`grep -w $i $SERIES.ena.tsv | cut -f13` - + SRA=`grep -w $i $SERIES.ena.tsv | cut -f13` + SRABAM=`curl -s "https://locate.ncbi.nlm.nih.gov/sdl/2/retrieve?acc=$i&accept-alternate-locations=yes" | jq -r ' + .result[].files[] | + select(.name | contains("bam")) | + .locations[] | + select((.rehydrationRequired // false) == false and (.payRequired // false) == false) | + .link + '` if [[ $AEGZ != "" ]] then @@ -48,6 +54,12 @@ do LOC=$ORIBAM echo $ORIBAM >> $SERIES.urls.list >&2 echo "Sample $i is available via ENA as an original submitter's BAM file: $LOC" + elif [[ $SRABAM != "" ]] + then + TYPE="BAM" + LOC=$SRABAM + echo $SRABAM >> $SERIES.urls.list + >&2 echo "Sample $i is available via SRA as an original submitter's BAM file: $LOC" elif [[ $SRA != "" ]] then TYPE="SRA" diff --git a/scripts/parse_sra_metadata.sh b/scripts/parse_sra_metadata.sh index f1bf334..2f1d9dc 100755 --- a/scripts/parse_sra_metadata.sh +++ b/scripts/parse_sra_metadata.sh @@ -15,8 +15,21 @@ do SPECIES=`grep -w $i $SERIES.sra.tsv | cut -f29` SRA=`grep -w $i $SERIES.sra.tsv | cut -f10` - - if [[ $SRA != "" ]] + SRABAM=`curl -s "https://locate.ncbi.nlm.nih.gov/sdl/2/retrieve?acc=$i&accept-alternate-locations=yes" | jq -r ' + .result[].files[] | + select(.name | contains("bam")) | + .locations[] | + select((.rehydrationRequired // false) == false and (.payRequired // false) == false) | + .link + '` + + if [[ $SRABAM != "" ]] + then + TYPE="BAM" + LOC=$SRABAM + echo $SRABAM >> $SERIES.urls.list + >&2 echo "Sample $i is available via SRA as an original submitter's BAM file: $LOC" + elif [[ $SRA != "" ]] then LOC=$SRA echo $SRA >> $SERIES.urls.list From 486ee1f3e000467e89f56922304cb7977379ee2a Mon Sep 17 00:00:00 2001 From: cellgeni Date: Mon, 13 Jan 2025 15:30:43 +0000 Subject: [PATCH 3/4] Added memory sharing to starsolo runs --- scripts/starsolo_10x_auto.sh | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/scripts/starsolo_10x_auto.sh b/scripts/starsolo_10x_auto.sh index 0259c96..dcec9dd 100755 --- a/scripts/starsolo_10x_auto.sh +++ b/scripts/starsolo_10x_auto.sh @@ -204,6 +204,10 @@ then >&2 echo "WARNING: Read 1 length ($R1LEN) is more than the sum of appropriate barcode and UMI ($BCUMI)." fi +## Load genome intho shared memory for StarSolo use + +$CMD STAR --genomeDir $REF --genomeLoad LoadAndExit + ## it's hard to come up with a universal rule to correctly infer strand-specificity of the experiment. ## this is the best I could come up with: 1) check if fraction of test reads (200k random ones) maps to GeneFull forward strand ## with higher than 50% probability; 2) if not, run the same quantification with "--soloStand Reverse" and calculate the same stat; @@ -213,14 +217,14 @@ STRAND=Forward $CMD STAR --runThreadN $CPUS --genomeDir $REF --readFilesIn test.R2.fastq test.R1.fastq --runDirPerm All_RWX --outSAMtype None \ --soloType CB_UMI_Simple --soloCBwhitelist $BC --soloBarcodeReadLength 0 --soloCBlen $CBLEN --soloUMIstart $((CBLEN+1)) \ - --soloUMIlen $UMILEN --soloStrand Forward \ + --soloUMIlen $UMILEN --soloStrand Forward --genomeLoad LoadAndKeep \ --soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts --soloUMIfiltering MultiGeneUMI_CR \ --soloCellFilter EmptyDrops_CR --clipAdapterType CellRanger4 --outFilterScoreMin 30 \ --soloFeatures Gene GeneFull --soloOutFileNames test_forward/ features.tsv barcodes.tsv matrix.mtx &> /dev/null $CMD STAR --runThreadN $CPUS --genomeDir $REF --readFilesIn test.R2.fastq test.R1.fastq --runDirPerm All_RWX --outSAMtype None \ --soloType CB_UMI_Simple --soloCBwhitelist $BC --soloBarcodeReadLength 0 --soloCBlen $CBLEN --soloUMIstart $((CBLEN+1)) \ - --soloUMIlen $UMILEN --soloStrand Reverse \ + --soloUMIlen $UMILEN --soloStrand Reverse --genomeLoad LoadAndKeep \ --soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts --soloUMIfiltering MultiGeneUMI_CR \ --soloCellFilter EmptyDrops_CR --clipAdapterType CellRanger4 --outFilterScoreMin 30 \ --soloFeatures Gene GeneFull --soloOutFileNames test_reverse/ features.tsv barcodes.tsv matrix.mtx &> /dev/null @@ -266,16 +270,20 @@ then $CMD STAR --runThreadN $CPUS --genomeDir $REF --readFilesIn $R1 $R2 --runDirPerm All_RWX $GZIP $BAM --soloBarcodeMate 1 --clip5pNbases 39 0 \ --soloType CB_UMI_Simple --soloCBwhitelist $BC --soloCBstart 1 --soloCBlen $CBLEN --soloUMIstart $((CBLEN+1)) --soloUMIlen $UMILEN --soloStrand Forward \ --soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts --soloUMIfiltering MultiGeneUMI_CR \ - --soloCellFilter EmptyDrops_CR --outFilterScoreMin 30 \ + --soloCellFilter EmptyDrops_CR --outFilterScoreMin 30 --genomeLoad LoadAndKeep \ --soloFeatures Gene GeneFull Velocyto --soloOutFileNames output/ features.tsv barcodes.tsv matrix.mtx --soloMultiMappers EM --outReadsUnmapped Fastx else $CMD STAR --runThreadN $CPUS --genomeDir $REF --readFilesIn $R2 $R1 --runDirPerm All_RWX $GZIP $BAM \ --soloType CB_UMI_Simple --soloCBwhitelist $BC --soloBarcodeReadLength 0 --soloCBlen $CBLEN --soloUMIstart $((CBLEN+1)) --soloUMIlen $UMILEN --soloStrand $STRAND \ --soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts --soloUMIfiltering MultiGeneUMI_CR \ - --soloCellFilter EmptyDrops_CR --clipAdapterType CellRanger4 --outFilterScoreMin 30 \ + --soloCellFilter EmptyDrops_CR --clipAdapterType CellRanger4 --outFilterScoreMin 30 --genomeLoad LoadAndKeep \ --soloFeatures Gene GeneFull Velocyto --soloOutFileNames output/ features.tsv barcodes.tsv matrix.mtx --soloMultiMappers EM --outReadsUnmapped Fastx fi +## remove genome from shared memory + +$CMD STAR --genomeDir $REF --genomeLoad Remove + ## index the BAM file if [[ -s Aligned.sortedByCoord.out.bam ]] then From 51ce855bf2f02e4f08d57edf9242c26ca9f6ef21 Mon Sep 17 00:00:00 2001 From: cellgeni Date: Tue, 14 Jan 2025 14:12:23 +0000 Subject: [PATCH 4/4] Removed explicit genome Load and Removal --- scripts/starsolo_10x_auto.sh | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/scripts/starsolo_10x_auto.sh b/scripts/starsolo_10x_auto.sh index dcec9dd..1146fde 100755 --- a/scripts/starsolo_10x_auto.sh +++ b/scripts/starsolo_10x_auto.sh @@ -204,9 +204,6 @@ then >&2 echo "WARNING: Read 1 length ($R1LEN) is more than the sum of appropriate barcode and UMI ($BCUMI)." fi -## Load genome intho shared memory for StarSolo use - -$CMD STAR --genomeDir $REF --genomeLoad LoadAndExit ## it's hard to come up with a universal rule to correctly infer strand-specificity of the experiment. ## this is the best I could come up with: 1) check if fraction of test reads (200k random ones) maps to GeneFull forward strand @@ -270,20 +267,16 @@ then $CMD STAR --runThreadN $CPUS --genomeDir $REF --readFilesIn $R1 $R2 --runDirPerm All_RWX $GZIP $BAM --soloBarcodeMate 1 --clip5pNbases 39 0 \ --soloType CB_UMI_Simple --soloCBwhitelist $BC --soloCBstart 1 --soloCBlen $CBLEN --soloUMIstart $((CBLEN+1)) --soloUMIlen $UMILEN --soloStrand Forward \ --soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts --soloUMIfiltering MultiGeneUMI_CR \ - --soloCellFilter EmptyDrops_CR --outFilterScoreMin 30 --genomeLoad LoadAndKeep \ + --soloCellFilter EmptyDrops_CR --outFilterScoreMin 30 --genomeLoad LoadAndRemove \ --soloFeatures Gene GeneFull Velocyto --soloOutFileNames output/ features.tsv barcodes.tsv matrix.mtx --soloMultiMappers EM --outReadsUnmapped Fastx else $CMD STAR --runThreadN $CPUS --genomeDir $REF --readFilesIn $R2 $R1 --runDirPerm All_RWX $GZIP $BAM \ --soloType CB_UMI_Simple --soloCBwhitelist $BC --soloBarcodeReadLength 0 --soloCBlen $CBLEN --soloUMIstart $((CBLEN+1)) --soloUMIlen $UMILEN --soloStrand $STRAND \ --soloUMIdedup 1MM_CR --soloCBmatchWLtype 1MM_multi_Nbase_pseudocounts --soloUMIfiltering MultiGeneUMI_CR \ - --soloCellFilter EmptyDrops_CR --clipAdapterType CellRanger4 --outFilterScoreMin 30 --genomeLoad LoadAndKeep \ + --soloCellFilter EmptyDrops_CR --clipAdapterType CellRanger4 --outFilterScoreMin 30 --genomeLoad LoadAndRemove \ --soloFeatures Gene GeneFull Velocyto --soloOutFileNames output/ features.tsv barcodes.tsv matrix.mtx --soloMultiMappers EM --outReadsUnmapped Fastx fi -## remove genome from shared memory - -$CMD STAR --genomeDir $REF --genomeLoad Remove - ## index the BAM file if [[ -s Aligned.sortedByCoord.out.bam ]] then