From 6fae38249be11ea2f4fc98ddf05fe83ddaa7d45f Mon Sep 17 00:00:00 2001 From: Max Gao <90296632+lysokine@users.noreply.github.com> Date: Thu, 25 Sep 2025 11:59:51 -0400 Subject: [PATCH 1/2] Create fastp_extract_25n.sh Add fastp 25N extraction script and documentation for novaseq data --- sequencing-preprocessing/fastp_extract_25n.sh | 183 ++++++++++++++++++ 1 file changed, 183 insertions(+) create mode 100644 sequencing-preprocessing/fastp_extract_25n.sh diff --git a/sequencing-preprocessing/fastp_extract_25n.sh b/sequencing-preprocessing/fastp_extract_25n.sh new file mode 100644 index 0000000..d32b68e --- /dev/null +++ b/sequencing-preprocessing/fastp_extract_25n.sh @@ -0,0 +1,183 @@ +#!/bin/bash + +# 25N Variable Region Extraction Pipeline using fastp +# Usage: ./fastp_extract_25n.sh input_R1.fq.gz input_R2.fq.gz output_prefix + +set -euo pipefail + +INPUT_R1=$1 +INPUT_R2=$2 +OUTPUT_PREFIX=$3 + +echo "Starting 25N extraction pipeline using fastp..." +echo "Input R1: $INPUT_R1" +echo "Input R2: $INPUT_R2" +echo "Output prefix: $OUTPUT_PREFIX" + +# Step 1: Use fastp for automatic adapter detection and trimming +echo "Step 1: Running fastp for adapter detection and trimming..." + +fastp \ + --in1 "$INPUT_R1" \ + --in2 "$INPUT_R2" \ + --out1 "${OUTPUT_PREFIX}_R1_trimmed.fastq.gz" \ + --out2 "${OUTPUT_PREFIX}_R2_trimmed.fastq.gz" \ + --detect_adapter_for_pe \ + --correction \ + --cut_front \ + --cut_tail \ + --cut_window_size 4 \ + --cut_mean_quality 20 \ + --qualified_quality_phred 20 \ + --unqualified_percent_limit 40 \ + --length_required 15 \ + --length_limit 50 \ + --low_complexity_filter \ + --complexity_threshold 30 \ + --thread 16 \ + --json "${OUTPUT_PREFIX}_fastp.json" \ + --html "${OUTPUT_PREFIX}_fastp.html" \ + --report_title "${OUTPUT_PREFIX} 25N extraction" + +echo "Step 2: Analyzing fastp results..." + +# Check if fastp found our expected structure by looking at length distribution +echo "Length distribution after fastp trimming:" + +# Get length distribution from both R1 and R2 +{ + zcat "${OUTPUT_PREFIX}_R1_trimmed.fastq.gz" | awk 'NR%4==2 {print length($0)}' + zcat "${OUTPUT_PREFIX}_R2_trimmed.fastq.gz" | awk 'NR%4==2 {print length($0)}' +} | sort -n | uniq -c | sort -nr | head -10 + +# Step 3: Extract sequences from the file with more ~25bp reads +echo "Step 3: Determining which file contains the 25N variable regions..." + +r1_25bp=$(zcat "${OUTPUT_PREFIX}_R1_trimmed.fastq.gz" | awk 'NR%4==2 {if(length($0) >= 20 && length($0) <= 30) count++} END {print count+0}') +r2_25bp=$(zcat "${OUTPUT_PREFIX}_R2_trimmed.fastq.gz" | awk 'NR%4==2 {if(length($0) >= 20 && length($0) <= 30) count++} END {print count+0}') + +total_r1=$(zcat "${OUTPUT_PREFIX}_R1_trimmed.fastq.gz" | awk 'NR%4==2' | wc -l) +total_r2=$(zcat "${OUTPUT_PREFIX}_R2_trimmed.fastq.gz" | awk 'NR%4==2' | wc -l) + +r1_percent=$(awk -v a=$r1_25bp -v b=$total_r1 'BEGIN{printf "%.1f", 100*a/b}') +r2_percent=$(awk -v a=$r2_25bp -v b=$total_r2 'BEGIN{printf "%.1f", 100*a/b}') + +printf "R1: %'d/%'d reads in 20-30bp range (%.1f%%)\n" $r1_25bp $total_r1 $r1_percent +printf "R2: %'d/%'d reads in 20-30bp range (%.1f%%)\n" $r2_25bp $total_r2 $r2_percent + +# Decide which file(s) to use for counting +if (( $(echo "$r1_percent > $r2_percent + 5" | bc -l) )); then + echo "✅ Using R1 for sequence extraction (higher 25bp percentage: ${r1_percent}%)" + SOURCE_FILE="${OUTPUT_PREFIX}_R1_trimmed.fastq.gz" + SOURCE_NAME="R1" +elif (( $(echo "$r2_percent > $r1_percent + 5" | bc -l) )); then + echo "✅ Using R2 for sequence extraction (higher 25bp percentage: ${r2_percent}%)" + echo "⚠️ Note: R2 will be reverse-complemented to get biological sequence orientation" + SOURCE_FILE="${OUTPUT_PREFIX}_R2_trimmed.fastq.gz" + SOURCE_NAME="R2-RC" +else + echo "⚠️ Similar percentages in both files (R1:${r1_percent}%, R2:${r2_percent}%)" + echo "✅ Using both R1 and R2 for sequence extraction" + echo "📝 Note: R2 sequences will be reverse-complemented before merging to avoid duplicates" + SOURCE_FILE="both" + SOURCE_NAME="R1+R2-RC" +fi + +# Step 4: Count sequences +echo "Step 4: Extracting and counting sequences..." + +if [ "$SOURCE_FILE" = "both" ]; then + # Combine both files - R2 needs reverse complement! + echo "Converting R2 to reverse complement before merging..." + { + zcat "${OUTPUT_PREFIX}_R1_trimmed.fastq.gz" | awk 'NR%4==2' + zcat "${OUTPUT_PREFIX}_R2_trimmed.fastq.gz" | awk 'NR%4==2' | \ + rev | tr 'ATCG' 'TAGC' # Reverse complement R2 + } | LC_ALL=C sort | uniq -c | LC_ALL=C sort -nr > "${OUTPUT_PREFIX}_raw_counts.txt" +elif [[ "$SOURCE_FILE" == *"R2"* ]]; then + # R2 only - needs reverse complement + echo "Converting R2 sequences to reverse complement..." + zcat "$SOURCE_FILE" | awk 'NR%4==2' | rev | tr 'ATCG' 'TAGC' | \ + LC_ALL=C sort | uniq -c | LC_ALL=C sort -nr > "${OUTPUT_PREFIX}_raw_counts.txt" +else + # R1 only - use as-is + zcat "$SOURCE_FILE" | awk 'NR%4==2' | LC_ALL=C sort | uniq -c | LC_ALL=C sort -nr > "${OUTPUT_PREFIX}_raw_counts.txt" +fi + +# Step 5: Create output files +echo "Step 5: Creating output files..." + +# Filter sequences in reasonable length range (15-40bp to be inclusive) +awk '$1 >= 1 && length($2) >= 15 && length($2) <= 40' "${OUTPUT_PREFIX}_raw_counts.txt" > "${OUTPUT_PREFIX}_filtered_counts.txt" + +# Create FASTA +awk '{print ">"NR"_count_"$1"_len_"length($2)"\n"$2}' "${OUTPUT_PREFIX}_filtered_counts.txt" > "${OUTPUT_PREFIX}_sequences.fasta" + +# Create count table +total_reads=$(awk '{sum+=$1} END {print sum}' "${OUTPUT_PREFIX}_filtered_counts.txt") +awk -v total=$total_reads '{ + printf "%s\t%d\t%d\t%.6f\n", $2, $1, length($2), $1/total +}' "${OUTPUT_PREFIX}_filtered_counts.txt" > "${OUTPUT_PREFIX}_counts.txt" + +# Add header +sed -i '1i sequence\tcount\tlength\tfrequency' "${OUTPUT_PREFIX}_counts.txt" + +# Create summary +echo "Top 20 most abundant sequences:" > "${OUTPUT_PREFIX}_summary.txt" +echo "Rank\tSequence\tLength\tCount\tFrequency" >> "${OUTPUT_PREFIX}_summary.txt" +head -20 "${OUTPUT_PREFIX}_filtered_counts.txt" | awk -v total=$total_reads '{ + printf "%d\t%s\t%d\t%d\t%.4f\n", NR, $2, length($2), $1, $1/total +}' >> "${OUTPUT_PREFIX}_summary.txt" + +# Create length distribution +echo "length\tnum_unique_sequences\ttotal_reads" > "${OUTPUT_PREFIX}_length_distribution.txt" +awk '{ + len=length($2); + count[len]+=$1; + unique[len]++ +} END { + for(l in count) printf "%d\t%d\t%d\n", l, unique[l], count[l] +}' "${OUTPUT_PREFIX}_filtered_counts.txt" | sort -n >> "${OUTPUT_PREFIX}_length_distribution.txt" + +# Step 6: Generate statistics +echo "Step 6: Generating statistics..." + +unique_sequences=$(wc -l < "${OUTPUT_PREFIX}_filtered_counts.txt") +sequences_25bp=$(awk 'length($2) >= 23 && length($2) <= 27' "${OUTPUT_PREFIX}_filtered_counts.txt" | wc -l) +reads_25bp=$(awk 'length($2) >= 23 && length($2) <= 27 {sum+=$1} END {print sum+0}' "${OUTPUT_PREFIX}_filtered_counts.txt") + +echo "" +echo "=== EXTRACTION RESULTS ===" +printf "Source used for counting: %s\n" "$SOURCE_NAME" +printf "Total unique sequences (15-40bp): %'d\n" $unique_sequences +printf "Unique sequences in 23-27bp range: %'d\n" $sequences_25bp +printf "Total reads in 23-27bp range: %'d\n" $reads_25bp +printf "Total reads after filtering: %'d\n" $total_reads + +if [ $reads_25bp -gt 0 ]; then + echo "✅ Successfully extracted sequences in expected 25bp range!" +else + echo "⚠️ Few sequences in 23-27bp range. Check length distribution file." +fi + +# Cleanup intermediate files +rm -f "${OUTPUT_PREFIX}_R1_trimmed.fastq.gz" "${OUTPUT_PREFIX}_R2_trimmed.fastq.gz" +rm -f "${OUTPUT_PREFIX}_raw_counts.txt" "${OUTPUT_PREFIX}_filtered_counts.txt" + +echo "" +echo "Files created:" +echo " ${OUTPUT_PREFIX}_sequences.fasta - FASTA with abundance" +echo " ${OUTPUT_PREFIX}_counts.txt - Count table with length info" +echo " ${OUTPUT_PREFIX}_summary.txt - Top 20 sequences" +echo " ${OUTPUT_PREFIX}_length_distribution.txt - Length distribution" +echo " ${OUTPUT_PREFIX}_fastp.html - Quality control report" +echo " ${OUTPUT_PREFIX}_fastp.json - Detailed statistics" + +echo "" +echo "Next steps:" +echo "1. Check ${OUTPUT_PREFIX}_fastp.html for quality control" +echo "2. Review length distribution to confirm 25bp enrichment" +echo "3. Proceed with clustering analysis if results look good" + +echo "" +echo "Pipeline completed successfully!" From 00107a7eeea5d477d46b3a2c5b8fe38912b6b950 Mon Sep 17 00:00:00 2001 From: Max Gao <90296632+lysokine@users.noreply.github.com> Date: Thu, 25 Sep 2025 12:00:49 -0400 Subject: [PATCH 2/2] Create fastp_extract_25n_summary.md short description on the shell script --- .../fastp_extract_25n_summary.md | 69 +++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 sequencing-preprocessing/fastp_extract_25n_summary.md diff --git a/sequencing-preprocessing/fastp_extract_25n_summary.md b/sequencing-preprocessing/fastp_extract_25n_summary.md new file mode 100644 index 0000000..4cc9429 --- /dev/null +++ b/sequencing-preprocessing/fastp_extract_25n_summary.md @@ -0,0 +1,69 @@ +# 25N Variable Region Extraction Script + +This script (`fastp_extract_25n.sh`) trims and filters paired-end FASTQ data +using **fastp**, then extracts highly enriched ~25-nucleotide variable regions +and produces sequence count statistics. + +--- + +## Usage +```bash +./fastp_extract_25n.sh +``` + +| Argument | Description | +|----------|-------------| +| `input_R1.fastq.gz` | Read 1 FASTQ file (gzipped) | +| `input_R2.fastq.gz` | Read 2 FASTQ file (gzipped) | +| `output_prefix` | Prefix for all output files | + +Example: +```bash +./fastp_extract_25n.sh sample_R1.fq.gz sample_R2.fq.gz sample_out +``` + +--- + +## Requirements +Install or have available in your `$PATH`: +- **fastp** (adapter detection, quality trimming) +- Standard UNIX utilities: `bash`, `gzip`/`zcat`, `awk`, `sort`, `uniq`, `bc`, + `sed`, `rev`, `tr` + +You can install fastp via conda: +```bash +conda install -c bioconda fastp +``` + +--- + +## Key Output Files +All outputs share the `` you provide: + +| File | Description | +|------|-------------| +| `_fastp.html` | Interactive QC report from fastp | +| `_fastp.json` | Machine-readable QC summary | +| `_counts.txt` | Tab-delimited table of unique sequences: `sequence`, `count`, `length`, `frequency` | +| `_sequences.fasta` | FASTA of unique sequences (headers include counts and length) | +| `_summary.txt` | Top 20 most abundant sequences | +| `_length_distribution.txt` | Length vs. unique count and total read count | + +Temporary intermediate FASTQ and raw count files are removed automatically. + +--- + +## Pipeline Overview +1. **Adapter trimming & QC** – `fastp` automatically detects adapters, corrects + bases, trims low-quality regions, and filters by length and complexity. +2. **Length analysis** – Counts reads in the 20–30 bp range to identify which + mate (R1 or R2) carries the 25 N variable region. +3. **Sequence extraction** – Chooses the higher-signal mate (or both), reverse-complements R2 if used, and collapses identical sequences. +4. **Counting & reporting** – Generates count tables, FASTA, summary, and length-distribution statistics. + +--- + +## Notes +- Inspect the `*_fastp.html` report and `*_length_distribution.txt` + to confirm expected ~25 bp enrichment before downstream analysis. +- The script automatically cleans up large intermediate files to save space.