diff --git a/util/prepare-data.sh b/util/prepare-data.sh index 5771470..c15a8eb 100755 --- a/util/prepare-data.sh +++ b/util/prepare-data.sh @@ -1,7 +1,6 @@ #!/bin/bash set -u - # ============================================================================ # Parse Arguments # ============================================================================ @@ -39,20 +38,57 @@ die-instructively(){ exit 1 } +query= target= outdir= input= +while getopts "ha:b:d:i:" opt; do + case $opt in + h) + print_help + exit 0 ;; + a) + query="$OPTARG" ;; + b) + target="$OPTARG" ;; + i) + input="$OPTARG" + if [[ ! -f $input ]] + then + die "Cannot read input file '$input'" + fi + ;; + d) + outdir="$OPTARG" + [[ -d $outdir ]] || mkdir $outdir || die "Cannot access directory '$outdir'" + ;; + esac +done +# Check for existance of required arguments +for arg in query target outdir input; do + if [[ -z ${!arg} ]]; then + die-instructively 'Missing required argument' + fi +done + # ============================================================================ # Parse utilities # ============================================================================ +# Add two columns to the input table +# 1. Map of sequence names to 0-based indices (alphabetically) +# 2. Map of intervals on a sequence to 0-based indices (by start and stop) +# Arguments: +# ARG1: sequence id (e.g. Chr1 or scaffold_1) +# ARG2: start position relative to sequence id $1 +# ARG3: stop position relative to sequence id $1 append_counts() { sort -k${1},${1} -k${2},${2}n -k${3},${3}n | awk -v NC=$1 ' BEGIN { blkid=0; seqid=0 } - NR == 1 { s = $NC } - s != $NC { + NR == 1 { seqname = $NC } + seqname != $NC { seqid++ blkid = 0 - s = $NC + seqname = $NC } { print $0, seqid, blkid @@ -61,6 +97,28 @@ sort -k${1},${1} -k${2},${2}n -k${3},${3}n | awk -v NC=$1 ' ' } + +# Map sequence ids to names +# +# Lines beginning in '>' indicate the species name and contig count. In the +# example below, the name species is Arabidopsis_thaliana and it has 7 +# chromosomes. +# +# The lines beginning with '$' contain three columns: 1) the contig id, 2) the +# number of intervals on the contig, and 3) the name of the contig. +# +# An '@' sign terminates the species entry +# +# Example: +# > Arabidopsis_thaliana 7 +# $ 0 283 chloroplast +# $ 1 60721 Chr1 +# $ 2 37435 Chr2 +# $ 3 44834 Chr3 +# $ 4 33059 Chr4 +# $ 5 53222 Chr5 +# $ 6 8 mitochondrion +# @ write_side() { awk -v seqname=$1 -v namecol=$2 -v idcol=$3 ' BEGIN{ OFS="\t" } @@ -80,17 +138,26 @@ write_side() { } parse() { - input=$1 - outbase=$2 - query=$3 - target=$4 - outdb=${outbase}.txt - outtmp=${outbase}_temp.txt - append_counts 1 2 3 < $input | - append_counts 4 5 6 | + input=$1 # synteny map + outbase=$2 # output file basename + query=$3 # string: query name (e.g. Arabidopsis_thaliana) + target=$4 # string: target name + outdb=${outbase}.txt # ultimate output + outtmp=${outbase}_temp.txt # temporary output + # outtmp columns (1-8 are from the synteny map, see USAGE): + # 9 qseqid - index for query sequence + # 10 qblkid - index for interval in a query sequence array + # 11 tseqid - index for target sequence + # 12 tblkid - index for interval in a target sequence array + # 13 linkid - index for the link between a query and target interval + cat $input | + # Append qseqid and qblkid + append_counts 1 2 3 | + # Append tseqid and tblkid + append_counts 4 5 6 | + # Append linkid awk '{print $0, linkid++}' > $outtmp - > $outdb - write_side $query 1 9 < $outtmp >> $outdb + write_side $query 1 9 < $outtmp > $outdb write_side $target 4 11 < $outtmp >> $outdb awk '{print "$", $9, $10, $2, $3, $11, $12, $5, $6, $13}' $outtmp | sort -k3,3n -k4,4n >> $outdb @@ -98,42 +165,6 @@ parse() { } -# ============================================================================ -# Parse commandline -# ============================================================================ - -query= target= outdir= input= -while getopts "ha:b:d:i:" opt; do - case $opt in - h) - print_help - exit 0 ;; - a) - query="$OPTARG" ;; - b) - target="$OPTARG" ;; - i) - input="$OPTARG" - if [[ ! -f $input ]] - then - die "Cannot read input file '$input'" - fi - ;; - d) - outdir="$OPTARG" - [[ -d $outdir ]] || mkdir $outdir || die "Cannot access directory '$outdir'" - ;; - esac -done -# Check for existance of required arguments -for arg in query target outdir input; do - if [[ -z ${!arg} ]]; then - die-instructively 'Missing required argument' - fi -done - - - # ============================================================================ # Now run the thing # ============================================================================