Update documentation

arendsee · Jun 1, 2016 · 05ebbbe · 05ebbbe
1 parent 84446d4
commit 05ebbbe
Showing 1 changed file with 81 additions and 50 deletions.
diff --git a/util/prepare-data.sh b/util/prepare-data.sh
@@ -1,7 +1,6 @@
 #!/bin/bash
 set -u
 
-
 # ============================================================================
 # Parse Arguments
 # ============================================================================
@@ -39,20 +38,57 @@ die-instructively(){
     exit 1
 }
 
+query= target= outdir= input=
+while getopts "ha:b:d:i:" opt; do
+    case $opt in
+        h)
+            print_help
+            exit 0 ;;
+        a)
+            query="$OPTARG" ;;
+        b)
+            target="$OPTARG" ;;
+        i)
+            input="$OPTARG" 
+            if [[ ! -f $input ]]
+            then
+                die "Cannot read input file '$input'"
+            fi
+            ;;
+        d)
+            outdir="$OPTARG"
+            [[ -d $outdir ]] || mkdir $outdir || die "Cannot access directory '$outdir'"
+            ;;
+    esac
+done
+# Check for existance of required arguments
+for arg in query target outdir input; do
+    if [[ -z ${!arg} ]]; then
+        die-instructively 'Missing required argument'
+    fi
+done
+
 
 
 # ============================================================================
 # Parse utilities
 # ============================================================================
 
+# Add two columns to the input table
+#  1. Map of sequence names to 0-based indices (alphabetically)
+#  2. Map of intervals on a sequence to 0-based indices (by start and stop) 
+# Arguments:
+# ARG1: sequence id (e.g. Chr1 or scaffold_1) 
+# ARG2: start position relative to sequence id $1
+# ARG3: stop position relative to sequence id $1
 append_counts() {
 sort -k${1},${1} -k${2},${2}n -k${3},${3}n | awk -v NC=$1 '
         BEGIN { blkid=0; seqid=0 }
-        NR == 1 { s = $NC }
-        s != $NC {
+        NR == 1 { seqname = $NC }
+        seqname != $NC {
             seqid++
             blkid = 0
-            s = $NC
+            seqname = $NC
         }
         { 
             print $0, seqid, blkid 
@@ -61,6 +97,28 @@ sort -k${1},${1} -k${2},${2}n -k${3},${3}n | awk -v NC=$1 '
     '
 }
 
+
+# Map sequence ids to names
+#
+# Lines beginning in '>' indicate the species name and contig count. In the
+# example below, the name species is Arabidopsis_thaliana and it has 7
+# chromosomes.
+#
+# The lines beginning with '$' contain three columns: 1) the contig id, 2) the
+# number of intervals on the contig, and 3) the name of the contig.
+#
+# An '@' sign terminates the species entry
+#
+# Example:
+# >	Arabidopsis_thaliana	7
+# $	0	283	chloroplast
+# $	1	60721	Chr1
+# $	2	37435	Chr2
+# $	3	44834	Chr3
+# $	4	33059	Chr4
+# $	5	53222	Chr5
+# $	6	8	mitochondrion
+# @
 write_side() {
     awk -v seqname=$1 -v namecol=$2 -v idcol=$3 '
         BEGIN{ OFS="\t" }
@@ -80,60 +138,33 @@ write_side() {
 }
 
 parse() {
-    input=$1
-    outbase=$2
-    query=$3
-    target=$4
-    outdb=${outbase}.txt
-    outtmp=${outbase}_temp.txt
-    append_counts 1 2 3 < $input |
-        append_counts 4 5 6 |
+    input=$1   # synteny map
+    outbase=$2 # output file basename
+    query=$3   # string: query name (e.g. Arabidopsis_thaliana)
+    target=$4  # string: target name
+    outdb=${outbase}.txt       # ultimate output
+    outtmp=${outbase}_temp.txt # temporary output
+    # outtmp columns (1-8 are from the synteny map, see USAGE):
+    # 9  qseqid - index for query sequence
+    # 10 qblkid - index for interval in a query sequence array
+    # 11 tseqid - index for target sequence
+    # 12 tblkid - index for interval in a target sequence array
+    # 13 linkid - index for the link between a query and target interval
+    cat $input |
+        # Append qseqid and qblkid
+        append_counts 1 2 3 | 
+        # Append tseqid and tblkid
+        append_counts 4 5 6 | 
+        # Append linkid
         awk '{print $0, linkid++}' > $outtmp
-    > $outdb
-    write_side $query  1 9  < $outtmp >> $outdb
+    write_side $query  1 9  < $outtmp >  $outdb
     write_side $target 4 11 < $outtmp >> $outdb
     awk '{print "$", $9, $10, $2, $3, $11, $12, $5, $6, $13}' $outtmp |
         sort -k3,3n -k4,4n >> $outdb
     rm $outtmp
 }
 
 
-# ============================================================================
-# Parse commandline 
-# ============================================================================
-
-query= target= outdir= input=
-while getopts "ha:b:d:i:" opt; do
-    case $opt in
-        h)
-            print_help
-            exit 0 ;;
-        a)
-            query="$OPTARG" ;;
-        b)
-            target="$OPTARG" ;;
-        i)
-            input="$OPTARG" 
-            if [[ ! -f $input ]]
-            then
-                die "Cannot read input file '$input'"
-            fi
-            ;;
-        d)
-            outdir="$OPTARG"
-            [[ -d $outdir ]] || mkdir $outdir || die "Cannot access directory '$outdir'"
-            ;;
-    esac
-done
-# Check for existance of required arguments
-for arg in query target outdir input; do
-    if [[ -z ${!arg} ]]; then
-        die-instructively 'Missing required argument'
-    fi
-done
-
-
-
 # ============================================================================
 # Now run the thing
 # ============================================================================