Skip to content

Commit

Permalink
Update documentation
Browse files Browse the repository at this point in the history
  • Loading branch information
arendsee committed Jun 1, 2016
1 parent 84446d4 commit 05ebbbe
Showing 1 changed file with 81 additions and 50 deletions.
131 changes: 81 additions & 50 deletions util/prepare-data.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
#!/bin/bash
set -u


# ============================================================================
# Parse Arguments
# ============================================================================
Expand Down Expand Up @@ -39,20 +38,57 @@ die-instructively(){
exit 1
}

query= target= outdir= input=
while getopts "ha:b:d:i:" opt; do
case $opt in
h)
print_help
exit 0 ;;
a)
query="$OPTARG" ;;
b)
target="$OPTARG" ;;
i)
input="$OPTARG"
if [[ ! -f $input ]]
then
die "Cannot read input file '$input'"
fi
;;
d)
outdir="$OPTARG"
[[ -d $outdir ]] || mkdir $outdir || die "Cannot access directory '$outdir'"
;;
esac
done
# Check for existance of required arguments
for arg in query target outdir input; do
if [[ -z ${!arg} ]]; then
die-instructively 'Missing required argument'
fi
done



# ============================================================================
# Parse utilities
# ============================================================================

# Add two columns to the input table
# 1. Map of sequence names to 0-based indices (alphabetically)
# 2. Map of intervals on a sequence to 0-based indices (by start and stop)
# Arguments:
# ARG1: sequence id (e.g. Chr1 or scaffold_1)
# ARG2: start position relative to sequence id $1
# ARG3: stop position relative to sequence id $1
append_counts() {
sort -k${1},${1} -k${2},${2}n -k${3},${3}n | awk -v NC=$1 '
BEGIN { blkid=0; seqid=0 }
NR == 1 { s = $NC }
s != $NC {
NR == 1 { seqname = $NC }
seqname != $NC {
seqid++
blkid = 0
s = $NC
seqname = $NC
}
{
print $0, seqid, blkid
Expand All @@ -61,6 +97,28 @@ sort -k${1},${1} -k${2},${2}n -k${3},${3}n | awk -v NC=$1 '
'
}


# Map sequence ids to names
#
# Lines beginning in '>' indicate the species name and contig count. In the
# example below, the name species is Arabidopsis_thaliana and it has 7
# chromosomes.
#
# The lines beginning with '$' contain three columns: 1) the contig id, 2) the
# number of intervals on the contig, and 3) the name of the contig.
#
# An '@' sign terminates the species entry
#
# Example:
# > Arabidopsis_thaliana 7
# $ 0 283 chloroplast
# $ 1 60721 Chr1
# $ 2 37435 Chr2
# $ 3 44834 Chr3
# $ 4 33059 Chr4
# $ 5 53222 Chr5
# $ 6 8 mitochondrion
# @
write_side() {
awk -v seqname=$1 -v namecol=$2 -v idcol=$3 '
BEGIN{ OFS="\t" }
Expand All @@ -80,60 +138,33 @@ write_side() {
}

parse() {
input=$1
outbase=$2
query=$3
target=$4
outdb=${outbase}.txt
outtmp=${outbase}_temp.txt
append_counts 1 2 3 < $input |
append_counts 4 5 6 |
input=$1 # synteny map
outbase=$2 # output file basename
query=$3 # string: query name (e.g. Arabidopsis_thaliana)
target=$4 # string: target name
outdb=${outbase}.txt # ultimate output
outtmp=${outbase}_temp.txt # temporary output
# outtmp columns (1-8 are from the synteny map, see USAGE):
# 9 qseqid - index for query sequence
# 10 qblkid - index for interval in a query sequence array
# 11 tseqid - index for target sequence
# 12 tblkid - index for interval in a target sequence array
# 13 linkid - index for the link between a query and target interval
cat $input |
# Append qseqid and qblkid
append_counts 1 2 3 |
# Append tseqid and tblkid
append_counts 4 5 6 |
# Append linkid
awk '{print $0, linkid++}' > $outtmp
> $outdb
write_side $query 1 9 < $outtmp >> $outdb
write_side $query 1 9 < $outtmp > $outdb
write_side $target 4 11 < $outtmp >> $outdb
awk '{print "$", $9, $10, $2, $3, $11, $12, $5, $6, $13}' $outtmp |
sort -k3,3n -k4,4n >> $outdb
rm $outtmp
}


# ============================================================================
# Parse commandline
# ============================================================================

query= target= outdir= input=
while getopts "ha:b:d:i:" opt; do
case $opt in
h)
print_help
exit 0 ;;
a)
query="$OPTARG" ;;
b)
target="$OPTARG" ;;
i)
input="$OPTARG"
if [[ ! -f $input ]]
then
die "Cannot read input file '$input'"
fi
;;
d)
outdir="$OPTARG"
[[ -d $outdir ]] || mkdir $outdir || die "Cannot access directory '$outdir'"
;;
esac
done
# Check for existance of required arguments
for arg in query target outdir input; do
if [[ -z ${!arg} ]]; then
die-instructively 'Missing required argument'
fi
done



# ============================================================================
# Now run the thing
# ============================================================================
Expand Down

0 comments on commit 05ebbbe

Please sign in to comment.