-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathcount_c_in_reads.pl
executable file
·71 lines (56 loc) · 2.06 KB
/
count_c_in_reads.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#!/usr/bin/env perl
# ___UNDOCUMENTED___
use strict;
if(@ARGV<5) {die("Usage: countC.pl <elandfile> <fastafile> <strand (0=forward, 1=reverse)> <type (exclude, include)> <regex to exclude or include>\n");}
my $elandfile=$ARGV[0];
my $fastafile=$ARGV[1];
my $reverse=$ARGV[2];
my $type=$ARGV[3];
my $regex=$ARGV[4];
print STDERR "Processing on the reverse strand...\n" if $reverse;
open(my $ELANDFILE, "<", "$elandfile") or die("Can't open $elandfile");
open(my $FASTAFILE, "<", "$fastafile") or die("Can't open $fastafile");
my ($total_C, $total_CG, $total_CHG, $total_CHH, $total_BP)=(0,0,0,0,0);
my $independent_C = 0;
while(my $elandline = <$ELANDFILE>) {
my $fastaline = <$FASTAFILE>;
while($fastaline =~ m/^>|^#.*$|^\s*$/) {
$fastaline = <$FASTAFILE>;
}
next if((split "\t", $elandline)[2] =~ m/NM/);
next if((split "\t", $elandline)[3]!~m/$regex/i && $type=~m/include/i);
next if((split "\t", $elandline)[3]=~m/$regex/i && $type=~m/exclude/i);
chomp $fastaline;
my @seq=split(//, $fastaline);
$total_BP = $total_BP + scalar(@seq);
for(my $i=0;$i<@seq;$i++) {
$independent_C++ if( ($seq[$i] =~ m/[Cc]/ && !$reverse) ||
($seq[$i] =~ m/[Gg]/ && $reverse) );
if((join("",@seq[$i..($i+1)])=~m/[Cc][Gg]/ && !$reverse) ||
(join("",@seq[($i-1)..$i])=~m/[Cc][Gg]/ && $reverse)) {
$total_C++;
$total_CG++;
}
elsif((join("",@seq[$i..($i+2)])=~m/[Cc][^Gg][Gg]/ && !$reverse) ||
(join("",@seq[($i-2)..$i])=~m/[Cc][^Cc][Gg]/ && $reverse)) {
$total_C++;
$total_CHG++;
}
elsif((join("",@seq[$i..($i+2)])=~m/[Cc][^Gg][^Gg]/ && !$reverse) ||
(join("",@seq[($i-2)..$i])=~m/[^Cc][^Cc][Gg]/ && $reverse)) {
$total_C++;
$total_CHH++;
}
}
}
close($ELANDFILE);
close($FASTAFILE);
print "Post alignment frequencies";
print "\nProcessed files: $ARGV[0] and $ARGV[1]";
print "\nType of count: $type $regex";
print "\nTotal BP: $total_BP";
print "\nIndependent C: $independent_C";
print "\nTotal C: $total_C";
print "\nTotal CG: $total_CG";
print "\nTotal CHG: $total_CHG";
print "\nTotal CHH: $total_CHH \n\n";