Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 88 additions & 0 deletions Load/bin/filterForLongestTranscript
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
#!/usr/bin/perl
use strict;
use warnings;
use Getopt::Long;
use File::Temp qw(tempdir);
use File::Spec;
use File::Basename;
use File::Copy qw(move);

my $fastaDir;
GetOptions("fastaDir=s" => \$fastaDir) or die "Usage: $0 --fastaDir <directory>\n";
die "Error: --fastaDir not provided\n" unless $fastaDir;
die "Error: directory '$fastaDir' not found\n" unless -d $fastaDir;

# Create a temporary working directory for filtered FASTAs
my $tmpdir = tempdir(CLEANUP => 1);
print "Temporary directory: $tmpdir\n";

# Process each FASTA file in the directory
opendir(my $dh, $fastaDir) or die "Cannot open directory $fastaDir: $!\n";
while (my $file = readdir($dh)) {
next unless $file =~ /\.fa(sta)?$/i; # only .fa or .fasta
my $path = File::Spec->catfile($fastaDir, $file);
process_fasta($path, $tmpdir);
}
closedir($dh);

# Move processed files back to original directory
opendir(my $tdh, $tmpdir) or die "Cannot open temp directory $tmpdir: $!\n";
while (my $file = readdir($tdh)) {
next if $file =~ /^\./;
my $src = File::Spec->catfile($tmpdir, $file);
my $dest = File::Spec->catfile($fastaDir, $file);
move($src, $dest) or die "Failed to overwrite $dest: $!\n";
}
closedir($tdh);

# ================================================= Subroutines ==================================================

sub process_fasta {
my ($fasta, $tmpdir) = @_;
my %genes;

open(my $fh, '<', $fasta) or die "Cannot open $fasta: $!\n";
my ($header, $seq) = ('', '');
while (<$fh>) {
chomp;
if (/^>/) {
store_record($header, $seq, \%genes) if $header && $seq;
$header = $_;
$seq = '';
} else {
$seq .= $_;
}
}
store_record($header, $seq, \%genes) if $header && $seq;
close($fh);

my ($filename) = fileparse($fasta);
my $out = File::Spec->catfile($tmpdir, $filename);
open(my $outfh, '>', $out) or die "Cannot write $out: $!\n";

foreach my $gene (sort keys %genes) {
print $outfh $genes{$gene}->{header}, "\n", $genes{$gene}->{seq}, "\n";
}
close($outfh);
}

sub store_record {
my ($header, $seq, $genes_ref) = @_;
return unless $header && $seq;

my ($gene_id) = $header =~ /gene=([^\s]+)/;
# gene= currently consistent across all fastas, core and peripheral, but could change
unless ($gene_id) {
warn " No gene= found in header: $header\n";
return;
}

my $len = length($seq);
if (!exists $genes_ref->{$gene_id} || $len > $genes_ref->{$gene_id}->{len}) {
$genes_ref->{$gene_id} = {
header => $header,
seq => $seq,
len => $len
};
}
}
7 changes: 2 additions & 5 deletions Load/bin/orthomclEcPrediction
Original file line number Diff line number Diff line change
Expand Up @@ -1372,16 +1372,13 @@ sub ecsSql {
sub proteinsSql {
my ($group) = @_;
return "SELECT full_id,product,length,core_peripheral,group_name,organism_name
FROM webready.ProteinSequence_pGroup WHERE group_name='$group'";
FROM webready.ProteinSequenceGroup WHERE group_name='$group'";
}

sub groupsSql {
my ($minNumProteinsWithEc,$maxNumProteinsWithEc) = @_;
<<<<<<< HEAD

return "SELECT DISTINCT(group_name) FROM webready.ProteinSequenceGroup";
=======
return "SELECT DISTINCT(group_name) FROM webready.ProteinSequence_pgroup";
>>>>>>> master
}

sub orthoOrganismSql {
Expand Down
90 changes: 49 additions & 41 deletions Load/plugin/perl/InsertPhylogeneticProfile.pm
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ my $argsDeclaration =
descr => 'ortholog groups file as produced by orthofinder',
reqd => 1,
mustExist => 1,
format => 'OG7_0001009: osa|ENS1222992 pfa|PF11_0844...',
format => 'OG7_0001009: osa|ENS1222992 pfa|PF11_0844...',
constraintFunc => undef,
isList => 0, }),

Expand Down Expand Up @@ -63,11 +63,11 @@ my $documentation = { purpose => $purpose,
# ----------------------------------------------------------------------

sub new {
my ($class) = @_;
my $self = {};
bless($self,$class);
my ($class) = @_;
my $self = {};
bless($self,$class);

$self->initialize({ requiredDbVersion => 4.0,
$self->initialize({ requiredDbVersion => 4.0,
cvsRevision => '$Revision$',
name => ref($self),
argsDeclaration => $argsDeclaration,
Expand All @@ -79,59 +79,67 @@ sub new {
# ======================================================================

sub run {
my ($self) = @_;
my ($self) = @_;

my %allTaxa;
my %allTaxa;

my $sql = "select orthomcl_abbrev from ApiDB.Organism";
my $sth = $self->prepareAndExecute($sql);
my $sql = "select orthomcl_abbrev from ApiDB.Organism";
my $sth = $self->prepareAndExecute($sql);

while (my ($orthomclAbbrev) = $sth->fetchrow_array()) {
$allTaxa{$orthomclAbbrev} = 1;
}
while (my ($orthomclAbbrev) = $sth->fetchrow_array()) {
$allTaxa{$orthomclAbbrev} = 1;
}

my @allTaxaSorted = sort(keys(%allTaxa));
my %proteinIdToOrthomclAbbrev;

open(FILE, $self->getArg('groupsFile')) || die "Could Not open Ortholog File for reading: $!\n";
my $idSql = "select org.orthomcl_abbrev, aas.source_id from dots.aasequence aas, apidb.organism org where aas.taxon_id = org.taxon_id";
my $idSth = $self->prepareAndExecute($idSql);

my ($counter);
while (my $line = <FILE>) {
chomp($line);
my ($groupId, $membersString) = split(/\:\s/, $line);
my @members = split(/\s/, $membersString);
while (my ($orthoAbbrev,$source_id) = $idSth->fetchrow_array()) {
$proteinIdToOrthomclAbbrev{$source_id} = $orthoAbbrev;
}

my %taxaInThisGroup;
my @allTaxaSorted = sort(keys(%allTaxa));

foreach my $member (@members) {
# pfal|PF11_0987
my ($taxonCode, $sourceId) = split(/\|/, $member);
$taxaInThisGroup{$taxonCode} = 1;
}
open(FILE, $self->getArg('groupsFile')) || die "Could Not open Ortholog File for reading: $!\n";

my @fullProfile;
my ($counter);
while (my $line = <FILE>) {
chomp($line);
my ($groupId, $membersString) = split(/\:\s/, $line);
my @members = split(/\s/, $membersString);

foreach my $taxaCode (@allTaxaSorted) {
my $yesNo = $taxaInThisGroup{$taxaCode} ? 'Y' : 'N';
push(@fullProfile, "$taxaCode:$yesNo");
}
my %taxaInThisGroup;

my $profileString = join(':', @fullProfile);
foreach my $member (@members) {
my $taxonCode = $proteinIdToOrthomclAbbrev{$member};
$taxaInThisGroup{$taxonCode} = 1;
}

my $profile = GUS::Model::ApiDB::PhylogeneticProfile->new({source_id => $groupId, profile_string => $profileString});
$profile->submit();
my @fullProfile;

$counter++;
if ($counter % 5000 == 0) {
$self->log("Processed $counter lines from groupsFile");
$self->undefPointerCache();
}
foreach my $taxaCode (@allTaxaSorted) {
my $yesNo = $taxaInThisGroup{$taxaCode} ? 'Y' : 'N';
push(@fullProfile, "$taxaCode:$yesNo");
}

}
my $profileString = join(':', @fullProfile);

my $profile = GUS::Model::ApiDB::PhylogeneticProfile->new({source_id => $groupId, profile_string => $profileString});
$profile->submit();

$counter++;
if ($counter % 5000 == 0) {
$self->log("Processed $counter lines from groupsFile");
$self->undefPointerCache();
}

}
}

sub undoTables {
my ($self) = @_;
return ('ApiDB.PhylogeneticProfile');
my ($self) = @_;
return ('ApiDB.PhylogeneticProfile');
}

1;