Skip to content

Commit

Permalink
Apply suggestions from code review
Browse files Browse the repository at this point in the history
Co-authored-by: Janani Ravi <[email protected]>
  • Loading branch information
falquaddoomi and jananiravi authored Dec 10, 2024
1 parent c9ad4e4 commit af56507
Showing 1 changed file with 31 additions and 89 deletions.
120 changes: 31 additions & 89 deletions R/pspViz.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,6 @@ cln_colnames <- c("AccNum", "DomArch", "GenContext", "Length", "GeneName",
"Lineage", "Species", "GCA_ID", "GeneDesc",
"DomArch.repeats", "GenContext.repeats")

domains_rename <- read_delim("data/acc_files/domains_rename.txt",
delim="\t", col_names=TRUE)

# domains_ignore <- read_delim("data/acc_files/domains_ignore.txt",
# delim="\t", col_names=T)

domains_keep <- read_delim("data/acc_files/domains_keep.txt",
delim="\t", col_names=T)

query_domains <- read_delim("data/acc_files/query_domains.txt",
delim="\t", col_names=T)
lineage_rename <- read_tsv("data/acc_files/lineage_rename.txt",
Expand All @@ -40,24 +31,23 @@ data_cleanup <- function(all_raw) {
# For the repeats
all_cln <- all_cln %>%
cleanup_domarch(domains_rename=domains_rename,
domains_keep=NULL, # filter applied to only ClustName for now.
domains_ignore=NULL, #!! should check and remove soon!
repeat2s=FALSE,
remove_tails=F, #new! check below if it works!
remove_empty=F) #new! needed?
repeat2s=FALSE)
# remove_tails=F, # FIXME # keep if it exists & works for DomArch
# remove_empty=F) # FIXME # keep if it exists & works for DomArch


all_cln$DomArch.repeats <- all_cln$DomArch

# Removing duplicate AccNum w/ different DomArchs
all_cln$DomArch.uncompressed <- all_cln$DomArch.repeats
# !! repeat2s: deprecation notice for funs & list
all_cln <- repeat2s(all_cln, "DomArch.uncompressed",
excluded_prots=c("PspC", "LiaI-LiaF-TM"))
all_cln <- repeat2s(all_cln, "DomArch.uncompressed")

# Extract unique rows
all_cln <- all_cln %>% distinct()
# Pick longer of the duplicated AccNum DomArchs
all_cln <- all_cln %>% pick_longer_duplicate("DomArch.uncompressed")
all_cln <- all_cln %>%
pick_longer_duplicate("DomArch.uncompressed")

all_cln <- all_cln %>%
cleanup_domarch(domains_rename=domains_rename,
Expand All @@ -67,17 +57,18 @@ data_cleanup <- function(all_raw) {
remove_tails=F,
remove_empty=F)

# FIXME # Not using genomic context parts of the script, yet
# Cleanup GenContext
# Calls reverse_operons
all_cln <- all_cln %>%
cleanup_gencontext(domains_rename=domains_rename,
repeat2s=FALSE)

all_cln$GenContext.repeats <- all_cln$GenContext

all_cln <- all_cln %>%
cleanup_gencontext(domains_rename=domains_rename,
repeat2s=T)
# Calls reverseOperons
# all_cln <- all_cln %>%
# cleanup_gencontext(domains_rename=domains_rename,
# repeat2s=FALSE)
#
# all_cln$GenContext.repeats <- all_cln$GenContext
#
# all_cln <- all_cln %>%
# cleanup_gencontext(domains_rename=domains_rename,
# repeat2s=T)

# Subset essential columns for downstream analyses
all_cln <- all_cln %>% select(all_of(cln_colnames))
Expand All @@ -87,88 +78,39 @@ data_cleanup <- function(all_raw) {



# Replace SIG+Toastrack with TM+Toastrack
all_cln$DomArch <- all_cln$DomArch %>% str_replace_all(pattern = "SIG\\+Toastrack", replacement = "TM+Toastrack")
all_cln$DomArch.repeats <- all_cln$DomArch.repeats %>% str_replace_all(pattern = "SIG\\+Toastrack", replacement = "TM+Toastrack")

all_cln$GenContext <- all_cln$GenContext %>% str_replace_all(pattern = "SIG\\+Toastrack", replacement = "TM+Toastrack")
all_cln$GenContext.repeats <- all_cln$GenContext.repeats %>% str_replace_all(pattern = "SIG\\+Toastrack", replacement = "TM+Toastrack")

# Replace SIG+DUF4178 with TM+DUF4178
all_cln$DomArch <- all_cln$DomArch %>% str_replace_all(pattern = "SIG\\+DUF4178", replacement = "TM+DUF4178")
all_cln$DomArch.repeats <- all_cln$DomArch.repeats %>% str_replace_all(pattern = "SIG\\+DUF4178", replacement = "TM+DUF4178")

all_cln$GenContext <- all_cln$GenContext %>% str_replace_all(pattern = "SIG\\+DUF4178", replacement = "TM+DUF4178")
all_cln$GenContext.repeats <- all_cln$GenContext.repeats %>% str_replace_all(pattern = "SIG\\+DUF4178", replacement = "TM+DUF4178")

# Replace SIG+PspA with TM+PspA
all_cln$DomArch <- all_cln$DomArch %>% str_replace_all(pattern = "SIG\\+PspA", replacement = "TM+PspA")
all_cln$DomArch.repeats <- all_cln$DomArch.repeats %>% str_replace_all(pattern = "SIG\\+PspA", replacement = "TM+PspA")

all_cln$GenContext <- all_cln$GenContext %>% str_replace_all(pattern = "SIG\\+PspA", replacement = "TM+PspA")
all_cln$GenContext.repeats <- all_cln$GenContext.repeats %>% str_replace_all(pattern = "SIG\\+PspA", replacement = "TM+PspA")


# Convert SIG+Snf7 to TM+Snf7
all_cln$DomArch <- all_cln$DomArch %>% str_replace_all(pattern = "SIG\\+Snf7", replacement = "TM+Snf7")
all_cln$DomArch.repeats <- all_cln$DomArch.repeats %>% str_replace_all(pattern = "SIG\\+Snf7", replacement = "TM+Snf7")
all_cln$GenContext <- all_cln$GenContext %>% str_replace_all(pattern = "SIG\\+Snf7", replacement = "TM+Snf7")
all_cln$GenContext.repeats <- all_cln$GenContext.repeats %>% str_replace_all(pattern = "SIG\\+Snf7", replacement = "TM+Snf7")


## Write cleaned up file
# write_tsv(all_cln, "data/rawdata_tsv/all_clean_combined_20210329.txt")


prot <- all_cln
}

psp_fig4 <- function(job_dir) {
# FIXME: figure out if all_raw can be read from the job folder
plotLineageBarStack <- function(job_dir) {
# FIXME # figure out if all_raw can be read from the job folder
all_w_extra <- read_tsv(file="data/rawdata_tsv/all_with_extrapspasnf7.tsv")
# all_w_extra$GI
all_raw <- all_w_extra

# similar to the Rmd, we run the cleanup function on all_raw to produce prot
# cleanup function on all_raw to produce prot
# FIXME # Check if this is needed
prot <- data_cleanup(all_raw)

# the rest of the figure generation follows as-is
prot$Lineage.reduced = prot$Lineage %>% str_replace(pattern = "^eukaryota.*", replacement = "eukaryota") %>% str_replace(pattern = "^viruses.*", replacement = "viruses")
# rename lineages
prot$Lineage.reduced = prot$Lineage %>%
str_replace(pattern = "^eukaryota.*", replacement = "eukaryota") %>%
str_replace(pattern = "^viruses.*", replacement = "viruses")

prot$Lineage = prot$Lineage.reduced

nopspa <- prot %>%
filter_by_doms(column="DomArch",
doms_remove=c("PspA", "PspA(s)", "Snf7"),
ignore.case=T)

# Include only DAs ≥ min.cutoff
nopspa_tc <- nopspa %>% total_counts(cutoff=100)
prot_cutoff <- prot %>% total_counts(cutoff=100)

nopspa_tc <- nopspa_tc %>% filter(totalcount>=50)
prot_cutoff <- prot_cutoff %>% filter(totalcount>=50)

cutoff_perc <- 100 - nopspa_tc$CumulativePercent[nrow(nopspa_tc)]
cutoff_perc <- 100 - prot_cutoff$CumulativePercent[nrow(prot_cutoff)]

stlin_nops <- stacked_lin_plot(prot=nopspa, column="DomArch",
stacked_lin <- stacked_lin_plot(prot= prot, column="DomArch",
cutoff=cutoff_perc,
label.size=20,
legend.position=c(0.6, 0.40), legend.text.size=20,
legend.size=1)

# res = stacked_lin_plot(prot=nopspa, column="DomArch",
# cutoff=cutoff_perc,
# label.size=20,
# legend.position=c(0.6, 0.40), legend.text.size=20,
# legend.size=1)

return(stlin_nops)

# FA: writing out the file commented out in favor of returning the object
return(stacked_lin)

# Decrease width, increase height, increase font size
#
# ggsave("stackedLinPlot_50tc.png", stlin_nops,
# width=15,
# height=18,
# dpi=300)
}

0 comments on commit af56507

Please sign in to comment.