From af56507801c7926dbddf3b2e8a14180506f42f4d Mon Sep 17 00:00:00 2001 From: Faisal Alquaddoomi Date: Tue, 10 Dec 2024 11:43:33 -0700 Subject: [PATCH] Apply suggestions from code review Co-authored-by: Janani Ravi --- R/pspViz.R | 120 ++++++++++++++--------------------------------------- 1 file changed, 31 insertions(+), 89 deletions(-) diff --git a/R/pspViz.R b/R/pspViz.R index 75a9d262..dc21016e 100644 --- a/R/pspViz.R +++ b/R/pspViz.R @@ -14,15 +14,6 @@ cln_colnames <- c("AccNum", "DomArch", "GenContext", "Length", "GeneName", "Lineage", "Species", "GCA_ID", "GeneDesc", "DomArch.repeats", "GenContext.repeats") -domains_rename <- read_delim("data/acc_files/domains_rename.txt", - delim="\t", col_names=TRUE) - -# domains_ignore <- read_delim("data/acc_files/domains_ignore.txt", -# delim="\t", col_names=T) - -domains_keep <- read_delim("data/acc_files/domains_keep.txt", - delim="\t", col_names=T) - query_domains <- read_delim("data/acc_files/query_domains.txt", delim="\t", col_names=T) lineage_rename <- read_tsv("data/acc_files/lineage_rename.txt", @@ -40,24 +31,23 @@ data_cleanup <- function(all_raw) { # For the repeats all_cln <- all_cln %>% cleanup_domarch(domains_rename=domains_rename, - domains_keep=NULL, # filter applied to only ClustName for now. - domains_ignore=NULL, #!! should check and remove soon! - repeat2s=FALSE, - remove_tails=F, #new! check below if it works! - remove_empty=F) #new! needed? + repeat2s=FALSE) + # remove_tails=F, # FIXME # keep if it exists & works for DomArch + # remove_empty=F) # FIXME # keep if it exists & works for DomArch + all_cln$DomArch.repeats <- all_cln$DomArch # Removing duplicate AccNum w/ different DomArchs all_cln$DomArch.uncompressed <- all_cln$DomArch.repeats # !! repeat2s: deprecation notice for funs & list - all_cln <- repeat2s(all_cln, "DomArch.uncompressed", - excluded_prots=c("PspC", "LiaI-LiaF-TM")) + all_cln <- repeat2s(all_cln, "DomArch.uncompressed") # Extract unique rows all_cln <- all_cln %>% distinct() # Pick longer of the duplicated AccNum DomArchs - all_cln <- all_cln %>% pick_longer_duplicate("DomArch.uncompressed") + all_cln <- all_cln %>% + pick_longer_duplicate("DomArch.uncompressed") all_cln <- all_cln %>% cleanup_domarch(domains_rename=domains_rename, @@ -67,17 +57,18 @@ data_cleanup <- function(all_raw) { remove_tails=F, remove_empty=F) + # FIXME # Not using genomic context parts of the script, yet # Cleanup GenContext - # Calls reverse_operons - all_cln <- all_cln %>% - cleanup_gencontext(domains_rename=domains_rename, - repeat2s=FALSE) - - all_cln$GenContext.repeats <- all_cln$GenContext - - all_cln <- all_cln %>% - cleanup_gencontext(domains_rename=domains_rename, - repeat2s=T) + # Calls reverseOperons + # all_cln <- all_cln %>% + # cleanup_gencontext(domains_rename=domains_rename, + # repeat2s=FALSE) + # + # all_cln$GenContext.repeats <- all_cln$GenContext + # + # all_cln <- all_cln %>% + # cleanup_gencontext(domains_rename=domains_rename, + # repeat2s=T) # Subset essential columns for downstream analyses all_cln <- all_cln %>% select(all_of(cln_colnames)) @@ -87,88 +78,39 @@ data_cleanup <- function(all_raw) { - # Replace SIG+Toastrack with TM+Toastrack - all_cln$DomArch <- all_cln$DomArch %>% str_replace_all(pattern = "SIG\\+Toastrack", replacement = "TM+Toastrack") - all_cln$DomArch.repeats <- all_cln$DomArch.repeats %>% str_replace_all(pattern = "SIG\\+Toastrack", replacement = "TM+Toastrack") - - all_cln$GenContext <- all_cln$GenContext %>% str_replace_all(pattern = "SIG\\+Toastrack", replacement = "TM+Toastrack") - all_cln$GenContext.repeats <- all_cln$GenContext.repeats %>% str_replace_all(pattern = "SIG\\+Toastrack", replacement = "TM+Toastrack") - - # Replace SIG+DUF4178 with TM+4178 - all_cln$DomArch <- all_cln$DomArch %>% str_replace_all(pattern = "SIG\\+DUF4178", replacement = "TM+DUF4178") - all_cln$DomArch.repeats <- all_cln$DomArch.repeats %>% str_replace_all(pattern = "SIG\\+DUF4178", replacement = "TM+DUF4178") - - all_cln$GenContext <- all_cln$GenContext %>% str_replace_all(pattern = "SIG\\+DUF4178", replacement = "TM+DUF4178") - all_cln$GenContext.repeats <- all_cln$GenContext.repeats %>% str_replace_all(pattern = "SIG\\+DUF4178", replacement = "TM+DUF4178") - - # Replace SIG+PspA with TM+PspA - all_cln$DomArch <- all_cln$DomArch %>% str_replace_all(pattern = "SIG\\+PspA", replacement = "TM+PspA") - all_cln$DomArch.repeats <- all_cln$DomArch.repeats %>% str_replace_all(pattern = "SIG\\+PspA", replacement = "TM+PspA") - - all_cln$GenContext <- all_cln$GenContext %>% str_replace_all(pattern = "SIG\\+PspA", replacement = "TM+PspA") - all_cln$GenContext.repeats <- all_cln$GenContext.repeats %>% str_replace_all(pattern = "SIG\\+PspA", replacement = "TM+PspA") - - - # Convert Sig+Snf7 to TM+Snf7 - all_cln$DomArch <- all_cln$DomArch %>% str_replace_all(pattern = "SIG\\+Snf7", replacement = "TM+Snf7") - all_cln$DomArch.repeats <- all_cln$DomArch.repeats %>% str_replace_all(pattern = "SIG\\+Snf7", replacement = "TM+Snf7") - all_cln$GenContext <- all_cln$GenContext %>% str_replace_all(pattern = "SIG\\+Snf7", replacement = "TM+Snf7") - all_cln$GenContext.repeats <- all_cln$GenContext.repeats %>% str_replace_all(pattern = "SIG\\+Snf7", replacement = "TM+Snf7") - - - ## Write cleaned up file - # write_tsv(all_cln, "data/rawdata_tsv/all_clean_combined_20210329.txt") - prot <- all_cln } -psp_fig4 <- function(job_dir) { - # FIXME: figure out if all_raw can be read from the job folder +plotLineageBarStack <- function(job_dir) { + # FIXME # figure out if all_raw can be read from the job folder all_w_extra <- read_tsv(file="data/rawdata_tsv/all_with_extrapspasnf7.tsv") - # all_w_extra$GI all_raw <- all_w_extra - # similar to the Rmd, we run the cleanup function on all_raw to produce prot + # cleanup function on all_raw to produce prot + # FIXME # Check if this is needed prot <- data_cleanup(all_raw) - # the rest of the figure generation follows as-is - prot$Lineage.reduced = prot$Lineage %>% str_replace(pattern = "^eukaryota.*", replacement = "eukaryota") %>% str_replace(pattern = "^viruses.*", replacement = "viruses") + # rename lineages + prot$Lineage.reduced = prot$Lineage %>% + str_replace(pattern = "^eukaryota.*", replacement = "eukaryota") %>% + str_replace(pattern = "^viruses.*", replacement = "viruses") prot$Lineage = prot$Lineage.reduced - nopspa <- prot %>% - filter_by_doms(column="DomArch", - doms_remove=c("PspA", "PspA(s)", "Snf7"), - ignore.case=T) - # Include only DAs ≥ min.cutoff - nopspa_tc <- nopspa %>% total_counts(cutoff=100) + prot_cutoff <- prot %>% total_counts(cutoff=100) - nopspa_tc <- nopspa_tc %>% filter(totalcount>=50) + prot_cutoff <- prot_cutoff %>% filter(totalcount>=50) - cutoff_perc <- 100 - nopspa_tc$CumulativePercent[nrow(nopspa_tc)] + cutoff_perc <- 100 - prot_cutoff$CumulativePercent[nrow(prot_cutoff)] - stlin_nops <- stacked_lin_plot(prot=nopspa, column="DomArch", + stacked_lin <- stacked_lin_plot(prot= prot, column="DomArch", cutoff=cutoff_perc, label.size=20, legend.position=c(0.6, 0.40), legend.text.size=20, legend.size=1) - # res = stacked_lin_plot(prot=nopspa, column="DomArch", - # cutoff=cutoff_perc, - # label.size=20, - # legend.position=c(0.6, 0.40), legend.text.size=20, - # legend.size=1) - - return(stlin_nops) - - # FA: writing out the file commented out in favor of returning the object + return(stacked_lin) - # Decrease width, increase height, increase font size - # - # ggsave("stackedLinPlot_50tc.png", stlin_nops, - # width=15, - # height=18, - # dpi=300) }