Automate Case-Studies Issue #27

Expanded Bug-Drug.R code to retrieve and save FASTA sequences for ESKAPE pathogens resistant to DAP (Daptomycin)
JRaviLab · Nov 24, 2024 · 1dc5c81 · 1dc5c81
1 parent aee86b7
commit 1dc5c81
Showing 1 changed file with 321 additions and 0 deletions.
diff --git a/case_studies/CARD/ESKAPE Pathogens Code.R b/case_studies/CARD/ESKAPE Pathogens Code.R
@@ -0,0 +1,321 @@
+# config.R
+url <- "https://card.mcmaster.ca/download/0/broadstreet-v3.3.0.tar.bz2"
+destfile <- "broadstreet-v3.3.0.tar.bz2"
+
+# Download the file
+download.file(url, destfile)
+
+#Extract the file
+if (!require("R.utils")) {
+  install.packages("R.utils")
+  library(R.utils)
+}
+
+
+# Extract the tar file
+untar("broadstreet-v3.3.0.tar.bz2", exdir = "CARD_data")
+
+
+# Map CARD Short Name
+# Install and Load dplyr and readr
+packages <- c("dplyr", "readr")
+
+for (pkg in packages) {
+  if (!require(pkg, character.only = TRUE)) {
+    install.packages(pkg)
+    library(pkg, character.only = TRUE)
+  } else {
+    library(pkg, character.only = TRUE)
+  }
+}
+
+# Parse the required files using readr::read_delim
+aro_index <- read_delim("CARD_data/aro_index.tsv", delim = "\t", col_names = TRUE)
+antibiotics_data <- read_delim("CARD_data/shortname_antibiotics.tsv", delim = "\t", col_names = TRUE)
+pathogens_data <- read_delim("CARD_data/shortname_pathogens.tsv", delim = "\t", col_names = TRUE)
+
+
+# Extract pathogen, gene, drug, and include Protein.Accession from 'CARD.Short.Name'
+# Load necessary libraries
+library(dplyr)
+library(tidyr)
+library(purrr)
+
+# Define the function to extract pathogen, gene, and drug information
+extract_card_info <- function(card_short_name, drug_class, `Protein Accession`, `DNA Accession`) {
+  # Split the CARD Short Name by underscores
+  split_names <- unlist(strsplit(card_short_name, "_"))
+
+  # Initialize variables with defaults
+  pathogen <- NA
+  gene <- NA
+  drug <- drug_class  # Default to Drug Class column
+
+  # Determine the information based on the split names and patterns
+  if (length(split_names) == 1) {
+    # Gene only (single part entry)
+    gene <- split_names[1]
+    pathogen <- "MULTI"  # Assign MULTI as default for pathogen
+  } else if (all(toupper(split_names) == split_names)) {
+    # Gene complex (all uppercase entries)
+    gene <- card_short_name  # Entire entry as gene
+    pathogen <- "MULTI"
+  } else if (length(split_names) == 2) {
+    # Pathogen-Gene scenario
+    pathogen <- split_names[1]
+    gene <- split_names[2]
+  } else if (length(split_names) == 3) {
+    # Pathogen-Gene-Drug scenario
+    pathogen <- split_names[1]
+    gene <- split_names[2]
+    drug <- split_names[3]  # Assign drug from the split entry
+  }
+
+  # If both pathogen and gene are NA, classify as complex gene
+  if (is.na(pathogen) && is.na(gene)) {
+    gene <- card_short_name  # Assign entire CARD Short Name as gene
+    pathogen <- "MULTI"      # Default to MULTI for pathogen
+  }
+
+  # Handle Protein Accession
+  if (is.na(`Protein Accession`) || `Protein Accession` == "") {
+    `Protein Accession` <- `DNA Accession`  # Use DNA Accession if Protein Accession is NA
+  }
+
+  return(list(Pathogen = pathogen, Gene = gene, Drug = drug, Protein_Accession = `Protein Accession`))
+}
+
+# Apply the function to the data frame
+resistance_profile_data <- aro_index %>%
+  mutate(extracted_info = pmap(list(`CARD Short Name`, `Drug Class`, `Protein Accession`, `DNA Accession`),
+                               extract_card_info)) %>%
+  unnest_wider(extracted_info)
+
+# View the resulting data frame
+print(resistance_profile_data)
+
+# Define a relative path for saving the data
+output_path <- file.path("CARD_data", "resistance_profile_data.tsv")
+
+# Save resistance_profile_data to the specified path
+write_delim(resistance_profile_data, output_path, delim = "\t")
+
+# Load data
+resistance_profile_data <- read_delim(output_path, delim = "\t", col_names = TRUE)
+antibiotics_data <- read_delim("CARD_data/shortname_antibiotics.tsv", delim = "\t", col_names = TRUE)
+pathogens_data <- read_delim("CARD_data/shortname_pathogens.tsv", delim = "\t", col_names = TRUE)
+
+
+# Merge the extracted resistance profile data with antibiotics_data on Drug
+merged_data_antibiotics <- left_join(
+  resistance_profile_data,
+  antibiotics_data,
+  by = c("Drug" = "AAC Abbreviation"), # Adjusting for abbreviations between datasets
+  relationship = "many-to-many"
+)
+
+# Merge the result with pathogens_data on Pathogen, renaming Pathogen.y to Pathogen_Full_Name
+merged_data_pathogens <- left_join(
+  merged_data_antibiotics,
+  pathogens_data,
+  by = c("Pathogen" = "Abbreviation")
+) %>%
+  rename(Pathogen_Full_Name = Pathogen.y)
+
+# Assign "Multi-species" to Pathogen_Full_Name where Pathogen values are "MULTI"
+merged_data_pathogens <- merged_data_pathogens %>%
+  mutate(Pathogen_Full_Name = if_else(Pathogen == "MULTI", "Multi-species", Pathogen_Full_Name))
+
+
+# Assign "Multi-class" to Molecule where Drug values are full names (not abbreviations)
+merged_data_pathogens <- merged_data_pathogens %>%
+  mutate(Molecule = if_else(grepl(" ", Drug) | grepl("-", Drug), "Multi-class", Molecule))
+
+
+
+#FASTA sequences
+#Install and Load required packages
+if (!requireNamespace("rentrez", quietly = TRUE)) {
+  install.packages("rentrez")
+}
+if (!requireNamespace("XML", quietly = TRUE)) {
+  install.packages("XML")
+}
+if (!requireNamespace("stringr", quietly = TRUE)) {
+  install.packages("stringr")
+}
+
+
+library(rentrez)
+library(XML)
+library(stringr)
+
+# Filter for the target drug (DAP) and pathogen (Staphylococcus aureus)
+# Define the filtering function
+filter_resistance_mechanisms <- function(data, drug, bug, exclude_multiclass = FALSE, species_restricted = TRUE) {
+
+  # Filter by drug using partial match to include multiclass entries containing the target drug
+  filtered_data <- data %>%
+    filter(grepl(drug, Drug, ignore.case = TRUE))
+
+  # Filter by pathogen, using partial match
+  filtered_data <- filtered_data %>%
+    filter(grepl(bug, Pathogen_Full_Name, ignore.case = TRUE))
+
+  # Optionally exclude multiclass resistance if exclude_multiclass = TRUE
+  if (exclude_multiclass) {
+    filtered_data <- filtered_data %>%
+      filter(!grepl(";", Drug))  # Only include entries with single drug classes
+  }
+
+  # Optionally restrict to species-specific mechanisms if species_restricted = TRUE
+  if (species_restricted) {
+    filtered_data <- filtered_data %>%
+      filter(Pathogen_Full_Name == bug)  # Include only entries with exact match to the bug of interest
+  }
+
+  return(filtered_data)
+}
+
+# Usage example for ESKAPE Pathogens resistant to DAP, including multispecies and multiclass resistance
+
+filtered_data_saurdap <- filter_resistance_mechanisms(
+  data = merged_data_pathogens,
+  drug = "DAP",
+  bug = "Staphylococcus aureus",
+  exclude_multiclass = FALSE,
+  species_restricted = FALSE
+)
+
+
+
+filtered_data_enterobacter <- filter_resistance_mechanisms(
+  data = merged_data_pathogens,
+  drug = "DAP",
+  bug = "Enterobacter spp",
+  exclude_multiclass = FALSE,
+  species_restricted = FALSE
+)
+
+
+
+filtered_data_klebsiella <- filter_resistance_mechanisms(
+  data = merged_data_pathogens,
+  drug = "DAP",
+  bug = "Klebsiella pneumoniae",
+  exclude_multiclass = FALSE,
+  species_restricted = FALSE
+)
+
+filtered_data_acinetobacter <- filter_resistance_mechanisms(
+  data = merged_data_pathogens,
+  drug = "DAP",
+  bug = "Acinetobacter baumannii",
+  exclude_multiclass = FALSE,
+  species_restricted = FALSE
+)
+
+filtered_data_pseudomonas <- filter_resistance_mechanisms(
+  data = merged_data_pathogens,
+  drug = "DAP",
+  bug = "Pseudomonas aeruginosa",
+  exclude_multiclass = FALSE,
+  species_restricted = FALSE
+)
+
+filtered_data_enterococcus <- filter_resistance_mechanisms(
+  data = merged_data_pathogens,
+  drug = "DAP",
+  bug = "Enterococcus faecalis",
+  exclude_multiclass = FALSE,
+  species_restricted = FALSE
+)
+
+# Combine results
+filtered_data_all_species <- bind_rows(
+  filtered_data_enterobacter,
+  filtered_data_saurdap,
+  filtered_data_klebsiella,
+  filtered_data_acinetobacter,
+  filtered_data_pseudomonas,
+  filtered_data_enterococcus
+)
+
+# View the combined data for all species
+View(filtered_data_all_species)
+
+# FASTA sequences
+# Install and Load required packages
+if (!requireNamespace("rentrez", quietly = TRUE)) {
+  install.packages("rentrez")
+}
+if (!requireNamespace("XML", quietly = TRUE)) {
+  install.packages("XML")
+}
+if (!requireNamespace("stringr", quietly = TRUE)) {
+  install.packages("stringr")
+}
+
+library(rentrez)
+library(XML)
+library(stringr)
+
+# Fetch FASTA sequence from Entrez
+fetch_fasta_sequence <- function(Protein_Accession) {
+  tryCatch({
+    # Fetch the FASTA sequence using Entrez
+    fasta_seq <- rentrez::entrez_fetch(db = "protein",
+                                       id = Protein_Accession,
+                                       rettype = "fasta",
+                                       retmode = "text")
+
+    if (!is.null(fasta_seq)) {
+      # Ensure the first line starts with ">"
+      if (!grepl("^>", fasta_seq[1])) {
+        fasta_seq[1] <- paste0(">", fasta_seq[1])
+      }
+
+      # Split the sequence into lines
+      lines <- str_split(fasta_seq, "\n")[[1]]
+
+      # Join the lines back together
+      fasta_seq <- paste(lines, collapse = "\n")
+
+      return(fasta_seq)
+    } else {
+      warning(paste("Failed to retrieve FASTA sequence for protein accession:", Protein_Accession))
+      return(NULL)
+    }
+  }, error = function(e) {
+    warning(paste("Error fetching FASTA sequence for protein accession:", Protein_Accession, ":", e$message))
+    return(NULL)
+  })
+}
+
+
+
+
+# Loop through filtered_data_all_species to fetch and save FASTA sequences
+combined_sequences <- character()
+
+for (i in 1:nrow(filtered_data_all_species)) {
+  # Fetch FASTA sequence for each protein accession
+  protein_accession <- filtered_data_all_species$`Protein_Accession`[i]
+  fasta_sequence <- fetch_fasta_sequence(protein_accession)
+
+  if (!is.null(fasta_sequence)) {
+    combined_sequences <- c(combined_sequences, fasta_sequence)
+  }
+}
+
+# Save the combined FASTA sequences
+filename <- "ESKAPE Pathogens resistant to DAP.fasta"
+
+writeLines(combined_sequences, filename)
+
+# Read the FASTA file
+fasta_content <- readLines(filename)
+
+# Display the contents
+cat(fasta_content, sep = "\n")
+