Skip to content

Commit

Permalink
Automate Case-Studies Issue #27
Browse files Browse the repository at this point in the history
Expanded Bug-Drug.R code to retrieve and save FASTA sequences for ESKAPE pathogens resistant to DAP (Daptomycin)
  • Loading branch information
Cateline authored Nov 24, 2024
1 parent aee86b7 commit 1dc5c81
Showing 1 changed file with 321 additions and 0 deletions.
321 changes: 321 additions & 0 deletions case_studies/CARD/ESKAPE Pathogens Code.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,321 @@
# config.R
url <- "https://card.mcmaster.ca/download/0/broadstreet-v3.3.0.tar.bz2"
destfile <- "broadstreet-v3.3.0.tar.bz2"

# Download the file
download.file(url, destfile)

#Extract the file
if (!require("R.utils")) {
install.packages("R.utils")
library(R.utils)
}


# Extract the tar file
untar("broadstreet-v3.3.0.tar.bz2", exdir = "CARD_data")


# Map CARD Short Name
# Install and Load dplyr and readr
packages <- c("dplyr", "readr")

for (pkg in packages) {
if (!require(pkg, character.only = TRUE)) {
install.packages(pkg)
library(pkg, character.only = TRUE)
} else {
library(pkg, character.only = TRUE)
}
}

# Parse the required files using readr::read_delim
aro_index <- read_delim("CARD_data/aro_index.tsv", delim = "\t", col_names = TRUE)
antibiotics_data <- read_delim("CARD_data/shortname_antibiotics.tsv", delim = "\t", col_names = TRUE)
pathogens_data <- read_delim("CARD_data/shortname_pathogens.tsv", delim = "\t", col_names = TRUE)


# Extract pathogen, gene, drug, and include Protein.Accession from 'CARD.Short.Name'
# Load necessary libraries
library(dplyr)
library(tidyr)
library(purrr)

# Define the function to extract pathogen, gene, and drug information
extract_card_info <- function(card_short_name, drug_class, `Protein Accession`, `DNA Accession`) {
# Split the CARD Short Name by underscores
split_names <- unlist(strsplit(card_short_name, "_"))

# Initialize variables with defaults
pathogen <- NA
gene <- NA
drug <- drug_class # Default to Drug Class column

# Determine the information based on the split names and patterns
if (length(split_names) == 1) {
# Gene only (single part entry)
gene <- split_names[1]
pathogen <- "MULTI" # Assign MULTI as default for pathogen
} else if (all(toupper(split_names) == split_names)) {
# Gene complex (all uppercase entries)
gene <- card_short_name # Entire entry as gene
pathogen <- "MULTI"
} else if (length(split_names) == 2) {
# Pathogen-Gene scenario
pathogen <- split_names[1]
gene <- split_names[2]
} else if (length(split_names) == 3) {
# Pathogen-Gene-Drug scenario
pathogen <- split_names[1]
gene <- split_names[2]
drug <- split_names[3] # Assign drug from the split entry
}

# If both pathogen and gene are NA, classify as complex gene
if (is.na(pathogen) && is.na(gene)) {
gene <- card_short_name # Assign entire CARD Short Name as gene
pathogen <- "MULTI" # Default to MULTI for pathogen
}

# Handle Protein Accession
if (is.na(`Protein Accession`) || `Protein Accession` == "") {
`Protein Accession` <- `DNA Accession` # Use DNA Accession if Protein Accession is NA
}

return(list(Pathogen = pathogen, Gene = gene, Drug = drug, Protein_Accession = `Protein Accession`))
}

# Apply the function to the data frame
resistance_profile_data <- aro_index %>%
mutate(extracted_info = pmap(list(`CARD Short Name`, `Drug Class`, `Protein Accession`, `DNA Accession`),
extract_card_info)) %>%
unnest_wider(extracted_info)

# View the resulting data frame
print(resistance_profile_data)

# Define a relative path for saving the data
output_path <- file.path("CARD_data", "resistance_profile_data.tsv")

# Save resistance_profile_data to the specified path
write_delim(resistance_profile_data, output_path, delim = "\t")

# Load data
resistance_profile_data <- read_delim(output_path, delim = "\t", col_names = TRUE)
antibiotics_data <- read_delim("CARD_data/shortname_antibiotics.tsv", delim = "\t", col_names = TRUE)
pathogens_data <- read_delim("CARD_data/shortname_pathogens.tsv", delim = "\t", col_names = TRUE)


# Merge the extracted resistance profile data with antibiotics_data on Drug
merged_data_antibiotics <- left_join(
resistance_profile_data,
antibiotics_data,
by = c("Drug" = "AAC Abbreviation"), # Adjusting for abbreviations between datasets
relationship = "many-to-many"
)

# Merge the result with pathogens_data on Pathogen, renaming Pathogen.y to Pathogen_Full_Name
merged_data_pathogens <- left_join(
merged_data_antibiotics,
pathogens_data,
by = c("Pathogen" = "Abbreviation")
) %>%
rename(Pathogen_Full_Name = Pathogen.y)

# Assign "Multi-species" to Pathogen_Full_Name where Pathogen values are "MULTI"
merged_data_pathogens <- merged_data_pathogens %>%
mutate(Pathogen_Full_Name = if_else(Pathogen == "MULTI", "Multi-species", Pathogen_Full_Name))


# Assign "Multi-class" to Molecule where Drug values are full names (not abbreviations)
merged_data_pathogens <- merged_data_pathogens %>%
mutate(Molecule = if_else(grepl(" ", Drug) | grepl("-", Drug), "Multi-class", Molecule))



#FASTA sequences
#Install and Load required packages
if (!requireNamespace("rentrez", quietly = TRUE)) {
install.packages("rentrez")
}
if (!requireNamespace("XML", quietly = TRUE)) {
install.packages("XML")
}
if (!requireNamespace("stringr", quietly = TRUE)) {
install.packages("stringr")
}


library(rentrez)
library(XML)
library(stringr)

# Filter for the target drug (DAP) and pathogen (Staphylococcus aureus)
# Define the filtering function
filter_resistance_mechanisms <- function(data, drug, bug, exclude_multiclass = FALSE, species_restricted = TRUE) {

# Filter by drug using partial match to include multiclass entries containing the target drug
filtered_data <- data %>%
filter(grepl(drug, Drug, ignore.case = TRUE))

# Filter by pathogen, using partial match
filtered_data <- filtered_data %>%
filter(grepl(bug, Pathogen_Full_Name, ignore.case = TRUE))

# Optionally exclude multiclass resistance if exclude_multiclass = TRUE
if (exclude_multiclass) {
filtered_data <- filtered_data %>%
filter(!grepl(";", Drug)) # Only include entries with single drug classes
}

# Optionally restrict to species-specific mechanisms if species_restricted = TRUE
if (species_restricted) {
filtered_data <- filtered_data %>%
filter(Pathogen_Full_Name == bug) # Include only entries with exact match to the bug of interest
}

return(filtered_data)
}

# Usage example for ESKAPE Pathogens resistant to DAP, including multispecies and multiclass resistance

filtered_data_saurdap <- filter_resistance_mechanisms(
data = merged_data_pathogens,
drug = "DAP",
bug = "Staphylococcus aureus",
exclude_multiclass = FALSE,
species_restricted = FALSE
)



filtered_data_enterobacter <- filter_resistance_mechanisms(
data = merged_data_pathogens,
drug = "DAP",
bug = "Enterobacter spp",
exclude_multiclass = FALSE,
species_restricted = FALSE
)



filtered_data_klebsiella <- filter_resistance_mechanisms(
data = merged_data_pathogens,
drug = "DAP",
bug = "Klebsiella pneumoniae",
exclude_multiclass = FALSE,
species_restricted = FALSE
)

filtered_data_acinetobacter <- filter_resistance_mechanisms(
data = merged_data_pathogens,
drug = "DAP",
bug = "Acinetobacter baumannii",
exclude_multiclass = FALSE,
species_restricted = FALSE
)

filtered_data_pseudomonas <- filter_resistance_mechanisms(
data = merged_data_pathogens,
drug = "DAP",
bug = "Pseudomonas aeruginosa",
exclude_multiclass = FALSE,
species_restricted = FALSE
)

filtered_data_enterococcus <- filter_resistance_mechanisms(
data = merged_data_pathogens,
drug = "DAP",
bug = "Enterococcus faecalis",
exclude_multiclass = FALSE,
species_restricted = FALSE
)

# Combine results
filtered_data_all_species <- bind_rows(
filtered_data_enterobacter,
filtered_data_saurdap,
filtered_data_klebsiella,
filtered_data_acinetobacter,
filtered_data_pseudomonas,
filtered_data_enterococcus
)

# View the combined data for all species
View(filtered_data_all_species)

# FASTA sequences
# Install and Load required packages
if (!requireNamespace("rentrez", quietly = TRUE)) {
install.packages("rentrez")
}
if (!requireNamespace("XML", quietly = TRUE)) {
install.packages("XML")
}
if (!requireNamespace("stringr", quietly = TRUE)) {
install.packages("stringr")
}

library(rentrez)
library(XML)
library(stringr)

# Fetch FASTA sequence from Entrez
fetch_fasta_sequence <- function(Protein_Accession) {
tryCatch({
# Fetch the FASTA sequence using Entrez
fasta_seq <- rentrez::entrez_fetch(db = "protein",
id = Protein_Accession,
rettype = "fasta",
retmode = "text")

if (!is.null(fasta_seq)) {
# Ensure the first line starts with ">"
if (!grepl("^>", fasta_seq[1])) {
fasta_seq[1] <- paste0(">", fasta_seq[1])
}

# Split the sequence into lines
lines <- str_split(fasta_seq, "\n")[[1]]

# Join the lines back together
fasta_seq <- paste(lines, collapse = "\n")

return(fasta_seq)
} else {
warning(paste("Failed to retrieve FASTA sequence for protein accession:", Protein_Accession))
return(NULL)
}
}, error = function(e) {
warning(paste("Error fetching FASTA sequence for protein accession:", Protein_Accession, ":", e$message))
return(NULL)
})
}




# Loop through filtered_data_all_species to fetch and save FASTA sequences
combined_sequences <- character()

for (i in 1:nrow(filtered_data_all_species)) {
# Fetch FASTA sequence for each protein accession
protein_accession <- filtered_data_all_species$`Protein_Accession`[i]
fasta_sequence <- fetch_fasta_sequence(protein_accession)

if (!is.null(fasta_sequence)) {
combined_sequences <- c(combined_sequences, fasta_sequence)
}
}

# Save the combined FASTA sequences
filename <- "ESKAPE Pathogens resistant to DAP.fasta"

writeLines(combined_sequences, filename)

# Read the FASTA file
fasta_content <- readLines(filename)

# Display the contents
cat(fasta_content, sep = "\n")

0 comments on commit 1dc5c81

Please sign in to comment.