-
Notifications
You must be signed in to change notification settings - Fork 15
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Expanded Bug-Drug.R code to retrieve and save FASTA sequences for ESKAPE pathogens resistant to DAP (Daptomycin)
- Loading branch information
Showing
1 changed file
with
321 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,321 @@ | ||
# config.R | ||
url <- "https://card.mcmaster.ca/download/0/broadstreet-v3.3.0.tar.bz2" | ||
destfile <- "broadstreet-v3.3.0.tar.bz2" | ||
|
||
# Download the file | ||
download.file(url, destfile) | ||
|
||
#Extract the file | ||
if (!require("R.utils")) { | ||
install.packages("R.utils") | ||
library(R.utils) | ||
} | ||
|
||
|
||
# Extract the tar file | ||
untar("broadstreet-v3.3.0.tar.bz2", exdir = "CARD_data") | ||
|
||
|
||
# Map CARD Short Name | ||
# Install and Load dplyr and readr | ||
packages <- c("dplyr", "readr") | ||
|
||
for (pkg in packages) { | ||
if (!require(pkg, character.only = TRUE)) { | ||
install.packages(pkg) | ||
library(pkg, character.only = TRUE) | ||
} else { | ||
library(pkg, character.only = TRUE) | ||
} | ||
} | ||
|
||
# Parse the required files using readr::read_delim | ||
aro_index <- read_delim("CARD_data/aro_index.tsv", delim = "\t", col_names = TRUE) | ||
antibiotics_data <- read_delim("CARD_data/shortname_antibiotics.tsv", delim = "\t", col_names = TRUE) | ||
pathogens_data <- read_delim("CARD_data/shortname_pathogens.tsv", delim = "\t", col_names = TRUE) | ||
|
||
|
||
# Extract pathogen, gene, drug, and include Protein.Accession from 'CARD.Short.Name' | ||
# Load necessary libraries | ||
library(dplyr) | ||
library(tidyr) | ||
library(purrr) | ||
|
||
# Define the function to extract pathogen, gene, and drug information | ||
extract_card_info <- function(card_short_name, drug_class, `Protein Accession`, `DNA Accession`) { | ||
# Split the CARD Short Name by underscores | ||
split_names <- unlist(strsplit(card_short_name, "_")) | ||
|
||
# Initialize variables with defaults | ||
pathogen <- NA | ||
gene <- NA | ||
drug <- drug_class # Default to Drug Class column | ||
|
||
# Determine the information based on the split names and patterns | ||
if (length(split_names) == 1) { | ||
# Gene only (single part entry) | ||
gene <- split_names[1] | ||
pathogen <- "MULTI" # Assign MULTI as default for pathogen | ||
} else if (all(toupper(split_names) == split_names)) { | ||
# Gene complex (all uppercase entries) | ||
gene <- card_short_name # Entire entry as gene | ||
pathogen <- "MULTI" | ||
} else if (length(split_names) == 2) { | ||
# Pathogen-Gene scenario | ||
pathogen <- split_names[1] | ||
gene <- split_names[2] | ||
} else if (length(split_names) == 3) { | ||
# Pathogen-Gene-Drug scenario | ||
pathogen <- split_names[1] | ||
gene <- split_names[2] | ||
drug <- split_names[3] # Assign drug from the split entry | ||
} | ||
|
||
# If both pathogen and gene are NA, classify as complex gene | ||
if (is.na(pathogen) && is.na(gene)) { | ||
gene <- card_short_name # Assign entire CARD Short Name as gene | ||
pathogen <- "MULTI" # Default to MULTI for pathogen | ||
} | ||
|
||
# Handle Protein Accession | ||
if (is.na(`Protein Accession`) || `Protein Accession` == "") { | ||
`Protein Accession` <- `DNA Accession` # Use DNA Accession if Protein Accession is NA | ||
} | ||
|
||
return(list(Pathogen = pathogen, Gene = gene, Drug = drug, Protein_Accession = `Protein Accession`)) | ||
} | ||
|
||
# Apply the function to the data frame | ||
resistance_profile_data <- aro_index %>% | ||
mutate(extracted_info = pmap(list(`CARD Short Name`, `Drug Class`, `Protein Accession`, `DNA Accession`), | ||
extract_card_info)) %>% | ||
unnest_wider(extracted_info) | ||
|
||
# View the resulting data frame | ||
print(resistance_profile_data) | ||
|
||
# Define a relative path for saving the data | ||
output_path <- file.path("CARD_data", "resistance_profile_data.tsv") | ||
|
||
# Save resistance_profile_data to the specified path | ||
write_delim(resistance_profile_data, output_path, delim = "\t") | ||
|
||
# Load data | ||
resistance_profile_data <- read_delim(output_path, delim = "\t", col_names = TRUE) | ||
antibiotics_data <- read_delim("CARD_data/shortname_antibiotics.tsv", delim = "\t", col_names = TRUE) | ||
pathogens_data <- read_delim("CARD_data/shortname_pathogens.tsv", delim = "\t", col_names = TRUE) | ||
|
||
|
||
# Merge the extracted resistance profile data with antibiotics_data on Drug | ||
merged_data_antibiotics <- left_join( | ||
resistance_profile_data, | ||
antibiotics_data, | ||
by = c("Drug" = "AAC Abbreviation"), # Adjusting for abbreviations between datasets | ||
relationship = "many-to-many" | ||
) | ||
|
||
# Merge the result with pathogens_data on Pathogen, renaming Pathogen.y to Pathogen_Full_Name | ||
merged_data_pathogens <- left_join( | ||
merged_data_antibiotics, | ||
pathogens_data, | ||
by = c("Pathogen" = "Abbreviation") | ||
) %>% | ||
rename(Pathogen_Full_Name = Pathogen.y) | ||
|
||
# Assign "Multi-species" to Pathogen_Full_Name where Pathogen values are "MULTI" | ||
merged_data_pathogens <- merged_data_pathogens %>% | ||
mutate(Pathogen_Full_Name = if_else(Pathogen == "MULTI", "Multi-species", Pathogen_Full_Name)) | ||
|
||
|
||
# Assign "Multi-class" to Molecule where Drug values are full names (not abbreviations) | ||
merged_data_pathogens <- merged_data_pathogens %>% | ||
mutate(Molecule = if_else(grepl(" ", Drug) | grepl("-", Drug), "Multi-class", Molecule)) | ||
|
||
|
||
|
||
#FASTA sequences | ||
#Install and Load required packages | ||
if (!requireNamespace("rentrez", quietly = TRUE)) { | ||
install.packages("rentrez") | ||
} | ||
if (!requireNamespace("XML", quietly = TRUE)) { | ||
install.packages("XML") | ||
} | ||
if (!requireNamespace("stringr", quietly = TRUE)) { | ||
install.packages("stringr") | ||
} | ||
|
||
|
||
library(rentrez) | ||
library(XML) | ||
library(stringr) | ||
|
||
# Filter for the target drug (DAP) and pathogen (Staphylococcus aureus) | ||
# Define the filtering function | ||
filter_resistance_mechanisms <- function(data, drug, bug, exclude_multiclass = FALSE, species_restricted = TRUE) { | ||
|
||
# Filter by drug using partial match to include multiclass entries containing the target drug | ||
filtered_data <- data %>% | ||
filter(grepl(drug, Drug, ignore.case = TRUE)) | ||
|
||
# Filter by pathogen, using partial match | ||
filtered_data <- filtered_data %>% | ||
filter(grepl(bug, Pathogen_Full_Name, ignore.case = TRUE)) | ||
|
||
# Optionally exclude multiclass resistance if exclude_multiclass = TRUE | ||
if (exclude_multiclass) { | ||
filtered_data <- filtered_data %>% | ||
filter(!grepl(";", Drug)) # Only include entries with single drug classes | ||
} | ||
|
||
# Optionally restrict to species-specific mechanisms if species_restricted = TRUE | ||
if (species_restricted) { | ||
filtered_data <- filtered_data %>% | ||
filter(Pathogen_Full_Name == bug) # Include only entries with exact match to the bug of interest | ||
} | ||
|
||
return(filtered_data) | ||
} | ||
|
||
# Usage example for ESKAPE Pathogens resistant to DAP, including multispecies and multiclass resistance | ||
|
||
filtered_data_saurdap <- filter_resistance_mechanisms( | ||
data = merged_data_pathogens, | ||
drug = "DAP", | ||
bug = "Staphylococcus aureus", | ||
exclude_multiclass = FALSE, | ||
species_restricted = FALSE | ||
) | ||
|
||
|
||
|
||
filtered_data_enterobacter <- filter_resistance_mechanisms( | ||
data = merged_data_pathogens, | ||
drug = "DAP", | ||
bug = "Enterobacter spp", | ||
exclude_multiclass = FALSE, | ||
species_restricted = FALSE | ||
) | ||
|
||
|
||
|
||
filtered_data_klebsiella <- filter_resistance_mechanisms( | ||
data = merged_data_pathogens, | ||
drug = "DAP", | ||
bug = "Klebsiella pneumoniae", | ||
exclude_multiclass = FALSE, | ||
species_restricted = FALSE | ||
) | ||
|
||
filtered_data_acinetobacter <- filter_resistance_mechanisms( | ||
data = merged_data_pathogens, | ||
drug = "DAP", | ||
bug = "Acinetobacter baumannii", | ||
exclude_multiclass = FALSE, | ||
species_restricted = FALSE | ||
) | ||
|
||
filtered_data_pseudomonas <- filter_resistance_mechanisms( | ||
data = merged_data_pathogens, | ||
drug = "DAP", | ||
bug = "Pseudomonas aeruginosa", | ||
exclude_multiclass = FALSE, | ||
species_restricted = FALSE | ||
) | ||
|
||
filtered_data_enterococcus <- filter_resistance_mechanisms( | ||
data = merged_data_pathogens, | ||
drug = "DAP", | ||
bug = "Enterococcus faecalis", | ||
exclude_multiclass = FALSE, | ||
species_restricted = FALSE | ||
) | ||
|
||
# Combine results | ||
filtered_data_all_species <- bind_rows( | ||
filtered_data_enterobacter, | ||
filtered_data_saurdap, | ||
filtered_data_klebsiella, | ||
filtered_data_acinetobacter, | ||
filtered_data_pseudomonas, | ||
filtered_data_enterococcus | ||
) | ||
|
||
# View the combined data for all species | ||
View(filtered_data_all_species) | ||
|
||
# FASTA sequences | ||
# Install and Load required packages | ||
if (!requireNamespace("rentrez", quietly = TRUE)) { | ||
install.packages("rentrez") | ||
} | ||
if (!requireNamespace("XML", quietly = TRUE)) { | ||
install.packages("XML") | ||
} | ||
if (!requireNamespace("stringr", quietly = TRUE)) { | ||
install.packages("stringr") | ||
} | ||
|
||
library(rentrez) | ||
library(XML) | ||
library(stringr) | ||
|
||
# Fetch FASTA sequence from Entrez | ||
fetch_fasta_sequence <- function(Protein_Accession) { | ||
tryCatch({ | ||
# Fetch the FASTA sequence using Entrez | ||
fasta_seq <- rentrez::entrez_fetch(db = "protein", | ||
id = Protein_Accession, | ||
rettype = "fasta", | ||
retmode = "text") | ||
|
||
if (!is.null(fasta_seq)) { | ||
# Ensure the first line starts with ">" | ||
if (!grepl("^>", fasta_seq[1])) { | ||
fasta_seq[1] <- paste0(">", fasta_seq[1]) | ||
} | ||
|
||
# Split the sequence into lines | ||
lines <- str_split(fasta_seq, "\n")[[1]] | ||
|
||
# Join the lines back together | ||
fasta_seq <- paste(lines, collapse = "\n") | ||
|
||
return(fasta_seq) | ||
} else { | ||
warning(paste("Failed to retrieve FASTA sequence for protein accession:", Protein_Accession)) | ||
return(NULL) | ||
} | ||
}, error = function(e) { | ||
warning(paste("Error fetching FASTA sequence for protein accession:", Protein_Accession, ":", e$message)) | ||
return(NULL) | ||
}) | ||
} | ||
|
||
|
||
|
||
|
||
# Loop through filtered_data_all_species to fetch and save FASTA sequences | ||
combined_sequences <- character() | ||
|
||
for (i in 1:nrow(filtered_data_all_species)) { | ||
# Fetch FASTA sequence for each protein accession | ||
protein_accession <- filtered_data_all_species$`Protein_Accession`[i] | ||
fasta_sequence <- fetch_fasta_sequence(protein_accession) | ||
|
||
if (!is.null(fasta_sequence)) { | ||
combined_sequences <- c(combined_sequences, fasta_sequence) | ||
} | ||
} | ||
|
||
# Save the combined FASTA sequences | ||
filename <- "ESKAPE Pathogens resistant to DAP.fasta" | ||
|
||
writeLines(combined_sequences, filename) | ||
|
||
# Read the FASTA file | ||
fasta_content <- readLines(filename) | ||
|
||
# Display the contents | ||
cat(fasta_content, sep = "\n") | ||
|