From 218f37a7e7dbbb6ccbe749d20222e599df50b616 Mon Sep 17 00:00:00 2001 From: j0kaso Date: Fri, 17 Jun 2022 00:03:06 +0200 Subject: [PATCH] [prism_parser] add static files for sharing --- software/scripts/FillVariants.py | 139 +++ software/scripts/PrismData.py | 1407 ++++++++++++++++++++++++ software/scripts/struc_select_sifts.py | 1077 ++++++++++++++++++ 3 files changed, 2623 insertions(+) create mode 100644 software/scripts/FillVariants.py create mode 100755 software/scripts/PrismData.py create mode 100644 software/scripts/struc_select_sifts.py diff --git a/software/scripts/FillVariants.py b/software/scripts/FillVariants.py new file mode 100644 index 0000000..315f8f0 --- /dev/null +++ b/software/scripts/FillVariants.py @@ -0,0 +1,139 @@ +############################################################## +# +# Author: Johanna K. S. Tiemann +# +# Copyright: CC +# +# Last edited: 2020-11-12 +############################################################## + +""" +Function creating each possible variant with WT information + +Commands: +======= + + +Example: +======= + +python FillVariants.py prism_uniprot_XXX_uniprot_id.txt + + +""" + +# Standard library imports +from argparse import ArgumentParser +from datetime import datetime +import os + +# Third party imports +import pandas as pd + +# Local application imports +from PrismData import PrismParser, VariantData + + +def parse_args(): + """ + Argument parser function + """ + + parser = ArgumentParser( description="" ) + + # Initiating/setup command line arguments + parser.add_argument( 'file', + type=str, + help="Prism data file to process" + ) + parser.add_argument('--output_dir', '-o', + type=str, + default='', + help="Directory where files will be written. If not specified, the input directory will be used." + ) + parser.add_argument('--incl_del', '-i', + default=False, + type=lambda s: s.lower() in ['true', 't', 'yes', '1'], + help="generates values for ~ and * too. 
Default False" + ) + + args = parser.parse_args() + + # Generate and check saved outputs + if args.output_dir!='': + if not os.path.isdir(args.output_dir): + os.makedirs(args.output_dir) + else: + args.output_dir = os.path.dirname(os.path.abspath(args.file)) + if args.incl_del != False: + args.incl_del = True + return args + + +def copy_wt_variants(dataframe, incl_del=False): + #copy wt columns to all variants + + if 'aa_var' in dataframe.columns: + dataframe = dataframe.drop(columns=['aa_var']) + final_df = dataframe.copy() + final_df['variant'] = final_df['variant'].str.replace(r'=', 'A') + final_df['aa_var'] = ['A']*len(dataframe['variant']) + if incl_del: + resi_list = ['C','D','E', 'F','G','H', 'I','K','L','M','N','P','Q','R','S','T','V','W','Y', '~', '*'] + else: + resi_list = ['C','D','E', 'F','G','H', 'I','K','L','M','N','P','Q','R','S','T','V','W','Y'] + for variant in resi_list: + df = dataframe.copy() + df['variant'] = df['variant'].str.replace('=', variant) + df['aa_var'] = [variant]*len(dataframe['variant']) + final_df = pd.concat([final_df, df], axis=0, ignore_index=True) + final_df['aa_var'] = final_df['variant'].apply(lambda x: x[-1]) + final_df['aa_ref'] = final_df['variant'].apply(lambda x: x[0]) + final_df['res'] = final_df['variant'].apply(lambda x: int(x[1:-1])) + mask = final_df['aa_ref']==final_df['aa_var'] + final_df.loc[mask, 'aa_var'] = '=' + final_df['variant'] = final_df['aa_ref'] + final_df['res'].astype(str) + final_df['aa_var'] + final_df = final_df.sort_values(by=['res','aa_var']) + for elem in ['n_mut', 'aa_ref', 'resi', 'aa_var', 'res']: + if elem in final_df.columns: + final_df = final_df.drop(columns=elem) + final_df = final_df.drop_duplicates().reset_index(drop=True) + return(final_df) + + +def write_prism(metadata, dataframe, prism_file, comment=''): + variant_dataset = VariantData(metadata, dataframe) + parser = PrismParser() + parser.write(prism_file, variant_dataset, comment_lines=comment) + + +def read_from_prism(primsfile): + parser = PrismParser() + dataframe = parser.read(primsfile).dataframe + meta_data = parser.read_header(primsfile) + meta_data.pop('variants') + return meta_data, dataframe + + +def main(): + """ + Main function called as default at the end. + """ + # get user input arguments + args = parse_args() + + # read metadata and dataframe + meta_data, dataframe = read_from_prism(args.file) + + # create new dataframe + new_df = copy_wt_variants(dataframe, incl_del=args.incl_del) + + # write dataframe + file_name = os.path.basename(args.file) + new_prism_file = os.path.join(args.output_dir, f'{file_name[:-4]}_filled{file_name[-4:]}') + comment = [ f"{datetime.date(datetime.now())} - expanded variant data",] + write_prism(meta_data, new_df, new_prism_file, comment) + + +if __name__ == '__main__': + main() diff --git a/software/scripts/PrismData.py b/software/scripts/PrismData.py new file mode 100755 index 0000000..3e42da4 --- /dev/null +++ b/software/scripts/PrismData.py @@ -0,0 +1,1407 @@ +#!/storage1/shared/software/anaconda3/bin/python3 + +# Copyright (C) 2020 Kristoffer Enoe Johansson + +"""Module for data handling in the PRISM project + +This module implments classes for parsing (PrismParser) and handling +(PrismData derived classes) of data files. + +In general, calling PrismParser.read(filename) will return a data object of +the same derived class, e.g. a VariantData object. +See documentation of derived parser and data classes for help. + +See README.md for general file format definitions. 
+""" + +__version__ = 1.002 + +from Bio import Seq,SeqRecord,SeqIO,pairwise2,SubsMat +from Bio.SubsMat import MatrixInfo +import numpy as np +import pandas as pd +import yaml,csv,copy,time + + +class PrismFormatError(Exception): + """Exception raised for format errors in prism data files""" + +class PrismValueFail(Exception): + """Exception raised when a threshold was not met""" + + +class PrismParser: + """Read and write prism data files + + Examples + -------- + Parsing a data file + >>> parser = PrismData.PrismParser() + >>> data = parser.read("prism_mave/prism_mave_001_GFP.txt") + + Parsing the YAML file header only is much faster for large files + >>> meta_data = parser.read_header("prism_mave/prism_mave_001_GFP.txt") + """ + + def read(self, filename, verbose=0): + """Read data and meta data from a PRISM data file + + Read the meta-data section (via read_header) and data section (via Pandas.read_csv) and + check format adherence and the consistancy between the two. + + Parameters + ---------- + filename : string + Name and path of file to read + verbose : int, optional + Level of output + + Returns + ------- + PrismData derived instance + """ + + if verbose > 0: + start_time = time.process_time() + + # Read meta-data and check for necessary fields + metadata = self.read_header(filename, verbose=verbose) + + # Read data section + dataframe = pd.read_csv(filename, delim_whitespace=True, comment='#', header=0, + keep_default_na=True, na_values=['Na','na']) + + # Determine type of data + data = None + if dataframe.keys()[0] == "variant": + data = VariantData(metadata, dataframe) + # elif dataframe.keys()[0] == "resi": + # data = ResidueData(metadata, dataframe) + # checks ... + else: + raise PrismFormatError("Could not determine data type of file %s from first column name %s" % (filename, dataframe.columns[0])) + + data.check(verbose=verbose) + + if verbose > 0: + elapsed_time = time.process_time() - start_time + print("Total time of parsing %s %.1f sec" % (filename,elapsed_time)) + + return(data) + + def write(self, filename, data, comment_lines=None, no_overwrite_metadata=False, verbose=0): + """Write a PrismData object to a file + + Will check the consistency of variants and cloumns meta-data fields. + + Parameters + ---------- + filename : string + Name and path of file to write + data : VariantData + Data set to write + comment_lines : list of strings + Each list element is printed as a comment line between header and data sections + no_overwrite_metadata : bool + Do not allow the check function to overwrite meta data. Except instead + """ + + # Replace whitespace in column names if present + has_whitespace = [cn.find(" ") > -.5 for cn in data.metadata['columns'].keys()] + if any(has_whitespace): + if verbose > 0: + print("White space in columns names replaced with _ (they are white-space separated)") + # Replace in data frame + data.dataframe.columns = [cn.strip().replace(" ","_") for cn in data.dataframe.columns] + # Replace in meta data + data.metadata["columns"] = {k.strip().replace(" ","_"):v for (k,v) in data.metadata['columns'].items()} + + # Check data before printing + data.check(no_overwrite_metadata=no_overwrite_metadata, verbose=verbose) + + # if present, remove the index columns (aa_ref, etc.) 
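+        # (they are derived from the 'variant' column and rebuilt by check()
+        # on the next read, so writing them would only duplicate information)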
+ write_cols = list(data.dataframe.columns) + for col in data.index_column_names: + if col in write_cols: + write_cols.remove(col) + + # Write a file + with open(filename, "w") as fh: + fh.write(data.header_to_string(skip_fields=['filename'])) + if comment_lines: + for line in comment_lines: + fh.write("# "+line+"\n") + fh.write("#\n") + # data.dataframe.to_csv(fh, float_format='%8.4f', sep=' ', header=True, index=False, columns=write_cols, + data.dataframe.to_csv(fh, sep=' ', header=True, index=False, columns=write_cols, + quoting=csv.QUOTE_NONE, escapechar=' ', na_rep=" NA") + + def is_aa_one_nat(self, sequence, additional=""): + """Test if a sequence consists of single-letter natural amino acids only (case-insensitive) + + Parameters + ---------- + sequence : string + Amino acid sequence + additional : string, optional + Additional characters to allow, case insensitive + """ + for aa in sequence.upper(): + if not (aa in "ACDEFGHIKLMNPQRSTVWY" or aa in additional.upper()): + return(False) + return(True) + + def read_header(self, filename, verbose=0): + """Read the YAML header of a data file and run check_header + + Parameters + ---------- + filename : string + Path and name of the file to read + verbose : int, optional + Level of output + + Returns + ------- + dict + The meta data as a dictonary of one or more levels of dictionaries + """ + header_begin = False + header_str = "" + with open(filename, "r") as file_handle: + for line in file_handle: + if line.replace(' ','')[0:4] == '#---': + if header_begin: + header = yaml.safe_load(header_str) + header['filename'] = filename.split('/')[-1] + self.check_header(header) + return(header) + else: + header_begin = True + continue + if header_begin: + if not line.strip()[0] == "#": + raise PrismFormatError("Missing comment char inside YAML header") + s = line.replace('#','',1) + header_str += s + if header_begin: + raise PrismFormatError("Header never ended") + else: + raise PrismFormatError("No header found") + + def check_header(self, header): + """Check a header for fields required by all data files""" + if not 'version' in header.keys(): + raise PrismFormatError("Header has no \'version\' field") + if not 'protein' in header.keys(): + raise PrismFormatError("Header has no \'protein\' field") + if not 'name' in header['protein'].keys(): + raise PrismFormatError("Header has no \'protein: name\' field") + if not 'sequence' in header['protein'].keys(): + raise PrismFormatError("Header has no \'protein: sequence\' field") + if not 'uniprot' in header['protein'].keys(): + raise PrismFormatError("Header has no \'protein: uniprot\' field") + if 'first_residue_number' in header['protein'].keys(): + if int(header['protein']['first_residue_number']) < 0: + raise PrismFormatError("First residue number must be non-negative") + if not 'columns' in header.keys(): + raise PrismFormatError("Header has no \'columns\' field") + if 'filename' in header.keys(): + # Upon parsing, filename sould be added to header and the second field (split by '_') should give the data type + data_type = header['filename'].split('_')[1] + if not data_type.lower() in [hk.lower() for hk in header.keys()]: + print("WARNING: Header has no \'%s\' field but filename indicates this data type" % (data_type)) + # raise PrismFormatError("Header has no \'%s\' field but filename indicates this data type" % (data_type)) + + def write_meta_file(self, data_list, filename, output_type="csv"): + """Write a overview file of meta-data (headers) from a list of data sets + + Parameters + ---------- 
+ data_list : list of PrismData-derived objects or meta data dictionaries + The data to output + filename : string + Name and path of output file + output_type : string, optional + One of the following + -csv: Excel-readable CSV overview of all meta-data + -fasta: Header sequences in FASTA format + """ + header_list = [data.metadata if isinstance(data,PrismData) else data for data in data_list] + if output_type == "csv": + self.__dump_header_csv(filename, header_list) + elif output_type == "fasta": + self.__dump_fasta(filename, header_list) + else: + raise ValueError("write_meta_file argument output_type must be \'csv\' or \'fasta\'") + + def __merge_header_fields(self, header_list, mode="union"): + """Determine a common set of header fields""" + + # Recursive grap of keys - because we can-can + def update_keys(key_dic, dic): + """Update key_dic with keys from dic recursively""" + for key in dic.keys(): + if not key in key_dic.keys(): + key_dic[key] = {} + if type(dic[key]) == dict: + update_keys(key_dic[key],dic[key]) + + # Read all header keys + common_fields = {} + for header in header_list: + update_keys(common_fields, header) + # Return according to mode + if mode.lower()=="union": + return(common_fields) + elif mode.lower()=="intersect": + raise NotImplementedError("__merge_header_fields mode==intersect not implemented") + else: + raise ValueError("Header merge mode must be \'union\' or \'intersect\'") + + def __dump_header_csv(self, filename, header_list): + common_header_keys = self.__merge_header_fields(header_list, mode="union") + + # Columns fields have different names so rename to column01, column02, etc + ncol_list = [len(header['columns'].keys()) for header in header_list] + new_col_names = ["column%02d" % (c+1) for c in range(max(ncol_list))] + common_header_keys['columns'] = dict(zip(new_col_names, [{}]*len(new_col_names))) + + with open(filename, 'w', newline='') as csvfile: + csv_dialect = csv.excel() + csv_dialect.delimiter = ";" + writer = csv.writer(csvfile, dialect=csv_dialect) + # First layer of keys + row = [] + for key in common_header_keys: + row += [key]+['']*np.max([0,len(common_header_keys[key])-1]) + writer.writerow(row) + # Second layer of keys + row = [] + for key in common_header_keys: + if len(common_header_keys[key]) == 0: + row += [key] + else: + row += [kkey for kkey in common_header_keys[key].keys()] + writer.writerow(row) + # Fill in values for each header + for header in header_list: + row = [] + for key in common_header_keys.keys(): + if not key in header: + row += ['']*np.max([1,len(common_header_keys[key])]) + elif len(common_header_keys[key].keys()) == 0: + row += [header[key]] + elif key == 'columns': + for keyi in range(len(common_header_keys[key])): + key_list = list(header['columns'].keys()) + value_list = list(header['columns'].values()) + if keyi < len(header['columns']): + row += [key_list[keyi]+": "+value_list[keyi]] + else: + row += [''] + else: + for kkey in common_header_keys[key].keys(): + if kkey in header[key].keys(): + row += [header[key][kkey]] + else: + row += [''] + writer.writerow(row) + + def __dump_fasta(self, filename, header_list): + sequences = [] + for header in header_list: + file_id = header['filename'].rsplit(".",1)[0] + "_v" + str(header['version']) + seq = Seq.Seq(header['protein']['sequence']) + sequences.append(SeqRecord.SeqRecord(seq, id=file_id, description="")) + with open(filename, "w", newline='') as file_handle: + count = SeqIO.write(sequences, file_handle, "fasta") + return(count) + +class PrismData: + 
"""Base class of PRISM data types + + This class is thought as abstract and should not be instantiated. See documentation of derived classes. + """ + + def __init__(self, metadata, dataframe): + """Constructor + + See documentation of derived classes + """ + self.metadata = metadata + self.dataframe = dataframe + self.index_column_names = None + + def __repr__(self): + """Representation - used e.g. for interactive printing""" + return(self.header_to_string() + str(self.dataframe)) + + def __str__(self): + """Cast to string - used e.g. by print""" + return("%s object of protein %s (uniprot %s) with dataframe shape %s" % + (type(self), self.metadata['protein']['name'], self.metadata['protein']['uniprot'], str(self.dataframe.shape))) + + def copy(self): + """Make a deep copy of self""" + new_meta = copy.deepcopy(self.metadata) + new_dataframe = self.dataframe.copy(deep=True) + new_obj = PrismData(new_meta, new_dataframe) + new_obj.__class__ = self.__class__ + return(new_obj) + + def seq_from_data(self): + """Builds a sequence based on aa_ref and resi data columns + + Data is assumed to have index columns resi and aa_ref + """ + + if not 'aa_ref' in self.dataframe.columns: + self.add_index_columns() + + n_res = 0 + seq = np.full(100,'X') + for (aa_list,resi_list) in zip(self.dataframe['aa_ref'],self.dataframe['resi']): + for (aa,resi) in zip(aa_list,resi_list): + n_res = max(n_res,resi) + if n_res > len(seq): + seq = np.concatenate([seq,np.full(max(100,n_res-len(seq)),'X')]) + if seq[resi-1] == 'X': + seq[resi-1] = aa + elif seq[resi-1] != aa: + raise PrismFormatError("Data reference amino acid mismatch at position %d: % and %s" % + (resi, seq[resi-1], aa)) + return("".join(seq[:n_res])) + + def check_header_and_data_sequence(self, verbose=0): + """Check reference sequence from header against the data""" + data_seq = self.seq_from_data() + meta_seq = self.metadata['protein']['sequence'] + first_resn = int(self.metadata['protein']['first_residue_number']) if 'first_residue_number' in self.metadata['protein'].keys() else 1 + meta_seq = 'X'*(first_resn-1) + meta_seq # For checking purposes, matching indices makes things easier + + # Check sequence length + if len(data_seq) < len(meta_seq): + # Add X for unobserved positions + data_seq = data_seq + 'X'*(len(meta_seq)-len(data_seq)) + elif (len(data_seq) != len(meta_seq)): + raise PrismFormatError("Data has more residues than header sequence") + + # Check all substitution amino acids against header sequence + match = [aa1==aa2 if aa1!='X' and aa2!='X' else True for (aa1,aa2) in zip(meta_seq,data_seq)] + if not np.all(match): + mismatch = np.where(np.logical_not(match))[0] + 1 + raise PrismFormatError("Mismatch between header sequence and data sequence at positions %s:\n%s\n%s" % + (str(mismatch), meta_seq,data_seq)) + if (verbose > 0): + print("Data and header amino acid sequence match with %d of %d positions not observed in data" % (data_seq.upper().count('X'),len(meta_seq))) + return(data_seq) + + def header_to_string(self, skip_fields=[]): + """Format meta-data into a string""" + # Recursive function to write the next level of a dictionary + def dump_subfields(dic, indent_level): + return_str = "" + dict_keys = list(dic.keys()) + # For the top level, order fields (dict is not ordered) + if indent_level == 0: + dict_keys_first = list() + dict_keys_last = list() + for k in ['version','protein']: + if k in dict_keys: + dict_keys.remove(k) + dict_keys_first.append(k) + for k in ['variants','columns','filename']: + if k in dict_keys: + 
dict_keys.remove(k) + dict_keys_last.append(k) + dict_keys = dict_keys_first + dict_keys + dict_keys_last + # Make new indent and run recursively or dump values + for key in dict_keys: + if key in skip_fields: + continue + if isinstance(dic[key], dict): + return_str += "# " + " "*indent_level + key + ":\n" + return_str += dump_subfields(dic[key], indent_level+1) + else: + # format floats values and use standard cast for others + s = "%.3f" % (dic[key]) if isinstance(dic[key],float) else str(str(dic[key])) + return_str += "# " + " "*indent_level + key + ": " + s + "\n" + return(return_str) + + return_str = "" + return_str += "# --------------------\n" + return_str += dump_subfields(self.metadata, 0) + return_str += "# --------------------\n" + return_str += "#\n" + return(return_str) + + def check_column_names(self, verbose=0): + # Check column names - case sensitive + meta_colnames = list(self.metadata['columns'].keys()) + ['n_mut','aa_ref','resi','aa_var'] + data_colnames = self.dataframe.keys()[1:] # first column is data specific and not in header + for cn in data_colnames: + if not cn in meta_colnames: + raise PrismFormatError("Could not find column name \'%s\' in header" % (cn)) + for cn in meta_colnames: + if not cn in data_colnames: + raise PrismFormatError("Could not find header column name \'%s\' in data" % (cn)) + if verbose > 0: + print("Column names checks ok: %s" % (",".join(self.dataframe.keys()))) + + +class VariantData(PrismData): + """Container class for protein variant data + + Parameters + ---------- + metadata : dict + Metadata for data set + dataframe : pandas.DataFrame + Data + + Examples + -------- + Get meta-data dictionary + >>> meta_data = VariantData.metadata + + Get sequence from header + >>> sequence = meta_data['protein']['sequence'] + + Get pandas.DataFrame + >>> data_frame = VariantData.dataframe + + Get all double mutants + >>> data_frame[data_frame['n_mut']==2] + + Get all variants with substitutions into Ala + >>> data_frame = VariantData.get_var_into_aa('A') + + Get all variants involving positions 30-45 (both included) + >>> data_frame = VariantData.get_var_at_pos(range(30,45+1)) + + Get a complete site-saturation (single mutant) library + >>> data_frame = VariantData.saturate_variants() + + Merge three VariantData files and retrieve the union of variants (merge=outer) + >>> merged_data = data1.merge([data2, data3], merge="outer") + """ + # To be implmented: + # Get all (multi-mutant) variants that involves D83V and L11P + # >>> subst2var = VariantData.make_subst2var_map() + # >>> data_frame = VariantData.dataframe[subst2var['D83V'] and subst2var['L11P']] + + def add_index_columns(self): + var_split = [var.split(":") if var!="WT" else [] for var in self.dataframe['variant']] + resi_ll = [[int(sub[1:-1]) for sub in var] if len(var)>0 else [] for var in var_split] + aa_ref_ll = [[mut[0] for mut in var] if len(var)>0 else [] for var in var_split] + aa_mut_ll = [[mut[-1] for mut in var] if len(var)>0 else [] for var in var_split] + self.dataframe['n_mut'] = [len(var) for var in var_split] + self.dataframe['aa_ref'] = aa_ref_ll + self.dataframe['resi'] = resi_ll + self.dataframe['aa_var'] = aa_mut_ll + self.index_column_names = ['n_mut','aa_ref','resi','aa_var'] + + def remove_index_columns(self): + """Remove the indexing columns, i.e. 
['n_mut','aa_ref','resi','aa_var']""" + if 'aa_ref' in self.dataframe.columns: + self.dataframe.drop(['n_mut','aa_ref','resi','aa_var'], axis=1, inplace=True) + self.index_column_names = None + + def get_var_into_aa(self, aa, multimutant_mode='any'): + """Get a data frame of variants with substitutions into a given amino acid + + Parameters + ---------- + aa : character + Amino acid in one-letter code + multimutant_mode : string + Mode of action for multi-mutants + - any : At least one substitution per variant into aa required + - all : All substitutions in variants must be into aa + - exclude : Only return single mutants + """ + # Check argument + if not multimutant_mode in ['any','all','exclude']: + raise ValueError("Function get_var_into_aa argument multimutant_mode must be \'any\', \'all\' or \'exclude\'") + # Make a series of boleans to select rows + if multimutant_mode=='any': + bool_mask = [aa in aa_list for aa_list in self.dataframe['aa_var']] + elif multimutant_mode=='all': + bool_mask = [np.all(np.array(aa_list)==aa) if len(aa_list)>0 else False for aa_list in self.dataframe['aa_var']] + elif multimutant_mode=='exclude': + bool_mask = [aa_list==[aa] for aa_list in self.dataframe['aa_var']] + else: + assert False + return(self.dataframe[bool_mask]) + + def get_var_from_aa(self, aa, multimutant_mode='any'): + # Exclude synonymous and nonsense? + """Get a data frame of variants with substitutions from a given amino acid + + Parameters + ---------- + aa : character + Amino acid in one-letter code + multimutant_mode : string + Mode of action for multi-mutants + - any : At least one substitution per variant from aa required + - all : All substitutions in variants must be from aa + - exclude : Only return single mutants + """ + # Check argument + if not multimutant_mode in ['any','all','exclude']: + raise ValueError("Function get_var_from_aa argument multimutant_mode must be \'any\', \'all\' or \'exclude\'") + # Make a series of boleans to select rows + if multimutant_mode=='any': + bool_mask = [aa in aa_list for aa_list in self.dataframe['aa_ref']] + elif multimutant_mode=='all': + bool_mask = [np.all(np.array(aa_list)==aa) if len(aa_list)>0 else False for aa_list in self.dataframe['aa_ref']] + elif multimutant_mode=='exclude': + bool_mask = [aa_list==[aa] for aa_list in self.dataframe['aa_ref']] + else: + assert False + return(self.dataframe[bool_mask]) + + def get_var_at_pos(self, target_resi, mode='any'): + # Exclude synonymous and nonsense? 
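+        # Illustrative calls (assuming `data` is a parsed VariantData instance):
+        #   data.get_var_at_pos(range(30, 46))          -> variants touching any of positions 30-45
+        #   data.get_var_at_pos([30, 31], mode='exact')  -> variants mutated at exactly positions 30 and 31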
+ """Get a data frame of variants with substitutions in one or more positions + + Parameters + ---------- + target_resi : int or iterable of int + Residue number as given in variants + mode : string + Mode of action for multi-mutants + - any : At least one substitution in one of the given position + - all : Substitutions at all given position + - exact : Substitutions at all given position and no others + """ + # Check argument mode + if not mode in ['any','all','exact']: + raise ValueError("Function get_var_from_aa argument mode must be \'any\', \'all\' or \'exact\'") + # Check argument target_resi + if isinstance(target_resi, int): + target_resi = [target_resi] + elif not isinstance(target_resi, list): + target_resi = list(target_resi) + target_resi = list(np.sort(target_resi)) + if mode == 'any': + bool_mask = [np.any([ti in resi_list for ti in target_resi]) if len(resi_list)>0 else False for resi_list in self.dataframe['resi']] + elif mode == 'all': + bool_mask = [np.all([ti in resi_list for ti in target_resi]) if len(resi_list)>0 else False for resi_list in self.dataframe['resi']] + elif mode=='exact': + bool_mask = [resi_list==target_resi for resi_list in self.dataframe['resi']] + else: + assert False + return(self.dataframe[bool_mask]) + + def get_subst(self, subst, single_only=True): + """Get a data frame of variants that all contain a given substitution + + Parameters + ---------- + subst : string + Substitution, e.g. M1A + single_only : bool + Only return single mutants. If false, all multi-mutants with the given substitution will be returned + """ + bool_mask = [subst in var.split(':') if var!='WT' else False for var in self.dataframe['variant']] + # Add requirement on number of mutations if requested + if single_only: + bool_mask = [a and b for a,b in zip(bool_mask,self.dataframe['n_mut']==1)] + # Return data frame slice + return(self.dataframe[bool_mask]) + + def order_variants(self, aa_order="CDEKRHNQAGSTVMLIFYWP=*~"): + """Order variant according to number of substitutions, residue number, and a specified order of + amino acids. Non-specified amino acids are keps in original order. Inserts are not supported. + """ + n_aa = len(aa_order) + if len(set(aa_order)) != n_aa: + raise ValueError("Argument aa_order to VariantData.order_variants must be unique characters") + aa_dict = dict(zip(aa_order,range(n_aa))) + aa_dict['WT'] = -1 + aa_list = [l[0] if len(l)>0 else 'WT' for l in self.dataframe['aa_var']] + self.dataframe['aa0'] = [aa_dict[aa] if aa in aa_dict.keys() else n_aa for aa in aa_list] + self.dataframe['resi0'] = [l[0] if len(l)>0 else 0 for l in self.dataframe['resi']] + self.dataframe.sort_values(by=['n_mut','resi0','aa0'], inplace=True) + self.dataframe.index = range(self.dataframe.shape[0]) + self.dataframe.drop(columns=['aa0','resi0'], inplace=True) + + def saturate_variants(self, aa_order="CDEKRHNQAGSTVMLIFYWP", verbose=0): + """Saturate variant list to make a full site-saturation library + + Order single mutants according to position and target amino acid and add NA + rows for missing single variants. Inserts are not supported. 
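+
+        A minimal usage sketch (assuming `data` is a parsed VariantData object):
+
+        >>> data.saturate_variants()
+        >>> data.dataframe  # 20 rows per position of the header sequence, NA where unmeasured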
+ + Parameters + ---------- + aa_order : string + Which amino acids to include and the order, default CDEKRHNQAGSTVMLIFYWP + verbose : int, optional + Level of output + """ + n_aa = len(aa_order) + if len(set(aa_order)) != n_aa: + raise ValueError("Argument aa_order to VariantData.saturate_variants must be unique characters") + seq = self.metadata['protein']['sequence'] + first_resn = int(self.metadata['protein']['first_residue_number']) if 'first_residue_number' in self.metadata['protein'].keys() else 1 + var_list = [seq[resi]+str(resi+first_resn)+var if seq[resi] != var else seq[resi]+str(resi+first_resn)+"=" for resi in range(len(seq)) for var in aa_order] + if verbose > 0: + remove_set = set(self.dataframe['variant'])-set(var_list) + if len(remove_set) > 0: + if len(remove_set) > 500: + rm_list = list(remove_set) + print("Saturate variants will remove the following %d variants:\n%s\n... [skip %d variants] ...\n%s" % + (len(rm_list), ",".join([str(i) for i in rm_list[0:25]]), len(rm_list)-50, ",".join([str(i) for i in rm_list[-25:]]))) + else: + print("Saturate variants will remove the following %d variants:\n%s" % (len(remove_set), ",".join([str(i) for i in remove_set]))) + new_dataframe = pd.DataFrame({'variant':var_list}) + new_dataframe = new_dataframe.merge(self.dataframe, on='variant', how='left', copy=False) + self.dataframe = new_dataframe + self.add_index_columns() + self.metadata['variants'] = self.calc_variants_metadata(verbose=verbose) + + def __missing_mask(self, verbose=0): + """Private helper function for clean_na_variants and others + + Returns a bool array that marks rows with NA only. Should be fast.""" + # pd.DataFrame.iterrows() is not vectorized and very slow, DataFrame.apply is not too good either + # return([np.all(row[1]) for row in self.dataframe.drop(columns=['variant','n_mut','aa_ref','resi','aa_var']).isnull().iterrows()]) + if 'aa_ref' in self.dataframe.columns: + return(np.all(self.dataframe.drop(columns=['variant','n_mut','aa_ref','resi','aa_var']).isnull(), axis=1)) + else: + return(np.all(self.dataframe.drop(columns='variant').isnull(), axis=1)) + + def clean_na_variants(self, verbose=0): + """Remove variants rows with all data missing""" + missing_mask = self.__missing_mask() + i = np.where(missing_mask)[0] + if len(i) > 0: + self.dataframe.drop(self.dataframe.index[i], axis=0, inplace=True) + self.dataframe.index = range(self.dataframe.shape[0]) + if verbose > 0: + print("Removed %d variant rows with all data missing" % (len(i))) + + def check_variants_metadata(self, no_overwrite=False, verbose=0): + recalc_variants = self.calc_variants_metadata(verbose=verbose) + if 'variants' in self.metadata: + # Number of variants + if 'number' in self.metadata['variants'].keys(): + if recalc_variants['number'] != self.metadata['variants']['number']: + s = "Header field \'variants:number\' is %d but %d variants (possible including WT) were found" % \ + (self.metadata['variants']['number'], recalc_variants['number']) + if no_overwrite: + raise PrismFormatError(s) + elif verbose > 0: + print("WARNING: "+s) + elif verbose>0 and not no_overwrite: + print("Will add variants:number: %d to header" % (recalc_variants['number'])) + + # Coverage + if 'coverage' in self.metadata['variants'].keys(): + if np.abs(recalc_variants['coverage'] - self.metadata['variants']['coverage']) > 0.01: + s = "Header field \'variants:coverage\' is %.2f but variants calculated to cover %.2f" % \ + (self.metadata['variants']['coverage'], recalc_variants['coverage']) + if no_overwrite: + raise 
PrismFormatError(s) + elif verbose > 0: + print("WARNING: "+s) + elif verbose>0 and not no_overwrite: + print("Will add variants:coverage: %.2f to header" % (recalc_variants['coverage'])) + + # Depth + if 'depth' in self.metadata['variants'].keys(): + if np.abs(recalc_variants['depth'] - self.metadata['variants']['depth']) > 0.01: + s = "Header field \'variants:depth\' is %.2f but variants calculated to depth %.2f" % \ + (self.metadata['variants']['depth'], recalc_variants['depth']) + if no_overwrite: + raise PrismFormatError(s) + elif verbose > 0: + print("WARNING: "+s) + elif verbose>0 and not no_overwrite: + print("Will add variants:depth: %.2f to header" % (recalc_variants['depth'])) + + # Helper function + def strip_all(s): + return(s.lower().replace('.', '').replace('-','').replace('+','').replace('/','').replace(' ','')) + + # Variant width + if 'width' in self.metadata['variants'].keys(): + if recalc_variants['width'] == "single mutants": + if not strip_all(self.metadata['variants']['width']) in ['singlemutants','singlemutant','singlemut','single','singleonly']: + s = "Header field \'variants:width\' is \'%s\' but only single mutants were found - use \'single mutants\'" % \ + (self.metadata['variants']['width']) + if no_overwrite: + raise PrismFormatError(s) + elif verbose > 0: + print("WARNING: "+s) + elif recalc_variants['width'] == "multi mutants": + if not strip_all(self.metadata['variants']['width']) in ['multimutants','multimutant','multimut','multi']: + # multimut = [var_list[i] for i in np.where(n_mut > 2)[0]] + s = "Header field \'variants:width\' is \'%s\' but the following multi-mutant were found - use \'multi mutants\'" % \ + (self.metadata['variants']['width']) + if no_overwrite: + raise PrismFormatError(s) + elif verbose > 0: + print("WARNING: "+s) + elif recalc_variants['width'] == "single and double mutants": + if not strip_all(self.metadata['variants']['width']) in ['singleanddoublemutants','singleanddoublemutant','singleanddouble']: + s = "Header field \'variants:width\' is \'%s\' but variants are single and double mutants - use \'single and double mutants\'" % \ + (self.metadata['variants']['width']) + if no_overwrite: + raise PrismFormatError(s) + elif verbose > 0: + print("WARNING: "+s) + else: + raise ValueError("Recalculated variants width has an unknown value \'%s\'" % (recalc_variants['width'])) + elif verbose>0 and not no_overwrite: + print("Will add variants:width: %s to header" % (recalc_variants['width'])) + else: + if no_overwrite: + raise PrismFormatError("No variants field in meta data") + + if not no_overwrite: + self.metadata['variants'] = recalc_variants + + def calc_variants_metadata(self, verbose=0, return_missing_mask=False): + ret = {} + + if verbose > 1: + print("===============================================================") + print("==== Calculate variants meta-data fields with timing ====") + start_time = time.process_time() + print("Calc number of variants (incl. missing_mask) ...", end=" ", flush=True) + + # Number of variants + missing_mask = self.__missing_mask(verbose=verbose) + ret['number'] = len(self.dataframe['variant']) - np.sum(missing_mask) + + if verbose > 1: + elapsed_time = time.process_time() - start_time + print("done. 
Elapsed time %.1f sec" % (elapsed_time)) + start_time = time.process_time() + print("Calc coverage and depth of variants ...", end=" ", flush=True) + + # Coverage and depth + # Positions with substitutions + non_missing_indices = np.where([not b for b in missing_mask])[0] + flat_resi_list = [resi for resi_list in self.dataframe['resi'][non_missing_indices] for resi in resi_list] + (resi_list,resi_counts) = np.unique(flat_resi_list, return_counts=True) + + # Coverage is positions with substitutions per residue in reference sequence + ret['coverage'] = len(resi_list)/len(self.metadata['protein']['sequence']) + # Depth is the average number of substitutions per position with substitutions + ret['depth'] = np.mean(resi_counts) + + if verbose > 1: + elapsed_time = time.process_time() - start_time + print("done. Elapsed time %.1f sec" % (elapsed_time)) + start_time = time.process_time() + print("Calc width of variants ...", end=" ", flush=True) + + # Width + n_mut = self.dataframe['n_mut'] + if np.all(n_mut <= 1): + ret['width'] = "single mutants" + elif np.max(n_mut) > 2: + ret['width'] = "multi mutants" + else: + ret['width'] = "single and double mutants" + + if verbose > 1: + elapsed_time = time.process_time() - start_time + print("done. Elapsed time %.1f sec" % (elapsed_time)) + print("===============================================================") + + if return_missing_mask: + return((ret,missing_mask)) + else: + return(ret) + + def check(self, no_overwrite_metadata=False, verbose=0): + """Perform all checks of data and meta data + + Checks formatting of all variants in the first column and sorts multimutants + and that the list is unique. Adds index columns. + + Consistancy checks of data and meta data includes column names, reference amino + acid sequence and all variants sub fields. + + Parameters + ---------- + no_overwrite_metadata : bool + Allow overwriting of the variants meta-data fields + verbose : int, optional + Level of output + """ + # Check and update variant column + if verbose > 1: + print("===============================================================") + print("==== Check data section variant list with timing ====") + start_time = time.process_time() + print("Make var_list ...", end=" ", flush=True) + + var_list = [s.upper() for s in self.dataframe['variant']] + + # At least 2 characters for 'WT' + if np.any([len(var) < 2 for var in var_list]): + il = np.where([len(var) for var in var_list] < 2)[0] + raise PrismFormatError("Bad variants: %s" % (str([var_list[i] for i in il]))) + + if verbose > 1: + elapsed_time = time.process_time() - start_time + print("done. Elapsed time %.1f sec" % (elapsed_time)) + start_time = time.process_time() + print("Sort variants ...", end=" ", flush=True) + + # Sort multi-mutants according to residue number + var_split_unordered = [var.split(":") if var!="WT" else [] for var in var_list] + resi_ll_unordered = [[int(subst[1:-1]) for subst in var] if len(var)>0 else [] for var in var_split_unordered] + + # Make a new variant list with ordered multi-mutants + var_split = [] + for ivar in range(len(var_split_unordered)): + if len(set(resi_ll_unordered[ivar])) < len(resi_ll_unordered[ivar]): + raise PrismFormatError("Variant %d %s has more substitutions on the same position" % (ivar+1, var_list[ivar])) + var_split.append( [var_split_unordered[ivar][i] for i in np.array(resi_ll_unordered[ivar]).argsort()] ) + + if verbose > 1: + elapsed_time = time.process_time() - start_time + print("done. 
Elapsed time %.1f sec" % (elapsed_time)) + start_time = time.process_time() + print("Split all substitutions ...", end=" ", flush=True) + + # Split substitutions in amino acids and residue number + resi_ll = [[int(sub[1:-1]) for sub in var] if len(var)>0 else [] for var in var_split] + aa_ref_ll = [[mut[0] for mut in var] if len(var)>0 else [] for var in var_split] + aa_mut_ll = [[mut[-1] for mut in var] if len(var)>0 else [] for var in var_split] + + if verbose > 1: + elapsed_time = time.process_time() - start_time + print("done. Elapsed time %.1f sec" % (elapsed_time)) + start_time = time.process_time() + print("Make new_var_list ...", end=" ", flush=True) + + # Check all substitutions and make new variant list + new_var_list = [] + for ivar in range(len(resi_ll)): + subst_list = [] + for isub in range(len(resi_ll[ivar])): + aa1 = aa_ref_ll[ivar][isub] + resi = resi_ll[ivar][isub] + aa2 = aa_mut_ll[ivar][isub] + if not PrismParser.is_aa_one_nat(None, aa2, additional="*=~"): + raise PrismFormatError("Substitution %s%d%s is not to a standard amino acid" % (aa1,resi,aa2)) + if aa1 == aa2: + if verbose > 0: + print("Synonymous substitution %s%d%s renamed to %s%d=" % (aa1,resi,aa2,aa1,resi)) + aa_mut_ll[ivar][isub] = '=' + subst_list.append("%s%d%s" % (aa_ref_ll[ivar][isub], resi_ll[ivar][isub], aa_mut_ll[ivar][isub])) + new_var_list.append(":".join(subst_list) if len(subst_list)>0 else "WT") + + # Check if variants are unique + (var_unique,var_counts) = np.unique(new_var_list, return_counts=True) + if np.any(var_counts > 1): + raise PrismFormatError("The following variants are listed multiple times:\n%s" % (var_unique[np.where(var_counts > 1)[0]])) + + # Update data frame + self.dataframe['variant'] = new_var_list + + # Update index columns now that we have them + self.dataframe['n_mut'] = [len(var) for var in var_split] + self.dataframe['aa_ref'] = aa_ref_ll + self.dataframe['resi'] = resi_ll + self.dataframe['aa_var'] = aa_mut_ll + self.index_column_names = ['n_mut','aa_ref','resi','aa_var'] + + if verbose > 1: + elapsed_time = time.process_time() - start_time + print("done. Elapsed time %.1f sec" % (elapsed_time)) + print("===============================================================") + + self.check_column_names(verbose=verbose) + self.check_header_and_data_sequence(verbose=verbose) + self.check_variants_metadata(no_overwrite=no_overwrite_metadata, verbose=verbose) + + def to_new_reference(self, target_seq=None, first_resn=None, verbose=0, + min_identity=.8, min_coverage=.1, mismatch="remove", allow_inserts=True, allow_deletions=True): + """Map variants from data file onto a new reference sequence + + This is the workhorse of data set merging + + Parameters + ---------- + target_seq : string + Amino acid sequence that variants are aligned to + first_resn: integer >= 0 + If None, the header value is used if present, else one. If given the header value is replaced + min_identity : float in [0.0, 1.0], optional + Raise exception if the sequence identity is less than this (in covered region) + min_coverage : float in [0.0, 1.0], optional + Raise exception if the fraction of matched positions (i.e. 
not indel/gap) is less than this + This is a symmetric kind of coverage: (N_res-N_indel)/N_res where N_res is the lengths of the + longest sequence + mismatch : string, optional + How to handle mismatches in the alignment + -convert: Change reference amino acid to match the new reference + -remove: Remove variants that involve mismatch positions + -not_allowed: Raise exception is a mismatch is incountered + allow_inserts : bool, optional + How to handle target sequence position missing in the sequence of the data + If true, the residue numbering will be ajusted accordingly + allow_deletions : bool, optional + How to handle data sequence positions missing in the target sequence + If true, the residue numbering will be ajusted accordingly + verbose : int, optional + Level of output + """ + + # Determine new sequence offset and how much to shift residue numbering + first_resn_meta = int(self.metadata['protein']['first_residue_number']) if 'first_residue_number' in self.metadata['protein'].keys() else 1 + if target_seq is None and first_resn is None: + # Nothing to do + return self + elif first_resn is None: + # Only alignment to target seq + first_resn = first_resn_meta + resi_shift_init = 0 + else: + # Shift residue numbering (and possible align to target sequence) + resi_shift_init = first_resn - first_resn_meta + + if first_resn < 0: + raise ValueError("Argument first_resn to VariantData.to_new_reference must be a non-negative integer") + + seq = self.metadata['protein']['sequence'] + n_res_data = len(seq) + + # Which original data residue indices (0-based) to change + resi_rm = [] + resi_shift = np.full(n_res_data,resi_shift_init) + aa_change = {} + if not target_seq is None: + if not PrismParser.is_aa_one_nat(None, target_seq, "X"): + raise ValueError("Argument target_seq to VariantData.to_new_reference must be a single-letter amino acid string (or None)") + n_res_target = len(target_seq) + # Align sequences + # MatrixInfo supports a wildcard 'X' so use upper case. Open and extend penalty 11 and 1 is blast default + # 2DO: Consider to write gap_function where X--XX is 2 openings and --XXX is only one, i.e. NT-gap is not an opening + align = pairwise2.align.globalds(target_seq.upper(), seq.upper(), MatrixInfo.blosum62, -3, -1) + # Check for alternative alignments + if len(align) > 1 and verbose > 0: + print("Found %d alignments" % (len(align))) + if verbose > 0: + # Dump verbose number of alignments + for ia in range(min([verbose,len(align)])): + print("==== alignment %d of %d ====" % (ia+1,len(align))) + print(pairwise2.format_alignment(*align[ia])) + # Check for equally good alignments, a good alignment has high score + if len(align) > 1: + align_scores = np.array([a[2] for a in align]) + if any(align_scores[0] - align_scores[1:] < 1e-3): + print("WARNING: There are alignments for %s with same or better scores (try verbose). 
Sores: %s" % + (self.metadata['protein']['name'],str(align_scores))) + # Use first alignment + align = align[0] + assert len(align[0]) == len(align[1]) + n_align = len(align[0]) + # Number to convert 0-based alignment index to 0-based data (original) residue numbering + ialign_shift = 0 + n_indel = n_match = n_mismatch = 0 + # Iterate over alignment, possible including gaps + for ialign in range(len(align[0])): + if align[0][ialign] == '-': + # Deletion: Position in data sequence not present in target sequence + if not allow_deletions: + raise PrismValueFail("Aborting alignment due to deletion at position %d which is not allowed" % (ialign+ialign_shift)) + # Remove all substitutions at this position + resi_rm.append(ialign+ialign_shift) + # Downstream position are shifted to the left + resi_shift[(ialign+ialign_shift):] -= 1 + n_indel += 1 + elif align[1][ialign] == '-': + # Insertion: Position in target sequence not present in data sequence + if not allow_inserts: + raise PrismValueFail("Aborting alignment due to insertion at position %d which is not allowed" % (ialign+ialign_shift)) + resi_shift[(ialign+ialign_shift):] += 1 + # Shift conversion number + ialign_shift -= 1 + n_indel += 1 + elif align[0][ialign] != align[1][ialign]: + # Mismatch of 'X' always allowed since it is always mismatched + # Variants matched to X are always removed because position is ambiguous if adjacent to gap and because X is used for numbering offset + if align[0][ialign] == 'X' or align[1][ialign] == 'X': + resi_rm.append(ialign+ialign_shift) + elif mismatch == "not_allowed": + raise PrismValueFail("Aborting alignment due to mismatch at position %d which is not allowed" % (ialign+ialign_shift)) + elif mismatch == "remove": + # Mark the substitutions at this position for removal + resi_rm.append(ialign+ialign_shift) + elif mismatch == "convert": + # Change reference amino acid + aa_change[ialign+ialign_shift] = align[0][ialign] + else: + raise ValueError("mismatch argument must be in [not_allowed,convert,remove]") + n_mismatch += 1 + else: + n_match += 1 + # Is alignment ok? + n_res = max(n_res_target, n_res_data) + coverage = (n_res-n_indel)/n_res + if coverage < min_coverage: + raise PrismValueFail("Sequence coverage %.3f (%d/%d) below threshold of %.3f" % (coverage, n_res-n_indel, n_res, min_coverage)) + identity = (n_res_target-n_mismatch)/n_res_target + if identity < min_identity: + raise PrismValueFail("Sequence identity %.3f below threshold of %.3f" % (identity, min_identity)) + # Update header if alignment is ok + self.metadata['protein']['sequence'] = target_seq + # Report actions + if verbose > 0: + print("Making new reference of %s with coverage %.3f and identity %.3f. 
Maximum shift in residue number is %d" % + (self.metadata['filename'], coverage, identity, max(abs(resi_shift)))) + if verbose > 0 and len(resi_rm) > 0: + print("New reference sequence will remove variants at %d positions (mismatch setting: %s; indels: %d):\n%s" % + (len(resi_rm), mismatch, n_indel, ",".join([str(resi+first_resn_meta) for resi in resi_rm]))) + if verbose > 0 and len(aa_change) > 0: + print("New reference sequence will change variants at %d positions (mismatch setting: %s; indels: %d):\n%s" % + (len(aa_change), mismatch, n_indel, ",".join([str(int(resi)+first_resn_meta) for resi in aa_change.keys()]))) + + if not first_resn is None: + self.metadata['protein']['first_residue_number'] = first_resn + assert int(self.metadata['protein']['first_residue_number']) == first_resn + + # Split variant in amino acid letters and residue numbers + aa_ref_ll = list(self.dataframe['aa_ref']) + resi_ll = list(self.dataframe['resi']) + aa_mut_ll = list(self.dataframe['aa_var']) + + # Update all variants + new_var_list = [] + ivar_rm = [] + for ivar in range(len(resi_ll)): + subst_list = [] + for isub in range(len(resi_ll[ivar])): + # 0-based index of original sequence without offset + resi_orig = resi_ll[ivar][isub] - first_resn_meta + # Is substitution marked for removal? + if resi_orig in resi_rm: + ivar_rm.append(ivar) + break + # Update residue number + resi_ll[ivar][isub] += resi_shift[resi_orig] + # Update reference amino acid + if resi_orig in aa_change.keys(): + if aa_change[resi_orig] == aa_mut_ll[ivar][isub]: + if verbose > 2: + print("Data contains a substitution, %s%d%s, into the new reference which will be removed" % + (aa_ref_ll[ivar][isub],resi_ll[ivar][isub],aa_mut_ll[ivar][isub])) + ivar_rm.append(ivar) + break + else: + if verbose > 2: + print("Convert variant due to mismatch: %s%d%s -> %s%d%s" % + (aa_ref_ll[ivar][isub], resi_ll[ivar][isub]-resi_shift[resi_orig], aa_mut_ll[ivar][isub], + aa_change[resi_orig], resi_ll[ivar][isub], aa_mut_ll[ivar][isub])) + aa_ref_ll[ivar][isub] = aa_change[resi_orig] + subst_list.append("%s%d%s" % (aa_ref_ll[ivar][isub], resi_ll[ivar][isub], aa_mut_ll[ivar][isub])) + if len(subst_list) > 0: + new_var_list.append(":".join(subst_list)) + else: + new_var_list.append("WT") + + if verbose > 0 and len(ivar_rm) > 0: + rm_list = [str(self.dataframe['variant'].iat[i]) for i in ivar_rm] + if len(rm_list) > 500: + print("New reference sequence will remove %d variants (mismatch setting: %s; indels: %d):\n%s\n... [skip %d variants] ...\n%s" % + (len(rm_list), mismatch, n_indel, ",".join(rm_list[0:25]), len(rm_list)-50, ",".join(rm_list[-25:]))) + else: + print("New reference sequence will remove %d variants (mismatch setting: %s; indels: %d):\n%s" % + (len(rm_list), mismatch, n_indel, ",".join(rm_list))) + + self.dataframe['variant'] = new_var_list + self.dataframe['aa_ref'] = aa_ref_ll + self.dataframe['resi'] = resi_ll + self.dataframe['aa_var'] = aa_mut_ll + # Remove rows containing variants with deleted substitutions + self.dataframe = self.dataframe.drop(self.dataframe.index[ivar_rm], axis=0) + self.dataframe.index = range(self.dataframe.shape[0]) + + def merge(self, data_list, target_seq=None, first_resn=None, merge='outer', **kwargs): + """Merge a list of data files into a single DataFrame and dump it in a file + + Parameters + ---------- + data_list : list of VariantData + The list of data sets to merge + target_seq : string + Amino acid sequence that all data sets are aligned to. If None, + the sequence of the first data set will be used. 
+ first_resn : Integer >= 0 + Residue number of first amino acid in target_seq, default one. If + target_seq is None, the header value will be used if present or + overwritten if an argument is given here. + merge : string + How to merge: + - outer: output the union of variants, default + - left: Only keep veriants that are present in the self data set + - inner: output the intersect of variants + **kwargs : keyword arguments + Passed to to_new_reference function + """ + if not merge in ['left', 'outer', 'inner']: + raise ValueError("Allowed merge arguments are left, outer or inner") + + merged_data = self.copy() + + if isinstance(data_list, VariantData): + data_list = [data_list] + elif len(data_list) < 1: + raise ValueError("List of data set to merge is empty") + + # Target sequence + if target_seq is None: + # Make from meta data, variant residue numbers will match the index of this + target_seq = self.metadata['protein']['sequence'] + if not first_resn is None: + raise ValueError("merge argument first_resn can only be set if target_seq != None\n" + \ + "Use VariantData.to_new_reference to only shift residue numbering") + first_resn = int(self.metadata['protein']['first_residue_number']) if 'first_residue_number' in self.metadata['protein'].keys() else 1 + else: + # Align self data set to target + if first_resn is None: + first_resn = 1 + else: + if first_resn < 0: + raise ValueError("Argument first_resn must be non-negative") + try: + merged_data.to_new_reference(target_seq, first_resn, **kwargs) + except PrismValueFail as msg: + print("Could not align data to target sequence: %s" % (msg)) + return(None) + + # Remove the index columns ['n_mut','aa_ref','resi','aa_mut'] before merging + merged_data.remove_index_columns() + # Add '_00' to the names of all but variant column + merged_data.dataframe.columns = [merged_data.dataframe.columns[0]] + [s+"_00" for s in merged_data.dataframe.columns[1:]] + + # Init new meta data dictionary + merged_metadata = {'version':0, 'protein':{}, 'merged':{}, 'variants':{}, 'columns':{}} + merged_metadata['protein'] = copy.deepcopy(self.metadata['protein']) + + merged_metadata['merged']["file_00"] = self.metadata['filename']+" (version "+str(self.metadata['version'])+")" + for key in self.metadata['columns'].keys(): + merged_metadata['columns'][key+"_00"] = self.metadata['columns'][key] + + # Align and merge other data sets + c = 0 + for data in data_list: + data_copy = data.copy() + # 2DO: Consider to check uniprot (others are human readable and may have typoes) of data to be merged + try: + data_copy.to_new_reference(target_seq, first_resn, **kwargs) + except PrismValueFail as msg: + print("Skip merge of file %s: %s" % (data_copy.metadata['filename'],msg)) + continue + data_copy.remove_index_columns() + suffix = "_%02d" % (c+1) + data_copy.dataframe.columns = [data_copy.dataframe.columns[0]] + [s+suffix for s in data_copy.dataframe.columns[1:]] + merged_data.dataframe = merged_data.dataframe.merge(data_copy.dataframe, on='variant', how=merge, suffixes=(False,False), copy=False) + if merged_metadata['protein']['uniprot'] != data.metadata['protein']['uniprot']: + print("WARNING: Uniprot mismatch on merge: Merging file %s uniprot %s with file %s of different uniprot %s" % ( + merged_metadata['merged']["file_00"], merged_metadata['protein']['uniprot'], + data.metadata['filename'], data.metadata['protein']['uniprot'])) + merged_metadata['merged']["file_%02d" % (c+1)] = data_copy.metadata['filename']+" (version "+str(data_copy.metadata['version'])+")" + for 
key in data_copy.metadata['columns'].keys(): + merged_metadata['columns'][key+"_%02d" % (c+1)] = data_copy.metadata['columns'][key] + c += 1 + + # Add the index columns ['n_mut','aa_ref','resi','aa_mut'] again + merged_data.add_index_columns() + + merged_metadata['protein']['sequence'] = target_seq + if first_resn != 1: + merged_metadata['protein']['first_residue_number'] = first_resn + merged_data.metadata = merged_metadata + merged_data.metadata['variants'] = merged_data.calc_variants_metadata() + return(merged_data) + + +# class ResidueData(PrismData): +# """Container class for per-residue data +# """ +# +# def _init(self): +# self.index_column_names = ['aa_ref'] +# +# def add_index_columns(self): +# pass +# +# def remove_index_columns(self): +# """Remove the indexing columns, i.e. 'aa_ref'""" +# self.dataframe.drop(['aa_ref'], axis=1, inplace=True) +# +# def check(self, verbose=0): +# # check ResidueData specific stuff +# check_status = None +# try: +# self.check_column_names(verbose=verbose) +# self.check_header_and_data_sequence(verbose=verbose) +# self.check_variants_metadata(no_overwrite=no_overwrite_metadata, verbose=verbose) +# except PrismFormatError as msg: +# raise PrismFormatError("VariantData.check failed. First fail message:\n%s" % (msg)) +# else: +# check_status = True +# +# return(check_status) +# +# def to_new_reference(self, target_seq=None, first_resn=None, verbose=0): +# pass + + +if __name__ == "__main__": + # Parse commandline arguments + import argparse,sys + arg_parser = argparse.ArgumentParser(description="PRISM data file processing and alignment") + # positional arguments + arg_parser.add_argument("files", nargs="+", metavar="DATAFILE", + help="Data files to process") + # keyword arguments + arg_parser.add_argument("--dump_header_csv", metavar="FILE", + help="Dump a .csv file containing an overview of file headers") + arg_parser.add_argument("--dump_fasta", metavar="FILE", + help="Dump all sequences in a fasta file") + arg_parser.add_argument("--parse", action='store_true', + help="Read data file(s), run all checks and write new data file(s)") + arg_parser.add_argument("--merge", metavar="FILE", + help="Merge data files concerning the (approximately) same target protein") + # A run of this main can only have a single target sequence so to parse more files with different sequence, run multiple times + arg_parser.add_argument("--target_seq", metavar="FILE_OR_SEQUENCE", + help="Align all output to this sequence, default is the sequence given in the first file") + arg_parser.add_argument("--saturate", action='store_true', + help="In the output data, add rows to cover all 20 substitutions at all considered positions") + arg_parser.add_argument("--clean_na", action='store_true', + help="In the output data, add rows to cover all 20 substitutions at all considered positions") + arg_parser.add_argument("--first_res_num", metavar="INT", + help="In the output data, assign this number to the first given amino acid of the sequence") + arg_parser.add_argument("-v","--verbose", action="count", default=0, + help="Level of output, default zero is no output") + args = arg_parser.parse_args() + if args.saturate and args.clean_na: + raise ValueError("Arguments saturate and clean_na cannot be used together") + + # Read data files + data_parser = PrismParser() + data_list = [] + for filename in args.files: + try: + data = None + if args.merge or args.parse: + if args.verbose > 0: + print("\nRead data file %s " % (filename),flush=True) + data = data_parser.read(filename, 
verbose=args.verbose) + elif args.dump_header_csv or args.dump_fasta: + # Only read header which is much faster for large files + if args.verbose > 0: + print("\nRead header of %s " % (filename),flush=True) + header = data_parser.read_header(filename, verbose=args.verbose) + data = PrismData(header, None) + else: + break + except PrismFormatError as error: + print("ERROR reading file %s:\nBad file format: %s\n" % (filename,error)) + except yaml.parser.ParserError as error: + print("ERROR reading file %s:\nBad YAML syntax (ParserError) in header:\n%s\n" % (filename,error)) + except yaml.scanner.ScannerError as error: + print("ERROR reading file %s:\nBad YAML syntax (ScannerError, perhaps a missing whitespace after colon?) in header:\n%s\n" % (filename,error)) + except FileNotFoundError: + print("ERROR reading file %s:\nCannot find file\n" % (filename)) + else: + # Only if all is good + data_list.append(data) + + # Check input + if len(data_list) < 1: + print("\nERROR: Missing data files or arguments") + arg_parser.print_usage() + sys.exit(2) + if args.verbose > 0: + print("Done loading %d file(s)\n" % (len(data_list))) + + # Dump overview csv if requested + if args.dump_header_csv: + if args.target_seq or args.first_res_num or args.saturate or args.clean_na: + print("WARNING: Arguments target_seq, first_res_num, saturate and clean_na have no effect on dump_header_csv") + data_parser.write_meta_file(data_list, args.dump_header_csv, output_type="csv") + print("Dumped header overveiw of %d files in %s" % (len(data_list),args.dump_header_csv)) + + # Dump amino acid sequences in fasta format if requested + if args.dump_fasta: + if args.target_seq or args.first_res_num or args.saturate or args.clean_na: + print("WARNING: Arguments target_seq, first_res_num, saturate and clean_na have no effect on dump_fasta") + data_parser.write_meta_file(data_list, args.dump_fasta, output_type="fasta") + print("Dumped %d sequences in %s" % (len(data_list),args.dump_fasta)) + + # Set target sequence and offset for all data files + target_seq = None + first_resn = 1 + if args.target_seq: + if args.target_seq[-6:].lower() == ".fasta" or args.target_seq[-4:].lower() == ".fas": + record = None + for r in SeqIO.parse(args.target_seq, "fasta"): + if not record is None: + # if args.verbose > 0: + print("WARNING: Only using the first sequence record in %s" % (args.target_seq)) + break + record = r + target_seq = str(record.seq) + print("Target sequence \'%s\' of %d residues from file %s" % (record.description, len(target_seq), args.target_seq)) + else: + target_seq = args.target_seq + if args.first_res_num: + first_resn = int(args.first_res_num) + else: + target_seq = (data_list[0]).metadata['protein']['sequence'] + if args.first_res_num: + # Overwrite offset given in header of first file + first_resn = int(args.first_res_num) + elif 'first_residue_number' in (data_list[0]).metadata['protein']: + # Only use offset of first file with that sequence (and not the commandline input) + first_resn = int((data_list[0]).metadata['protein']['first_residue_number']) + + if not data_parser.is_aa_one_nat(target_seq, "X"): + print("ERROR: Target sequence seems not to be one-letter amino acids or \'X\':") + print("Target sequence: %s" % (target_seq)) + sys.exit(2) + + # Parse & dump all files if requested + if args.parse: + filename_suffix = "_parsed" + for data in data_list: + if args.target_seq or first_resn: + data.to_new_reference(target_seq, first_resn, verbose=args.verbose, + min_identity=.9, min_coverage=.1, mismatch="remove", 
allow_inserts=True, allow_deletions=True) + if args.saturate: + data.saturate_variants() + elif args.clean_na: + data.clean_na_variants() + filename_parts = data.metadata['filename'].split("/")[-1].rsplit(".",1) + outfilename = filename_parts[0] + filename_suffix + "." + filename_parts[1] + data_parser.write(outfilename, data) + print("Parsed and wrote %d data file(s) with output filename suffix %s" % (len(data_list), filename_suffix)) + + # Dump a merged data file if requested + if args.merge: + merged_data = data_list[0].merge(data_list[1:], target_seq, first_resn, merge='outer', verbose=args.verbose, + min_identity=.9, min_coverage=.1, mismatch="remove", allow_inserts=True, allow_deletions=True) + if merged_data is None: + print("ERROR: Could not merge data") + sys.exit(2) + + if args.saturate: + merged_data.saturate_variants() + elif args.clean_na: + merged_data.clean_na_variants() + data_parser.write(args.merge, merged_data) + print("Dumped merge of %d data files in %s " % (len(merged_data.metadata['merged']), args.merge)) diff --git a/software/scripts/struc_select_sifts.py b/software/scripts/struc_select_sifts.py new file mode 100644 index 0000000..26d3c82 --- /dev/null +++ b/software/scripts/struc_select_sifts.py @@ -0,0 +1,1077 @@ +#!/usr/local/bin/python3 + +# Copyright (C) 2021 Kristoffer Enoe Johansson + +__version__ = 0.03 + +from argparse import ArgumentParser +import os,sys,argparse +import urllib +import json +import time + +from Bio import SeqIO +from Bio import Align +from Bio.Seq import Seq +from Bio.SeqRecord import SeqRecord +from io import StringIO + +import numpy as np +import pandas as pd +pd.set_option('display.max_colwidth', 20) + +# Global variables to ensure that servers are not pinged too often +__pdb_time = 0.0 +__uniprot_time = 0.0 +__sifts_time = 0.0 + + +def select_structures(uniprot_record, struc_map, n_select=1, min_cover_positions=40, identity_power=1.0, verbose=0): + """Select a set of structures that best represent a uniprot protein based on a structure map + + Assigns a score to each structure and selects the best scoring structure per position: + score = 2.5/method_score * coverage * identitiy**identity_power + where method_score is X-ray resolution, 3 for cryo-em, 4 for nmr and 10 for others + + Parameters + ---------- + uniprot_record : BioPythin SeqRecord object parsed using uniprot-xml format + Information on target protein + struc_map : Pandas DateFrame + Structure map as generated by url_strucmap function below + identity_power : float + In the score function, the fraction identity is raised to this power to achieve a steeper + decent from identity 1.0. Default 1.0 for linear scaling. + min_cover_positions : integer + The minimum number of position that a structure should be optimal for in order to + appear in the final selection. Default 40 thought as a minimum size of a folded domain. + verbose : integer + Default 0 is silent and higher integers increses level of terminal output + """ + uniprot_id = uniprot_record.id + uniprot_seq = str(uniprot_record.seq) + unp_nres = len(uniprot_seq) + nstruc = len(struc_map.index) + res_score_matrix = np.ones([nstruc, unp_nres]) * -1 + score_list = [-1.0]*nstruc + for i in range(nstruc): + pdb_str = "%s-%s" % (struc_map['pdb_id'][i], struc_map['chain_id'][i]) + seq = struc_map['aligned_resseq'][i] + # pdb_nres = struc_map['end'][i] - struc_map['start'][i] + 1 + + # # The indentity measure in the score function compeates with coverage because the coverage can be made larger at the expence + # # of identity. 
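+        # A minimal, self-contained sketch of the per-structure score computed further
+        # down in this loop; the example numbers are hypothetical, only the formula
+        # mirrors the code below:
+        #
+        #   import numpy as np
+        #   def structure_score(method_score, cover_obs, ident_score):
+        #       # sigmoid-weighted experimental quality plus identity-weighted coverage
+        #       return 1.0 / (1.0 + np.exp((method_score - 3.5) * 2.0)) + cover_obs * ident_score
+        #
+        #   structure_score(1.8, 0.95, 0.98)   # 1.8 A X-ray, high coverage -> ~1.90
+        #   structure_score(4.0, 0.95, 0.98)   # NMR (scored as 4.0)        -> ~1.20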
+ # ident = struc_map['ident'][i] / pdb_nres + + # Count identities among observed residues + ident_obs = sum( [aa1==aa2 and not aa2 in "-x" for aa1,aa2 in zip(uniprot_seq.lower(),struc_map['aligned_resseq_obs'][i].lower())] ) + # Number of positions that align, checked to be SIFTS coverage times len(uniprot_seq) + cover = struc_map['coverage'][i] + ident_score = ( ident_obs*1.0/cover )**identity_power + + # Number of oberserved positions + align_obs = struc_map['aligned_resseq_obs'][i].lower() + cover_obs = (unp_nres - align_obs.count('-') - align_obs.count('x')) / unp_nres + + # Assign a score to the method used to determine the structure + method = (struc_map['experimental_method'][i]).lower() + if 'x-ray' in method or 'xray' in method: + method_score = struc_map['resolution'][i] + elif 'cryoem' in method or 'cryo-em' in method: + # Cryo-EM structure same as 3Å X-ray + method_score = 3.0 + # Consider to look for EM resilution + elif 'nmr' in method: + # NMR structure same as 4Å X-ray + method_score = 4.0 + else: + # Assume model, only prefer when nothing else is available + method_score = 5.0 + + # Structure score used for selecting best structure for uniprot + score_list[i] = 1/(1+np.exp((method_score-3.5)*2)) + cover_obs * ident_score + # method_score: 1.8, 2.2, 2.5, 2.7, 3.0, 4.0, 5.0 + # sigmoid 0.968, 0.931, 0.881, 0.832, 0.731, 0.269, 0.047 + + # Structures of identical score is typically different but identical chains of the same asym unit, here argmax selects the first + if verbose > 0: + print("%s %s score %.2f (%s, method_score %.2f, cover_obs %.3f, ident_score = %d/%d^%.1f = %.3f)" % + (uniprot_id, pdb_str, score_list[i], method, method_score, cover_obs, ident_obs, cover, identity_power, ident_score)) + + # Assign structure score to positions that structure cover + seq_array = np.array(list(struc_map['aligned_resseq_obs'][i])) + pos_mask = np.logical_and(seq_array != '-', seq_array != 'x') + res_score_matrix[i,pos_mask] = score_list[i] + + struc_map['score'] = score_list + + # For each position (column in res_score_matrix), find the best (highest scoring) structure + best_struc_index = np.argmax(res_score_matrix, axis=0) + + # Count how many positions each structure best represent + best_struc_counts = np.bincount(best_struc_index) + + # Fill array with structure indices that are never optimal and store in structure map + if best_struc_counts.size < nstruc: + best_struc_counts = np.append(best_struc_counts, [0]*(nstruc-best_struc_counts.size)) + struc_map['selected_pos'] = best_struc_counts + + # Which structure(s) covers most positions + struc_rank = np.flip( np.argsort(best_struc_counts) ) + if verbose > 1: + # print("Matrix with indices of best structure per position") + # print(res_score_matrix) + print("Index of best scoring structure per position") + print(best_struc_index) + print("Number of positions that each structure (row) is selected for: %s" % (str(best_struc_counts))) + print("Ranked indices of the structure map raows: %s" % (str(struc_rank))) + + # Do final selection + selected_indices = [] + for rank in range(len(struc_rank)): + struc_index = struc_rank[rank] + if verbose > 0: + print("Rank %2d is index %2d %s-%s that covers %d positions" % + (rank, struc_index, struc_map['pdb_id'][struc_index],struc_map['chain_id'][struc_index], best_struc_counts[struc_index])) + # Check if structure is ok to use + if best_struc_counts[struc_index] > min_cover_positions: + # print("Using %s chain %s" % (struc_map['pdb_id'][struc_index],struc_map['chain_id'][struc_index])) + 
selected_indices.append(struc_index) + + # Report + if verbose > 0: + nsel = len(selected_indices) + if nsel == 1: + i = selected_indices[0] + print("%s: %s covers %d of %d residues" % (uniprot_id, pdb_str, struc_map['coverage'][i], len(uniprot_seq))) + elif nsel > 1: + struc_str = ", ".join( ["%s-%s cover %d" % (pdb,c,cover*100) for (pdb,c,cover) in + zip(struc_map['pdb_id'][selected_indices], + struc_map['chain_id'][selected_indices], + struc_map['coverage'][selected_indices])] ) + print("%s covered by %d structures: %s" % (uniprot_id, nsel, struc_str)) + + # Determine overlap of structures + overlap = np.zeros([nsel,nsel]) + for i in range(nsel-1): + for j in range(i+1,nsel): + overlap[i,j] = sum( np.logical_and(res_score_matrix[i,] > 0, res_score_matrix[j,] > 0) ) + if nsel < 3: + print("The two structures overlap at %d positions" % (overlap[1,1])) + else: + print("Overlaping positions matrix:") + print(overlap) + else: + print("ERROR: No structures found for %s" % (uniprot_id)) + + # Return the rows of stuc_map input that was selected + return(struc_map.iloc[selected_indices,:]) + + +def get_method_score(smap_row, verbose=0): + method = smap_row.method_exp.lower() + if 'x-ray' in method or 'xray' in method: + method_score = smap_row.method_res + if method_score is None: + if verbose > 0: + print("%s-%s: No resolution given for X-ray structure. Using 2.5" % (smap_row.pdb, smap_row.chain)) + method_score = 2.5 + elif 'electron microscopy' in method: + method_score = smap_row.method_res + if method_score is None: + if verbose > 0: + print(" %s-%s: No resolution given for EM structure. Using 3.5" % (smap_row.pdb, smap_row.chain)) + method_score = 3.5 + elif 'nmr' in method: + # NMR structure same as 4Å X-ray + method_score = 4.0 + else: + # Assume model, only prefer when nothing else is available + if verbose > 0: + print("%s-%s: Assuming method '%s' is model" % (smap_row.pdb, smap_row.chain, smap_row.method_exp)) + method_score = 5.0 + return(method_score) + + +def rank_structures(target_record, strucmap, verbose=0): + nres_target = len(target_record.seq) + + # Coverage score + # Conceptually, score terms are weighted in relation to coverage because this is available early on from SIFTS + # Cover_exp includes un-observed residues but not deletions (actually deleted from the molecules) + # Cover_obs includes mismatch and modified residues but not un-observed + cover_score = 0.0 + if 'cover_obs' in strucmap.columns: + cover_score = strucmap['cover_obs'] + elif 'cover_exp' in strucmap.columns: + cover_score = strucmap['cover_exp'] + + # Score to evaluate the experimental method by which the structure was determined + method_scores = 0.0 + if 'method_exp' in strucmap.columns: + method_scores = np.array( [get_method_score(row, verbose=verbose) for row in strucmap.itertuples(index=False)] ) + # method_scores 1.3, 1.5, 1.7, 1.9, 2.1, 2.2, 2.3, 2.5, 3.0, 4.0, 5.0 + # sigmoid 0.9955, 0.9933, 0.9900, 0.9852, 0.9781, 0.9734, 0.9677, 0.9526, 0.8808, 0.5000, 0.1192 + method_scores = 1/(1+np.exp((method_scores-3.8)*2)) + + # Score based on inserts in the structure (not present in the target sequence), see case P07550 + insert_score = 0 + if 'inserts' in strucmap.columns: + # Inserts are completely removed from the alignment so they need to be penalized + insert_score = -5.0 * strucmap['inserts'] / nres_target + + # Score based on chemically modified residues + mod_score = 0 + if 'modified' in strucmap.columns: + # If cover and identity is the same, prefer structure with least modified residues + # If 
a structure has one additional aligned residue, it counts 1.5/nres, and a modified sould be worse + # since this is typically in the folded part of the structure whereas the additionl residue likely to + # be in the terminals + mod_score = - 1.7 * strucmap['modified'] / nres_target + + # Mismatch score + mismatch_score = 0 + if 'mismatch' in strucmap.columns: + # A mismatch should weight more than 2 residue coverage because coverage is typically extended in + # terminals and a mismatch in the folded region + mismatch_score = -2.5 * strucmap['mismatch'] / nres_target + + # Everything else equal, prefer chain A + chain_score = 0.0 + if 'chain' in strucmap.columns: + chain_score = 1e-5 * (strucmap['chain']=='A') + + # Total score + scores = method_scores + cover_score + insert_score + mod_score + mismatch_score + chain_score + strucmap['score'] = scores + + # Reorder structure map to have highest scoring structures on top + strucmap = strucmap.iloc[ np.flip(np.argsort(scores)) ] + # Rename row to running indices and do not create a column with old indices + strucmap.reset_index(inplace=True, drop=True) + + return(strucmap) + + +def url_sifts_strucmap(target_record, min_ping_time=5.0, verbose=0): + target_id = target_record.id + target_seq = str(target_record.seq) + if verbose > 0: + # print("") + print("%s: Initial structure map from URL requests to SIFTS" % (target_id)) + # if verbose > 1: + # print("%s target sequence:" % (target_id)) + # print(target_seq) + + # Get best structure mappings from sifts + sifts_list = url_sifts(target_id, min_ping_time, verbose=verbose) + n_struc = len(sifts_list) + if verbose > 0: + print("%s: Recieved map of %d structures from SIFTS" % (target_id, n_struc)) + if verbose > 2: + print("%s: raw SIFTS record:" % (target_id)) + print(sifts_list) + + # SIFTS keys {'end': 101, 'chain_id': 'C', 'pdb_id': '6xog', 'start': 1, 'unp_end': 101, 'coverage': 1, 'unp_start': 1, 'resolution': 1.98, + # 'experimental_method': 'X-ray diffraction', 'tax_id': 9606} + columns = ['pdb', 'chain', 'score', 'select', 'method_exp', 'method_res', 'unp_start', 'unp_end', 'resseq_start', 'resseq_end', 'cover_exp'] + + sifts_dict = {'pdb': [d['pdb_id'] for d in sifts_list], 'chain': [d['chain_id'] for d in sifts_list], + 'score': [0.0]*n_struc, 'select': [1]*n_struc, + 'method_exp': [d['experimental_method'] for d in sifts_list], 'method_res': [d['resolution'] for d in sifts_list], + 'cover_exp': [d['coverage'] for d in sifts_list], + 'unp_start': [d['unp_start'] for d in sifts_list], 'unp_end': [d['unp_end'] for d in sifts_list], + 'resseq_start': [d['start'] for d in sifts_list], 'resseq_end': [d['end'] for d in sifts_list]} + + return( pd.DataFrame(sifts_dict, columns=columns) ) + + +def align_strucmap(target_record, strucmap, min_ping_time=5.0, verbose=0): + """Update a structure map with aligned RESSEQ from PDBe URL request""" + + target_id = target_record.id + target_seq = str(target_record.seq) + n_struc = len(strucmap.index) + + # New columns to add to strucmap + cover_obs_col = [] + ident_exp_col = [] + ident_obs_col = [] + inserts_col = [] + deletions_col = [] + mismatch_col = [] + non_obs_col = [] + modified_col = [] + resseq_col = [] + aligned_resseq_col = [] + aligned_resseq_obs_col = [] + + c = 0 + for smap in list(strucmap.itertuples(index=False))[c:]: + c += 1 + if verbose > 0: + print("=== Structure %d of %d for %s: %s-%s" % (c,n_struc,target_id,smap.pdb,smap.chain)) + + # A structure map entry must have pdb and chain fields + if not 'pdb' in smap._fields: + raise 
ValueError("ERROR: Structure map row with no pdb field: %s" % (str(smap))) + if not 'chain' in smap._fields: + raise ValueError("ERROR: Structure map row with no chain field: %s" % (str(smap))) + pdb_str = "%s-%s" % (smap.pdb, smap.chain) + + # Get RESSEQ sequence, i.e. the sequence of the protein actually used to determine structure (including residues not observed in the structure) + try: + pdb_resseq = url_pdb_resseq(smap.pdb, smap.chain, min_ping_time=min_ping_time, verbose=verbose) + except Exception as err: + print("ERROR: RESSEQ request for %s failed (%s)" % (pdb_str, err)) + cover_obs_col.append(np.nan) + ident_exp_col.append(np.nan) + ident_obs_col.append(np.nan) + inserts_col.append(np.nan) + deletions_col.append(np.nan) + mismatch_col.append(np.nan) + non_obs_col.append(np.nan) + modified_col.append(np.nan) + resseq_col.append(np.nan) + aligned_resseq_col.append(np.nan) + aligned_resseq_obs_col.append(np.nan) + continue + + # Look for modified residues + pdb_modified = url_pdb_modified(smap.pdb, smap.chain, min_ping_time=min_ping_time, verbose=verbose) + + # Request observed moities and residues in structure + pdb_obs = url_pdb_observed(smap.pdb, chain_id=None, min_ping_time=min_ping_time, verbose=verbose) + obs_chains = {} + n_macro_mol = len(pdb_obs["molecules"]) + for i_macro_mol in range(n_macro_mol): + # How many chains of macro molecule type (also has key 'entity_id' : integer) + mmol_id = pdb_obs["molecules"][i_macro_mol]["entity_id"] + n_mmol_chains = len(pdb_obs["molecules"][i_macro_mol]["chains"]) + for i_mmol_chain in range(n_mmol_chains): + # ID of this chain (also has key 'struct_asym_id' : char) + chain_id = pdb_obs["molecules"][i_macro_mol]["chains"][i_mmol_chain]["chain_id"] + # How many fragments are observed for chain + n_chain_frag = len(pdb_obs["molecules"][i_macro_mol]["chains"][i_mmol_chain]["observed"]) + frag_list = [] + for i_frag in range(n_chain_frag): + start_dict = pdb_obs["molecules"][i_macro_mol]["chains"][i_mmol_chain]["observed"][i_frag]["start"] + end_dict = pdb_obs["molecules"][i_macro_mol]["chains"][i_mmol_chain]["observed"][i_frag]["end"] + + # residue_number is 1-based RESSEQ index and author_residue_number is ATOM field numbers (both numbers observed) + frag_list.append((start_dict["residue_number"], end_dict["residue_number"], )) + + # also has keys 'author_insertion_code' (typically None) and 'struct_asym_id' (typically chain id) + if not start_dict["author_insertion_code"] is None or not end_dict["author_insertion_code"] is None: + print("WARNING: PDB %s (%s) observed fragment start and end author_insertion_code is %s and %s" % + (pdb_str, target_id, start_dict["author_insertion_code"], end_dict["author_insertion_code"])) + # # Remove struct_asym_id warning, from 6n13-B it only seems to be different from chain_id when ordering is not A,B,C,... 
+ # if start_dict["struct_asym_id"] != chain_id or end_dict["struct_asym_id"] != chain_id: + # print("WARNING: PDB %s (%s) observed fragment start and end struct_asym_id %s and %s is different from chain_id %s" % + # (pdb_str, target_id, start_dict["struct_asym_id"], end_dict["struct_asym_id"], chain_id)) + + if chain_id in obs_chains.keys(): + print(pdb_obs) + raise ValueError("ERROR: Chain id %s present multiple times in PDB observed responce" % (chain_id)) + obs_chains[chain_id] = (mmol_id, frag_list) + + obs = obs_chains[smap.chain][1] + nres = len(pdb_resseq) + pdb_resseq_obs = pdb_resseq + + # Check for non-observed N-terminal + if obs[0][0] > 1: + pdb_resseq_obs = "x"*(obs[0][0]-1) + pdb_resseq_obs[(obs[0][0]-1):] + if verbose > 0: + print("%s: Non-observed region in %s: %d-%s-%d (1-based RESSEQ index)" % + (target_id, pdb_str, 1, pdb_resseq[0:(obs[0][0]-1)], obs[0][0]-1)) + # Non-observed regions + if len(obs) > 1: + for i in range(1,len(obs)): + pdb_resseq_obs = pdb_resseq_obs[:obs[i-1][1]] + "x"*(obs[i][0]-obs[i-1][1]-1) + pdb_resseq_obs[(obs[i][0]-1):] + if verbose > 0: + print("%s: Non-observed region in %s: %d-%s-%d (1-based RESSEQ index)" % (target_id, pdb_str, obs[i-1][1]+1, + pdb_resseq[(obs[i-1][1]):(obs[i][0]-1)], obs[i][0]-1)) + # Check for non-observed C-terminal + if obs[-1][1] < nres: + pdb_resseq_obs = pdb_resseq_obs[:obs[-1][1]] + "x"*(nres-obs[-1][1]) + if verbose > 0: + print("%s: Non-observed region in %s: %d-%s-%d (1-based RESSEQ index)" % + (target_id, pdb_str, obs[-1][1]+1, pdb_resseq[obs[-1][1]:nres], nres)) + + # Modified residues + if len(pdb_modified) != 0: + for pdbmod in pdb_modified: + # This number should apply to the PDB RESSEQ sequence + resi = pdbmod['residue_number'] + aa = pdb_resseq[resi-1] + aa_obs = pdb_resseq_obs[resi-1] + if aa != aa_obs: + if verbose > 0: + print("%s: Modified residues %s%d-%s seems unobserved: %s" % (pdb_str, aa, resi, pdbmod['chem_comp_id'], aa_obs)) + if verbose > 1: + print("%s has modification %s%d-%s into %s (author_resi %d may be uniprot)" % + (pdb_str, aa, resi, pdbmod['chem_comp_id'], pdbmod['chem_comp_name'], pdbmod['author_residue_number'])) + # print("%s: %s entity_id %d, weight %.2f, alternate_conformers %s, author_insertion_code %s" % + # (target_id, pdb_str, pdbmod['entity_id'], pdbmod['weight'], pdbmod['alternate_conformers'], pdbmod['author_insertion_code'])) + # print("%s: %s modification description: %s" % (target_id, pdb_str, pdbmod['description'])) + + # Mark modified with capital X which should work in bolsum and I can still see what is gap and what is modified + pdb_resseq_obs = pdb_resseq_obs[:(resi-1)] + 'X' + pdb_resseq_obs[resi:] + + # Attempt SIFT alignment if present + identities = 0 + inserts = 0 + pdb_nres = 0.1 + if 'resseq_start' in smap._fields and 'unp_start' in smap._fields: + pdb_nres = smap.resseq_end - smap.resseq_start + 1 + unp_nres = smap.unp_end - smap.unp_start + 1 + + # Align PDB RESSEQ sequence to Uniprot sequence + if pdb_nres == unp_nres: + # Attempt align PDB to target sequence based on SIFTS + aligned_resseq = "-"*(smap.unp_start-1) + pdb_resseq[(smap.resseq_start-1):(smap.resseq_end)] + "-"*(len(target_seq)-smap.unp_end) + identities = sum( [aa1==aa2 and not aa2.lower() in "-xX" for aa1,aa2 in zip(target_seq.lower(),aligned_resseq.lower())] ) + aligned_resseq_obs = "-"*(smap.unp_start-1) + pdb_resseq_obs[(smap.resseq_start-1):(smap.resseq_end)] + \ + "-"*(len(target_seq)-smap.unp_end) + if verbose > 0: + if identities/pdb_nres < 0.9: + print("%s: poor SIFTS alignment of PDB 
%s with only %d of %d identical residues - attempt custom alignment" % + (target_id, pdb_str, identities, pdb_nres)) + else: + print("%s: using SIFTS alignment for %s" % (target_id,pdb_str)) + else: + if verbose > 0: + print("%s: SIFTS residue range for %s have different length for PDB (nres %d) and Target (nres %d) - attempt custom alignment" % + (target_id, pdb_str, pdb_nres, unp_nres)) + + # If SIFTS alignment failed run pairwise alignment + if identities/pdb_nres < 0.9: + (aligned_resseq,aligned_resseq_obs,inserts) = subalign(target_record, SeqRecord(Seq(pdb_resseq), id=pdb_str), + extra_seq=pdb_resseq_obs, verbose=verbose) + + # Count identities among residues present in the experiment (not necessarily observed) + identities = sum( [aa1==aa2 and not aa2 in "-xX" for aa1,aa2 in zip(target_seq.lower(),aligned_resseq.lower())] ) + + # Check if identity is within threshold + if len(aligned_resseq) != len(target_seq): + print(smap) + print(target_seq) + print(aligned_resseq) + raise ValueError("ERROR: %s %s alignment has length %d different from target sequence legnth %d" % + (target_id, pdb_str, len(aligned_resseq), len(target_seq))) + + if identities/pdb_nres < 0.9: + # Warn if PDB fragment has less than 90% identity + print("WARNING %s: Poor custom alignment of PDB %s with only %d of %d identical residues - attempt custom alignment" % + (target_id, pdb_str, identities, pdb_nres)) + + # Store stats and alignemnts in structure map, these should be extracted from aligned_resseq_obs or tested of consistance with this + # Number of inserted positions in PDB (should be removed from aligned sequence) + inserts_col.append(inserts) + + # Number of non-terminal positions deleted/missing in the experiment + deletions_col.append(aligned_resseq_obs.strip('-').count('-')) + + # Check consistant calculation of SIFTS coverage + cover_pos = len(aligned_resseq_obs.strip('-')) - deletions_col[-1] + cover_exp = cover_pos * 1.0/len(target_seq) + # Coverage should be the fraction of target positions that the structure covers including indels (uncertainty from 2-digit rounding) + if cover_exp - smap.cover_exp > 1e-3: #0.0051: + print("WARNING: %s %s re-calculated SIFTS coverage %.4f differ from cover_exp of %.4f (%d pos)" % + (target_id, pdb_str, cover_exp, smap.cover_exp, cover_pos)) + + # Coverage of observed residues, i.e. residues that are not deleted or unobserved. 
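+            # (A tiny illustration of the counting convention used here, with a made-up
+            #  aligned string; '-' = gap/deletion, 'x' = unobserved, 'X' = observed but modified:
+            #
+            #      s = "--MKVLXAGx--"
+            #      len(s) - s.count('-') - s.count('x')   # = 7 observed positions
+            #
+            #  str.count is case sensitive, so a modified 'X' still counts as covered.)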
+ # Mismatch and and observed modified are included in cover_obs + cover_obs_pos = len(aligned_resseq_obs) - aligned_resseq_obs.count('-') - aligned_resseq_obs.count('x') # - aligned_resseq_obs.count('X') + cover_obs = cover_obs_pos * 1.0/len(target_seq) + cover_obs_col.append(cover_obs) + + # Check indels for consistancy with SIFTS coverage ranges + # if pdb_nres - sifts['deletions'] + sifts['inserts'] - unp_nres > 0: + if pdb_nres - deletions_col[-1] + inserts - unp_nres > 0: + print("WARNING: %s %s SIFTS target range %d-%d (%d) and PDB range %d-%d (%d) is not consistant with %d deletions and %d insertions" % + (target_id, pdb_str, smap.unp_start, smap.unp_end, unp_nres, + smap.resseq_start, smap.resseq_end, pdb_nres, deletions_col[-1], inserts)) + + # How many of the covered target positions are identical + ident_exp_col.append(identities) + # For consistancy with cover_obs, a modified residue is counted as identical + # ident_obs_col.append( sum( [aa1==aa2 and not aa2.lower() in "-xX" for aa1,aa2 in zip(target_seq.lower(),aligned_resseq_obs.lower())] ) ) + ident_obs_col.append( sum( [aa1==aa2 and not aa2.lower() in "-x" for aa1,aa2 in zip(target_seq.lower(),aligned_resseq_obs.lower())] ) ) + + # How many covered and observe positions does not match + mismatch_col.append(sum([aa1!=aa2 and aa2 in "acdefghiklmnpqrstvwy" for aa1,aa2 in zip(target_seq.lower(),aligned_resseq_obs.lower())])) + + # How many og the covered residues are not observed + non_obs_col.append(aligned_resseq_obs.count('x')) + + # How many og the covered residues are not observed + modified_col.append(aligned_resseq_obs.count('X')) + + # # Removed, makes many warnings. + # if not sifts['non-obs'] == pdb_resseq_obs.lower().count('x'): + # print("WARNING: %s %s has %d non-observed positions that was never in the experiment (missing)" % + # (target_id, pdb_str, pdb_resseq_obs.lower().count('x')-sifts['non-obs'])) + + resseq_col.append(pdb_resseq) + aligned_resseq_col.append(aligned_resseq) + aligned_resseq_obs_col.append(aligned_resseq_obs) + + # Report alignment + print("%s: %s alignment: ident_exp %4d ident_obs %4d cover_exp %5.3f cover_obs %5.3f deletions %d inserts %d mismatch %d" % + (target_id, pdb_str, ident_exp_col[-1], ident_obs_col[-1], smap.cover_exp, cover_obs_col[-1], + deletions_col[-1], inserts_col[-1], mismatch_col[-1])) + if verbose > 0: + print("%s: Original %s RESSEQ:" % (target_id,pdb_str)) + print(pdb_resseq) + print("%s: Aligned %s sequence:" % (target_id,pdb_str)) + print(aligned_resseq) + print("%s: Aligned observed %s sequence (-: gap/deletion, x: unobserved (but present in experiment), X: modified):" % (target_id,pdb_str)) + print(aligned_resseq_obs) + + # Add columns to structure map and return updated strucmap + strucmap['cover_obs'] = cover_obs_col + strucmap['ident_exp'] = ident_exp_col + strucmap['ident_obs'] = ident_obs_col + strucmap['inserts'] = inserts_col + strucmap['deletions'] = deletions_col + strucmap['mismatch'] = mismatch_col + strucmap['non-obs'] = non_obs_col + strucmap['modified'] = modified_col + strucmap['resseq'] = resseq_col + strucmap['aligned_resseq'] = aligned_resseq_col + strucmap['aligned_resseq_obs'] = aligned_resseq_obs_col + return(strucmap) + + +def url_sifts(uniprot_id, min_ping_time=50, verbose=0): + # Go easy on servers and wait if necessary + global __sifts_time + time_since_last = time.time() - __sifts_time + if time_since_last < min_ping_time: + if verbose > 0: + print("Avoid frequent SIFTS requests and wait %.1f seconds" % (min_ping_time - time_since_last)) + 
time.sleep(min_ping_time - time_since_last) + __sifts_time = time.time() + + request_url = "https://www.ebi.ac.uk/pdbe/api/mappings/best_structures/%s" % (uniprot_id) + if verbose > 1: + print("Request EBI SIFTS URL: %s" %request_url ) + responce = urllib.request.urlopen(request_url) + # urllib does not decode, but this is rarely important + json_str = urllib.parse.unquote( responce.read().decode('utf-8') ) + # Return a list of dicts with info on each PDB + return( json.loads(json_str)[uniprot_id] ) + + +def url_uniprot(uniprot_id, min_ping_time=50, verbose=0): + """ + Returns a BioPython SeqRecord object with attributes seq, name, description and dbxrefs + """ + # Go easy on servers and wait if necessary + global __uniprot_time + time_since_last = time.time() - __uniprot_time + if time_since_last < min_ping_time: + if verbose > 0: + print("Avoid frequent Uniprot requests and wait %.1f seconds" % (min_ping_time - time_since_last)) + time.sleep(min_ping_time - time_since_last) + __uniprot_time = time.time() + + request_url = "https://www.uniprot.org/uniprot/%s.xml" % (uniprot_id) + if verbose > 1: + print("Request uniprot URL: %s" %request_url ) + responce = urllib.request.urlopen(request_url) + # if responce.status != 200: + # raise InternetError() + # Return a biopython uniprot record object + return( SeqIO.read(responce, "uniprot-xml") ) + + +def url_pdb_resseq(pdb_id, chain_id=None, min_ping_time=50, verbose=0): + # Go easy on servers and wait if necessary + global __pdb_time + time_since_last = time.time() - __pdb_time + if time_since_last < min_ping_time: + if verbose > 0: + print("Avoid frequent PDB requests and wait %.1f seconds" % (min_ping_time - time_since_last)) + time.sleep(min_ping_time - time_since_last) + __pdb_time = time.time() + + # chain_id is case sensitive, pdb_id is not + request_url = "https://www.ebi.ac.uk/pdbe/entry/pdb/%s/fasta" % (pdb_id) + if verbose > 1: + print("Request PDB URL: %s" %request_url ) + responce = urllib.request.urlopen(request_url) + fasta_str = urllib.parse.unquote( responce.read().decode('utf-8') ) + seq_records = SeqIO.parse(StringIO(fasta_str), "fasta") + + # Dict of seq per chain_id + seq_index_dict = {} + seq_list = [] + for sr in seq_records: + descrip_list = sr.description.split("|") + if descrip_list[0] != "pdb": + raise ValueError("ERROR: %s RESSEQ responce description does not start with \'pdb|\'" % (pdb_id)) + if descrip_list[1] != pdb_id.lower(): + raise ValueError("ERROR: %s RESSEQ responce description does not start with \'pdb|%s|\'" % (pdb_id, pdb_id.lower())) + + for c in descrip_list[2].split(): + seq_index_dict[c] = len(seq_list) + seq_list.append(str(sr.seq)) + # Change this when I know what to return as default, use first sequence for now + if chain_id is None: + return(seq_list[0]) + else: + if not chain_id in seq_index_dict.keys(): + raise ValueError("Error: Cannot get RESSEQ, no chain %s in PDB %s" % (chain_id,pdb_id)) + return(seq_list[seq_index_dict[chain_id]]) + + +def url_pdb_observed(pdb_id, chain_id=None, min_ping_time=50, verbose=0): + # Example of returnced list element + # https://www.ebi.ac.uk/pdbe/api/pdb/entry/polymer_coverage/6uyz + + # Apparantly, observed says nothing about why the non-observed positions are not observed, e.g if they were never in + # the experiment or just not observed + + # Go easy on servers and wait if necessary + global __pdb_time + time_since_last = time.time() - __pdb_time + if time_since_last < min_ping_time: + if verbose > 0: + print("Avoid frequent PDB requests and wait %.1f 
seconds" % (min_ping_time - time_since_last)) + time.sleep(min_ping_time - time_since_last) + __pdb_time = time.time() + + # chain_id is case sensitive, pdb_id is not + if chain_id is None: + request_url = "https://www.ebi.ac.uk/pdbe/api/pdb/entry/polymer_coverage/%s" % (pdb_id) + else: + request_url = "https://www.ebi.ac.uk/pdbe/api/pdb/entry/polymer_coverage/%s/chain/%s" % (pdb_id,chain_id) + if verbose > 1: + print("Request PDB URL: %s" %request_url ) + responce = urllib.request.urlopen(request_url) + json_str = urllib.parse.unquote( responce.read().decode('utf-8') ) + # Return a dicts with info on each PDB + # if chain is given fragments are in dict['molecules'][0]['chains'][0]['observed'] + return( json.loads(json_str)[pdb_id.lower()] ) + + +def url_pdb_modified(pdb_id, chain_id=None, min_ping_time=50, verbose=0): + """URL lookup of modified residues in PDB entry""" + + # Example of returned list element + # {'chem_comp_name': 'N(6)-ACETYLLYSINE', 'entity_id': 1, 'description': 'Small ubiquitin-related modifier 1', 'weight': 9567.801, + # 'residue_number': 32, 'author_residue_number': 46, 'chain_id': 'A', 'alternate_conformers': 0, 'author_insertion_code': '', + # 'chem_comp_id': 'ALY', 'struct_asym_id': 'A'} + + # Go easy on servers and wait if necessary + global __pdb_time + time_since_last = time.time() - __pdb_time + if time_since_last < min_ping_time: + if verbose > 0: + print("Avoid frequent PDB requests and wait %.1f seconds" % (min_ping_time - time_since_last)) + time.sleep(min_ping_time - time_since_last) + __pdb_time = time.time() + + # chain_id is case sensitive, pdb_id is not + request_url = "https://www.ebi.ac.uk/pdbe/api/pdb/entry/modified_AA_or_NA/%s" % (pdb_id) + if verbose > 1: + print("Request PDB URL: %s" %request_url ) + try: + responce = urllib.request.urlopen(request_url) + except urllib.error.HTTPError as e: + if e.code == 404: + # URL error 'page not found' means that there are no modified residues annotated for this PDB + return({}) + else: + # All other errors are raised normally + raise urllib.error.HTTPError(e) + + json_str = urllib.parse.unquote( responce.read().decode('utf-8') ) + modified_dict = json.loads(json_str) + if not pdb_id.lower() in modified_dict.keys(): + raise ValueError("ERROR: PDB modified responce does not seem to cover %s" % (pdb_id)) + modified_dict = modified_dict[pdb_id.lower()] + + if chain_id is None: + return(modified_dict) + + modified_chain = {} + for ent in modified_dict: + if ent['chain_id'] != ent['struct_asym_id'] and verbose > 1: + print("%s: modified residue record has chain_id %s but struct_asym_id %s, using chain_id" % + (pdb_id, ent['chain_id'], ent['struct_asym_id'])) + + # Check if chain already has a record of residue modification + if ent['chain_id'] in modified_chain.keys(): + modified_chain[ent['chain_id']].append(ent) + else: + modified_chain[ent['chain_id']] = [ent] + + if chain_id in modified_chain.keys(): + return(modified_chain[chain_id]) + else: + # No annotations for requested chain + return({}) + + +def subalign(target_seqrecord, query_seqrecord, extra_seq=None, verbose=0): + """Align query sequence to target sequence and return aligned query sequence with gaps inserted + + Query sequence is assumed to be a subsequence of target and a warning is generated if inserts + are removed from the query + + """ + target_id = target_seqrecord.id + target_seq = str(target_seqrecord.seq) + target_nres = len(target_seq) + query_id = query_seqrecord.id + query_seq = str(query_seqrecord.seq) + + # Setup aligner + aligner = 
Align.PairwiseAligner() + aligner.mode = 'global' + aligner.substitution_matrix = Align.substitution_matrices.load("BLOSUM62") + aligner.open_gap_score = -4.0 # Default value for '*' mutation in BLOSUM62 + aligner.extend_gap_score = -1.0 + aligner.end_open_gap_score = -1.0 # end gap open is like extension since terminal may be viewed as a gap + score_uncertainty = 1 + # print(aligner) + # print(Align.substitution_matrices.load("BLOSUM62")) + + alignments = sorted( aligner.align(target_seq, query_seq) ) + if len(alignments) > 1: + if alignments[0].score - alignments[1].score < score_uncertainty: + print("WARNING: Second alignment of %s versus %s is within uncertainty %.1f" % + (target_id, query_id, score_uncertainty)) + print("%s: Alignment scores: %.1f and %.1f" % (target_id, alignments[0].score,alignments[1].score)) + + if verbose > 1: + print("%s: Best alignment of %d score %.1f:" % (target_id, len(alignments), alignments[0].score)) + print(alignments[0]) + if len(alignments) > 1 and verbose > 2: + print("%s: Second best alignment of %d score %.1f:" % (target_id, len(alignments), alignments[1].score)) + print(alignments[1]) + + # Get target and query alignment paths in Bio.Align.PairwiseAlignment tuple format + a = alignments[0].aligned; ta = a[0]; qa = a[1] + inserts = 0 + aligned_query_seq = "" + if extra_seq is None: + aligned_extra_seq = None + else: + aligned_extra_seq = "" + + # If there's a missing range before first target fragment, these are gaps in the query sequence + aligned_query_seq += "-"*ta[0][0] + if not extra_seq is None: + aligned_extra_seq += "-"*ta[0][0] + + # Add first query fragment + aligned_query_seq += query_seq[qa[0][0]:qa[0][1]] + if not extra_seq is None: + aligned_extra_seq += extra_seq[qa[0][0]:qa[0][1]] + + # Process following path fragments + for i in range(len(qa)-1): + # Check that end-position of query fragment i, is start position of query fragment i+1 + if qa[i][1] != qa[i+1][0]: + inserts += qa[i+1][0]-qa[i][1] + if verbose > 0: + print("%s: alignment of %s will remove an insert of %d residues: %d-%s-%d" % + (target_id, query_id, qa[i+1][0]-qa[i][1], qa[i][1], query_seq[qa[i][1]:qa[i+1][0]], qa[i+1][0])) + + # If there's a missing range in target path, these are gaps in the query sequence + aligned_query_seq += "-"*(ta[i+1][0]-ta[i][1]) + if not extra_seq is None: + aligned_extra_seq += "-"*(ta[i+1][0]-ta[i][1]) + + # Add next query fragment + aligned_query_seq += query_seq[qa[i+1][0]:qa[i+1][1]] + if not extra_seq is None: + aligned_extra_seq += extra_seq[qa[i+1][0]:qa[i+1][1]] + if verbose > 0 and ta[i][1] != ta[i+1][0]: + print("%s: query sequence %s has a deletion from (and including) target position %d to %d (%d residues)" % + (target_id, query_id, ta[i][1], ta[i+1][0]+1, ta[i+1][0]-ta[i][1])) + + # If final target fragment is not last target position, add gaps untill it is + aligned_query_seq += "-"*(target_nres-ta[0-1][1]) + if not extra_seq is None: + aligned_extra_seq += "-"*(target_nres-ta[0-1][1]) + + return((aligned_query_seq, aligned_extra_seq, inserts)) + + +def strucmap_non_redundant(strucmap, verbose=0): + # list of pdb-chain identifiers + id_list = ["%s-%s" % (pdb,chain) for pdb,chain in zip(strucmap['pdb'], strucmap['chain'])] + + # make a list of unique pdb-chain identifiers + (unq_id,unq_index,unq_counts) = np.unique(id_list, return_index=True, return_counts=True) + + remove_rows = [] + for unqi in np.where(unq_counts > 1)[0]: + # find structure map index of this structure + smapi = np.where(np.array(id_list) == unq_id[unqi])[0] 
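+        # For reference, a minimal standalone example of the duplicate-detection pattern
+        # used in this loop (the identifiers are made up):
+        #
+        #   import numpy as np
+        #   ids = ["1abc-A", "1abc-A", "2xyz-B"]
+        #   unq_id, unq_index, unq_counts = np.unique(ids, return_index=True, return_counts=True)
+        #   # unq_id -> ['1abc-A', '2xyz-B'], unq_counts -> [2, 1]
+        #   np.where(np.array(ids) == unq_id[0])[0]   # -> array([0, 1]), rows of the duplicate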
+ + # just checking... + if len(smapi) != unq_counts[unqi]: + raise ValueError("ERROR: %s found to have %d mappings but only %d indices found" % (unq_id[unqi], unq_counts[unqi], len(smapi))) + + col_nonid = [] + for coli in range(len(strucmap.columns)): + # values in column of the rows with identical pdb,chain + col_val = list( strucmap.iloc[smapi,coli] ) + # column is OK if values are identical and not nan + col_ok = False + if type(col_val[0]) is float: + if np.isnan(col_val[0]): + # check that all elements are nan + col_ok = np.all(np.isnan(col_val)) + else: + # check that all values are within machine precision + col_ok = np.all(np.array(col_val)-col_val[0] < 1e-10) + else: + # compare all others as identities + col_ok = np.all(np.array(col_val) == col_val[0]) or col_val[0] is np.nan + + # store column indices that are not identical + if not col_ok: + col_nonid.append(coli) + print("WARNING: Multiple maps of %s has different values in column \"%s\": %s" % + (unq_id[unqi], strucmap.columns[coli], str(col_val))) + + # raise error if columns are not OK + if len(col_nonid) > 0: + # print columns that are not identical together with pdb and chain id + print("WARNING: Structure %s occurs %d times in structures map with non-identical mapping in column(s): %s:" % + (unq_id[unqi], unq_counts[unqi], ",".join(strucmap.columns[col_nonid]))) + print(strucmap.iloc[smapi, [0,1]+col_nonid]) + # raise ValueError("ERROR: Found %d identical %s structures with non-identical mapping in columns %s:" % + # (unq_counts[unqi],unq_id[unqi],",".join(strucmap.columns[col_nonid]))) + elif verbose > 0: + print("WARNING: Structure %s occurs %d times in structure map, will only keep first occurence" % (unq_id[unqi], unq_counts[unqi])) + if verbose > 1: + print(strucmap.iloc[smapi]) + + # all entities except first occurence marked for deletion + remove_rows.extend(list(smapi[1:])) + + # remove marked rows and make new index (row labels) + strucmap.drop(remove_rows, axis=0, inplace=True) + strucmap.reset_index(inplace=True, drop=True) + + # return non-redundant structure map + return(strucmap) + + +def parse_args(): + """ + Argument parser function + """ + + parser = ArgumentParser( description="" ) + + # Initiating/setup command line arguments + parser.add_argument( 'uniprot_ids', + type=str, + help="Input uniprot IDs. Several should be separated by ;" + ) + + args = parser.parse_args() + + args.uniprot_ids = args.uniprot_ids.split(';') + + return args + +if __name__ == '__main__': + """Find the best PDB structure for a Uniprot ID based on the SIFTS database + """ + # Verbose levels: 0: Print results only, 1: Print hvor result was obtained, 2: Debug information + verbose = 2 + + overwrite_data_files = False + + # data_dir = "/Users/kristofferjohansson/work/project_human_structome/struc_select/data" + data_dir = "./data" + # struc_dir = "/Users/kristofferjohansson/work/project_human_structome/struc_select/struc" + + # filename = '/Users/kristofferjohansson/work/project_human_structome/anders_uniprot_to_pdb/TorbensGenes.csv' + # if not os.path.isdir(data_dir): + # try: + # os.mkdir(data_dir) + # except FileNotFoundError: + # print("ERROR: Could not create data directory (parent dir missing?): %s" % (data_dir)) + # sys.exit(2) + + # Test directory excistance + + if overwrite_data_files: + print("WARNING: Will overwrite all data in %s" % (data_dir)) + + min_ping_time = 1.0 + # From https://www.ebi.ac.uk/pdbe/pdbe-rest-api + # How many API calls am I allowed? 
At present we rely on user restraint and have not set any hard limits. But in + # future, depending on load on our servers, we may want to implement rate limiting. + + + # get user input arguments + args = parse_args() + + + # # Torbens genes + # gene_list_df = pd.read_csv(filename) + # uniprot_id_list = list(gene_list_df['Entry']) + + # Rosetta test set: PTEN CALM1 SUMO1 UBE2I TPK1 P53 MAPK1 TPMT GAL4 UBI4 + uniprot_id_list = ['P60484', 'P0DP23', 'P63165', 'P63279', 'Q9H3S4', 'P04637', 'P28482', 'P51580', 'P04386', 'P0CG63'] + # what we used 1D5R 4DJC_A 1WYW_B 2GRO 3S4Y_A 6RZ3_A 2OJG 2BZG 3COQ_A 3OLM_D + # Rosetta test set: MAPK1 TPMT GAL4 UBI4 BRCA1 ADRB2 HRas + uniprot_id_list += ['P28482', 'P51580', 'P04386', 'P0CG63', 'P38398', 'P07550', 'P01112'] + # what we used 2OJG 2BZG 3COQ_A 3OLM_D 1JM7_A 3KJ6_A 4Q21_A + + # uniprot_id_list = ['P60484','P40692','O60260','P12931','P09651','P31483','P04035'] + # uniprot_id_list = ['P01112'] + + uniprot_id_list = args.uniprot_ids + + if verbose > 0: + print("Find structures for %d uniprot entities" % (len(uniprot_id_list))) + + for uniprot_id in uniprot_id_list: + # Get information on target protein either from file or url request + uniprot_filename = "uniprot_" + uniprot_id + ".xml" + has_uniprot_file = os.path.isfile(data_dir + "/" + uniprot_filename) + if not has_uniprot_file or overwrite_data_files: + if verbose > 0: + print("%s: Getting uniprot information from URL request" % (uniprot_id)) + uniprot_record = url_uniprot(uniprot_id, min_ping_time=min_ping_time, verbose=verbose) + SeqIO.write(uniprot_record, data_dir + "/" + uniprot_filename, "seqxml") + else: + if verbose > 0: + print("%s: Getting uniprot information from data file %s" % (uniprot_id, uniprot_filename)) + # BUG: SeqIO write and read in seqxml format seems to loose the name attribute but I could not find another + # format that worked with the name attribute. Closest is 'pir' format the puts the name attribute in the + # beginning of the description attribute but looses all dbxrefs. + uniprot_record = list( SeqIO.parse(data_dir + "/" + uniprot_filename, "seqxml") )[0] + + nres_uniprot = len(uniprot_record.seq) + print("%s: Name: %s, description: %s, residues: %s" % + (uniprot_record.id, uniprot_record.name, uniprot_record.description, nres_uniprot)) + if verbose > 0: + print(str(uniprot_record.seq)) + + # Get structure map + strucmap_filename = "strucmap_" + uniprot_id + ".csv" + has_strucmap_file = os.path.isfile(data_dir + "/" + strucmap_filename) + if not has_strucmap_file or overwrite_data_files: + print("%s: Building structure map from URL requests" % (uniprot_id)) + + # Initial structure map from SIFTS URL request + try: + strucmap = url_sifts_strucmap(uniprot_record, min_ping_time=min_ping_time, verbose=verbose) + except Exception as err: + print("ERROR: %s SIFTS request failed (%s) skip uniprot entry" % (uniprot_id, err)) + continue + + n_struc_raw = len(strucmap.index) + if n_struc_raw < 1: + print("ERROR: No structures assigned in SIFTS for %s, skip uniprot entry", uniprot_id) + continue + + # Check if any PDB+chain id occurs multiple times + strucmap = strucmap_non_redundant(strucmap, verbose=verbose) + n_struc_nr = len(strucmap.index) + + # Require a minimum number of residues in the structure in order to apply Rosetta + # For P53_P04637, min_res 30 made 6rz3 go up from rank 364 to 277. 
Further min_res 40 to rank 22
+            min_res = 40
+            l = list( np.where(strucmap['cover_exp'] * nres_uniprot >= min_res)[0] )
+            if len(l) == 0:
+                print("WARNING: No structures of %s cover %d or more residues. Continuing with poor structures" % (uniprot_id, min_res))
+            elif len(l) < n_struc_nr:
+                strucmap = strucmap.iloc[l]
+                # if verbose > 0:
+                #     print("%s: Reducing structure map from %d to %d structures with %d or more residues covered (SIFT coverage)" %
+                #           (uniprot_id, n_struc_nr, len(l), min_res))
+                strucmap.reset_index(inplace=True, drop=True)
+            else:
+                if verbose > 0:
+                    print("%s: No structures with coverage < %d found" % (uniprot_id,min_res))
+            n_struc_minres = len(strucmap.index)
+
+            # Shortlist structure map based on SIFTS assignments of experimental method and coverage
+            n_short = 1000
+            if n_struc_minres > n_short:
+                if verbose > 0:
+                    print("%s: Reducing list of structures from %d to top %d based on SIFTS assignments alone" %
+                          (uniprot_id, n_struc_minres, n_short))
+                strucmap = rank_structures(uniprot_record, strucmap, verbose=verbose)
+                strucmap = strucmap.iloc[range(n_short)]
+                strucmap.reset_index(inplace=True, drop=True)
+            else:
+                if verbose > 0:
+                    print("%s: No shortlisting applied" % (uniprot_id))
+            n_struc_sl = len(strucmap.index)
+
+            # # Find a given PDB in the strucmap
+            # i = np.where(strucmap['pdb']=="6rz3")[0][0]
+            # print(i)
+            # print(strucmap.iloc[range(max(0,i-20),i+10)])
+
+            print("%s: Number of mapped structures: sifts_raw: %d non-redundant: %d min_res_%d: %d shortlist: %d" %
+                  (uniprot_id, n_struc_raw, n_struc_nr, min_res, n_struc_minres, n_struc_sl))
+
+            # Update structure map with observation of residues based on PDBe URL requests
+            try:
+                strucmap = align_strucmap(uniprot_record, strucmap, min_ping_time=min_ping_time, verbose=verbose)
+            except Exception as err:
+                print("ERROR: %s PDB alignments failed (%s)" % (uniprot_id,err))
+                continue
+
+        else:
+            print("%s: Getting structure map from data file %s" % (uniprot_id, strucmap_filename))
+            # Read structure map from disk
+            strucmap = pd.read_csv(data_dir + "/" + strucmap_filename, sep=';')
+
+        # Columns for pretty-printing
+        pcol = ["pdb", "chain", "score", "select", "method_exp", "method_res", "cover_exp", "cover_obs", "ident_exp", "ident_obs",
+                "inserts", "deletions", "mismatch", "non-obs", "modified"]
+
+        # Maybe I can use the shortlist here? Call it "select_structures(uniprot_record, strucmap, 20, verbose=verbose)"
+        # Select structures based on the SIFTS mapping
+        strucmap = rank_structures(uniprot_record, strucmap, verbose=verbose)
+        print("%s: best structure %s-%s method: %s cover_obs: %.3f deletions: %d inserts: %d mismatch: %d modified: %d" %
+              (uniprot_id, strucmap.loc[0,'pdb'], strucmap.loc[0,'chain'], strucmap.loc[0,'method_exp'], strucmap.loc[0,'cover_obs'],
+               strucmap.loc[0,'deletions'], strucmap.loc[0,'inserts'], strucmap.loc[0,'mismatch'], strucmap.loc[0,'modified']))
+        if verbose > 0:
+            ntop = 20
+            print("%s top %d structures" % (uniprot_id,ntop))
+            if len(strucmap.index) < ntop:
+                print(strucmap[pcol])
+            else:
+                print(strucmap[pcol].iloc[range(ntop)])
+
+        # Write ranked structure map to disk
+        strucmap.to_csv(data_dir + "/" + strucmap_filename, index=False, sep=';')
+
+        # Structure with highest cover_obs and cover_exp (secondary score)
+        if verbose > 0:
+            i_max_cover_obs = strucmap.sort_values(by=['cover_obs','score'], ascending=False).index.values[0]
+            i_max_cover_exp = strucmap.sort_values(by=['cover_exp','score'], ascending=False).index.values[0]
+            print("%s best structure and structures with highest cover_obs and cover_exp" % (uniprot_id))
+            print(strucmap[pcol].iloc[[0,i_max_cover_obs,i_max_cover_exp]])
+
+        # # Determine the optimal number of structures that cover target protein
+        # n_select = number_of_covering_structures(uniprot_record, strucmap, verbose=verbose)
+
+        # Write aligned sequences to FASTA file
+        seqrec_list = [SeqRecord(uniprot_record.seq, id=uniprot_record.id, description=uniprot_record.description)]
+        seqrec_list += [SeqRecord(Seq(seq_str), id="%s-%s" % (pdb,c), description="score=%.3f cover_obs=%.3f" % (score,cover_obs))
+                        for (seq_str,pdb,c,score,cover_obs) in
+                        zip(strucmap['aligned_resseq_obs'], strucmap['pdb'], strucmap['chain'], strucmap['score'], strucmap['cover_obs'])]
+        SeqIO.write(seqrec_list, data_dir + "/alignments_" + uniprot_id + ".fasta", "fasta")
+
+        print("")
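+
+# Example invocation (a sketch; the uniprot IDs below are placeholders from the test list
+# above, and the ./data directory is expected to exist since the mkdir code is commented out):
+#
+#   python3 struc_select_sifts.py "P60484;P0DP23"
+#
+# For each ID this writes data/uniprot_<ID>.xml, a ranked ';'-separated structure map in
+# data/strucmap_<ID>.csv and the aligned sequences in data/alignments_<ID>.fasta.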