From 218f37a7e7dbbb6ccbe749d20222e599df50b616 Mon Sep 17 00:00:00 2001 From: j0kaso Date: Fri, 17 Jun 2022 00:03:06 +0200 Subject: [PATCH] [prism_parser] add static files for sharing --- software/scripts/FillVariants.py | 139 +++ software/scripts/PrismData.py | 1407 ++++++++++++++++++++++++ software/scripts/struc_select_sifts.py | 1077 ++++++++++++++++++ 3 files changed, 2623 insertions(+) create mode 100644 software/scripts/FillVariants.py create mode 100755 software/scripts/PrismData.py create mode 100644 software/scripts/struc_select_sifts.py diff --git a/software/scripts/FillVariants.py b/software/scripts/FillVariants.py new file mode 100644 index 0000000..315f8f0 --- /dev/null +++ b/software/scripts/FillVariants.py @@ -0,0 +1,139 @@ +############################################################## +# +# Author: Johanna K. S. Tiemann +# +# Copyright: CC +# +# Last edited: 2020-11-12 +############################################################## + +""" +Function creating each possible variant with WT information + +Commands: +======= + + +Example: +======= + +python FillVariants.py prism_uniprot_XXX_uniprot_id.txt + + +""" + +# Standard library imports +from argparse import ArgumentParser +from datetime import datetime +import os + +# Third party imports +import pandas as pd + +# Local application imports +from PrismData import PrismParser, VariantData + + +def parse_args(): + """ + Argument parser function + """ + + parser = ArgumentParser( description="" ) + + # Initiating/setup command line arguments + parser.add_argument( 'file', + type=str, + help="Prism data file to process" + ) + parser.add_argument('--output_dir', '-o', + type=str, + default='', + help="Directory where files will be written. If not specified, the input directory will be used." + ) + parser.add_argument('--incl_del', '-i', + default=False, + type=lambda s: s.lower() in ['true', 't', 'yes', '1'], + help="generates values for ~ and * too. 
Default False" + ) + + args = parser.parse_args() + + # Generate and check saved outputs + if args.output_dir!='': + if not os.path.isdir(args.output_dir): + os.makedirs(args.output_dir) + else: + args.output_dir = os.path.dirname(os.path.abspath(args.file)) + if args.incl_del != False: + args.incl_del = True + return args + + +def copy_wt_variants(dataframe, incl_del=False): + #copy wt columns to all variants + + if 'aa_var' in dataframe.columns: + dataframe = dataframe.drop(columns=['aa_var']) + final_df = dataframe.copy() + final_df['variant'] = final_df['variant'].str.replace(r'=', 'A') + final_df['aa_var'] = ['A']*len(dataframe['variant']) + if incl_del: + resi_list = ['C','D','E', 'F','G','H', 'I','K','L','M','N','P','Q','R','S','T','V','W','Y', '~', '*'] + else: + resi_list = ['C','D','E', 'F','G','H', 'I','K','L','M','N','P','Q','R','S','T','V','W','Y'] + for variant in resi_list: + df = dataframe.copy() + df['variant'] = df['variant'].str.replace('=', variant) + df['aa_var'] = [variant]*len(dataframe['variant']) + final_df = pd.concat([final_df, df], axis=0, ignore_index=True) + final_df['aa_var'] = final_df['variant'].apply(lambda x: x[-1]) + final_df['aa_ref'] = final_df['variant'].apply(lambda x: x[0]) + final_df['res'] = final_df['variant'].apply(lambda x: int(x[1:-1])) + mask = final_df['aa_ref']==final_df['aa_var'] + final_df.loc[mask, 'aa_var'] = '=' + final_df['variant'] = final_df['aa_ref'] + final_df['res'].astype(str) + final_df['aa_var'] + final_df = final_df.sort_values(by=['res','aa_var']) + for elem in ['n_mut', 'aa_ref', 'resi', 'aa_var', 'res']: + if elem in final_df.columns: + final_df = final_df.drop(columns=elem) + final_df = final_df.drop_duplicates().reset_index(drop=True) + return(final_df) + + +def write_prism(metadata, dataframe, prism_file, comment=''): + variant_dataset = VariantData(metadata, dataframe) + parser = PrismParser() + parser.write(prism_file, variant_dataset, comment_lines=comment) + + +def read_from_prism(primsfile): + parser = PrismParser() + dataframe = parser.read(primsfile).dataframe + meta_data = parser.read_header(primsfile) + meta_data.pop('variants') + return meta_data, dataframe + + +def main(): + """ + Main function called as default at the end. + """ + # get user input arguments + args = parse_args() + + # read metadata and dataframe + meta_data, dataframe = read_from_prism(args.file) + + # create new dataframe + new_df = copy_wt_variants(dataframe, incl_del=args.incl_del) + + # write dataframe + file_name = os.path.basename(args.file) + new_prism_file = os.path.join(args.output_dir, f'{file_name[:-4]}_filled{file_name[-4:]}') + comment = [ f"{datetime.date(datetime.now())} - expanded variant data",] + write_prism(meta_data, new_df, new_prism_file, comment) + + +if __name__ == '__main__': + main() diff --git a/software/scripts/PrismData.py b/software/scripts/PrismData.py new file mode 100755 index 0000000..3e42da4 --- /dev/null +++ b/software/scripts/PrismData.py @@ -0,0 +1,1407 @@ +#!/storage1/shared/software/anaconda3/bin/python3 + +# Copyright (C) 2020 Kristoffer Enoe Johansson + +"""Module for data handling in the PRISM project + +This module implments classes for parsing (PrismParser) and handling +(PrismData derived classes) of data files. + +In general, calling PrismParser.read(filename) will return a data object of +the same derived class, e.g. a VariantData object. +See documentation of derived parser and data classes for help. + +See README.md for general file format definitions. 
+""" + +__version__ = 1.002 + +from Bio import Seq,SeqRecord,SeqIO,pairwise2,SubsMat +from Bio.SubsMat import MatrixInfo +import numpy as np +import pandas as pd +import yaml,csv,copy,time + + +class PrismFormatError(Exception): + """Exception raised for format errors in prism data files""" + +class PrismValueFail(Exception): + """Exception raised when a threshold was not met""" + + +class PrismParser: + """Read and write prism data files + + Examples + -------- + Parsing a data file + >>> parser = PrismData.PrismParser() + >>> data = parser.read("prism_mave/prism_mave_001_GFP.txt") + + Parsing the YAML file header only is much faster for large files + >>> meta_data = parser.read_header("prism_mave/prism_mave_001_GFP.txt") + """ + + def read(self, filename, verbose=0): + """Read data and meta data from a PRISM data file + + Read the meta-data section (via read_header) and data section (via Pandas.read_csv) and + check format adherence and the consistancy between the two. + + Parameters + ---------- + filename : string + Name and path of file to read + verbose : int, optional + Level of output + + Returns + ------- + PrismData derived instance + """ + + if verbose > 0: + start_time = time.process_time() + + # Read meta-data and check for necessary fields + metadata = self.read_header(filename, verbose=verbose) + + # Read data section + dataframe = pd.read_csv(filename, delim_whitespace=True, comment='#', header=0, + keep_default_na=True, na_values=['Na','na']) + + # Determine type of data + data = None + if dataframe.keys()[0] == "variant": + data = VariantData(metadata, dataframe) + # elif dataframe.keys()[0] == "resi": + # data = ResidueData(metadata, dataframe) + # checks ... + else: + raise PrismFormatError("Could not determine data type of file %s from first column name %s" % (filename, dataframe.columns[0])) + + data.check(verbose=verbose) + + if verbose > 0: + elapsed_time = time.process_time() - start_time + print("Total time of parsing %s %.1f sec" % (filename,elapsed_time)) + + return(data) + + def write(self, filename, data, comment_lines=None, no_overwrite_metadata=False, verbose=0): + """Write a PrismData object to a file + + Will check the consistency of variants and cloumns meta-data fields. + + Parameters + ---------- + filename : string + Name and path of file to write + data : VariantData + Data set to write + comment_lines : list of strings + Each list element is printed as a comment line between header and data sections + no_overwrite_metadata : bool + Do not allow the check function to overwrite meta data. Except instead + """ + + # Replace whitespace in column names if present + has_whitespace = [cn.find(" ") > -.5 for cn in data.metadata['columns'].keys()] + if any(has_whitespace): + if verbose > 0: + print("White space in columns names replaced with _ (they are white-space separated)") + # Replace in data frame + data.dataframe.columns = [cn.strip().replace(" ","_") for cn in data.dataframe.columns] + # Replace in meta data + data.metadata["columns"] = {k.strip().replace(" ","_"):v for (k,v) in data.metadata['columns'].items()} + + # Check data before printing + data.check(no_overwrite_metadata=no_overwrite_metadata, verbose=verbose) + + # if present, remove the index columns (aa_ref, etc.) 
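+        # (they are derived from the 'variant' column and rebuilt by check()
+        # on the next read, so writing them would only duplicate information)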
+ write_cols = list(data.dataframe.columns) + for col in data.index_column_names: + if col in write_cols: + write_cols.remove(col) + + # Write a file + with open(filename, "w") as fh: + fh.write(data.header_to_string(skip_fields=['filename'])) + if comment_lines: + for line in comment_lines: + fh.write("# "+line+"\n") + fh.write("#\n") + # data.dataframe.to_csv(fh, float_format='%8.4f', sep=' ', header=True, index=False, columns=write_cols, + data.dataframe.to_csv(fh, sep=' ', header=True, index=False, columns=write_cols, + quoting=csv.QUOTE_NONE, escapechar=' ', na_rep=" NA") + + def is_aa_one_nat(self, sequence, additional=""): + """Test if a sequence consists of single-letter natural amino acids only (case-insensitive) + + Parameters + ---------- + sequence : string + Amino acid sequence + additional : string, optional + Additional characters to allow, case insensitive + """ + for aa in sequence.upper(): + if not (aa in "ACDEFGHIKLMNPQRSTVWY" or aa in additional.upper()): + return(False) + return(True) + + def read_header(self, filename, verbose=0): + """Read the YAML header of a data file and run check_header + + Parameters + ---------- + filename : string + Path and name of the file to read + verbose : int, optional + Level of output + + Returns + ------- + dict + The meta data as a dictonary of one or more levels of dictionaries + """ + header_begin = False + header_str = "" + with open(filename, "r") as file_handle: + for line in file_handle: + if line.replace(' ','')[0:4] == '#---': + if header_begin: + header = yaml.safe_load(header_str) + header['filename'] = filename.split('/')[-1] + self.check_header(header) + return(header) + else: + header_begin = True + continue + if header_begin: + if not line.strip()[0] == "#": + raise PrismFormatError("Missing comment char inside YAML header") + s = line.replace('#','',1) + header_str += s + if header_begin: + raise PrismFormatError("Header never ended") + else: + raise PrismFormatError("No header found") + + def check_header(self, header): + """Check a header for fields required by all data files""" + if not 'version' in header.keys(): + raise PrismFormatError("Header has no \'version\' field") + if not 'protein' in header.keys(): + raise PrismFormatError("Header has no \'protein\' field") + if not 'name' in header['protein'].keys(): + raise PrismFormatError("Header has no \'protein: name\' field") + if not 'sequence' in header['protein'].keys(): + raise PrismFormatError("Header has no \'protein: sequence\' field") + if not 'uniprot' in header['protein'].keys(): + raise PrismFormatError("Header has no \'protein: uniprot\' field") + if 'first_residue_number' in header['protein'].keys(): + if int(header['protein']['first_residue_number']) < 0: + raise PrismFormatError("First residue number must be non-negative") + if not 'columns' in header.keys(): + raise PrismFormatError("Header has no \'columns\' field") + if 'filename' in header.keys(): + # Upon parsing, filename sould be added to header and the second field (split by '_') should give the data type + data_type = header['filename'].split('_')[1] + if not data_type.lower() in [hk.lower() for hk in header.keys()]: + print("WARNING: Header has no \'%s\' field but filename indicates this data type" % (data_type)) + # raise PrismFormatError("Header has no \'%s\' field but filename indicates this data type" % (data_type)) + + def write_meta_file(self, data_list, filename, output_type="csv"): + """Write a overview file of meta-data (headers) from a list of data sets + + Parameters + ---------- 
+ data_list : list of PrismData-derived objects or meta data dictionaries + The data to output + filename : string + Name and path of output file + output_type : string, optional + One of the following + -csv: Excel-readable CSV overview of all meta-data + -fasta: Header sequences in FASTA format + """ + header_list = [data.metadata if isinstance(data,PrismData) else data for data in data_list] + if output_type == "csv": + self.__dump_header_csv(filename, header_list) + elif output_type == "fasta": + self.__dump_fasta(filename, header_list) + else: + raise ValueError("write_meta_file argument output_type must be \'csv\' or \'fasta\'") + + def __merge_header_fields(self, header_list, mode="union"): + """Determine a common set of header fields""" + + # Recursive grap of keys - because we can-can + def update_keys(key_dic, dic): + """Update key_dic with keys from dic recursively""" + for key in dic.keys(): + if not key in key_dic.keys(): + key_dic[key] = {} + if type(dic[key]) == dict: + update_keys(key_dic[key],dic[key]) + + # Read all header keys + common_fields = {} + for header in header_list: + update_keys(common_fields, header) + # Return according to mode + if mode.lower()=="union": + return(common_fields) + elif mode.lower()=="intersect": + raise NotImplementedError("__merge_header_fields mode==intersect not implemented") + else: + raise ValueError("Header merge mode must be \'union\' or \'intersect\'") + + def __dump_header_csv(self, filename, header_list): + common_header_keys = self.__merge_header_fields(header_list, mode="union") + + # Columns fields have different names so rename to column01, column02, etc + ncol_list = [len(header['columns'].keys()) for header in header_list] + new_col_names = ["column%02d" % (c+1) for c in range(max(ncol_list))] + common_header_keys['columns'] = dict(zip(new_col_names, [{}]*len(new_col_names))) + + with open(filename, 'w', newline='') as csvfile: + csv_dialect = csv.excel() + csv_dialect.delimiter = ";" + writer = csv.writer(csvfile, dialect=csv_dialect) + # First layer of keys + row = [] + for key in common_header_keys: + row += [key]+['']*np.max([0,len(common_header_keys[key])-1]) + writer.writerow(row) + # Second layer of keys + row = [] + for key in common_header_keys: + if len(common_header_keys[key]) == 0: + row += [key] + else: + row += [kkey for kkey in common_header_keys[key].keys()] + writer.writerow(row) + # Fill in values for each header + for header in header_list: + row = [] + for key in common_header_keys.keys(): + if not key in header: + row += ['']*np.max([1,len(common_header_keys[key])]) + elif len(common_header_keys[key].keys()) == 0: + row += [header[key]] + elif key == 'columns': + for keyi in range(len(common_header_keys[key])): + key_list = list(header['columns'].keys()) + value_list = list(header['columns'].values()) + if keyi < len(header['columns']): + row += [key_list[keyi]+": "+value_list[keyi]] + else: + row += [''] + else: + for kkey in common_header_keys[key].keys(): + if kkey in header[key].keys(): + row += [header[key][kkey]] + else: + row += [''] + writer.writerow(row) + + def __dump_fasta(self, filename, header_list): + sequences = [] + for header in header_list: + file_id = header['filename'].rsplit(".",1)[0] + "_v" + str(header['version']) + seq = Seq.Seq(header['protein']['sequence']) + sequences.append(SeqRecord.SeqRecord(seq, id=file_id, description="")) + with open(filename, "w", newline='') as file_handle: + count = SeqIO.write(sequences, file_handle, "fasta") + return(count) + +class PrismData: + 
"""Base class of PRISM data types + + This class is thought as abstract and should not be instantiated. See documentation of derived classes. + """ + + def __init__(self, metadata, dataframe): + """Constructor + + See documentation of derived classes + """ + self.metadata = metadata + self.dataframe = dataframe + self.index_column_names = None + + def __repr__(self): + """Representation - used e.g. for interactive printing""" + return(self.header_to_string() + str(self.dataframe)) + + def __str__(self): + """Cast to string - used e.g. by print""" + return("%s object of protein %s (uniprot %s) with dataframe shape %s" % + (type(self), self.metadata['protein']['name'], self.metadata['protein']['uniprot'], str(self.dataframe.shape))) + + def copy(self): + """Make a deep copy of self""" + new_meta = copy.deepcopy(self.metadata) + new_dataframe = self.dataframe.copy(deep=True) + new_obj = PrismData(new_meta, new_dataframe) + new_obj.__class__ = self.__class__ + return(new_obj) + + def seq_from_data(self): + """Builds a sequence based on aa_ref and resi data columns + + Data is assumed to have index columns resi and aa_ref + """ + + if not 'aa_ref' in self.dataframe.columns: + self.add_index_columns() + + n_res = 0 + seq = np.full(100,'X') + for (aa_list,resi_list) in zip(self.dataframe['aa_ref'],self.dataframe['resi']): + for (aa,resi) in zip(aa_list,resi_list): + n_res = max(n_res,resi) + if n_res > len(seq): + seq = np.concatenate([seq,np.full(max(100,n_res-len(seq)),'X')]) + if seq[resi-1] == 'X': + seq[resi-1] = aa + elif seq[resi-1] != aa: + raise PrismFormatError("Data reference amino acid mismatch at position %d: % and %s" % + (resi, seq[resi-1], aa)) + return("".join(seq[:n_res])) + + def check_header_and_data_sequence(self, verbose=0): + """Check reference sequence from header against the data""" + data_seq = self.seq_from_data() + meta_seq = self.metadata['protein']['sequence'] + first_resn = int(self.metadata['protein']['first_residue_number']) if 'first_residue_number' in self.metadata['protein'].keys() else 1 + meta_seq = 'X'*(first_resn-1) + meta_seq # For checking purposes, matching indices makes things easier + + # Check sequence length + if len(data_seq) < len(meta_seq): + # Add X for unobserved positions + data_seq = data_seq + 'X'*(len(meta_seq)-len(data_seq)) + elif (len(data_seq) != len(meta_seq)): + raise PrismFormatError("Data has more residues than header sequence") + + # Check all substitution amino acids against header sequence + match = [aa1==aa2 if aa1!='X' and aa2!='X' else True for (aa1,aa2) in zip(meta_seq,data_seq)] + if not np.all(match): + mismatch = np.where(np.logical_not(match))[0] + 1 + raise PrismFormatError("Mismatch between header sequence and data sequence at positions %s:\n%s\n%s" % + (str(mismatch), meta_seq,data_seq)) + if (verbose > 0): + print("Data and header amino acid sequence match with %d of %d positions not observed in data" % (data_seq.upper().count('X'),len(meta_seq))) + return(data_seq) + + def header_to_string(self, skip_fields=[]): + """Format meta-data into a string""" + # Recursive function to write the next level of a dictionary + def dump_subfields(dic, indent_level): + return_str = "" + dict_keys = list(dic.keys()) + # For the top level, order fields (dict is not ordered) + if indent_level == 0: + dict_keys_first = list() + dict_keys_last = list() + for k in ['version','protein']: + if k in dict_keys: + dict_keys.remove(k) + dict_keys_first.append(k) + for k in ['variants','columns','filename']: + if k in dict_keys: + 
dict_keys.remove(k) + dict_keys_last.append(k) + dict_keys = dict_keys_first + dict_keys + dict_keys_last + # Make new indent and run recursively or dump values + for key in dict_keys: + if key in skip_fields: + continue + if isinstance(dic[key], dict): + return_str += "# " + " "*indent_level + key + ":\n" + return_str += dump_subfields(dic[key], indent_level+1) + else: + # format floats values and use standard cast for others + s = "%.3f" % (dic[key]) if isinstance(dic[key],float) else str(str(dic[key])) + return_str += "# " + " "*indent_level + key + ": " + s + "\n" + return(return_str) + + return_str = "" + return_str += "# --------------------\n" + return_str += dump_subfields(self.metadata, 0) + return_str += "# --------------------\n" + return_str += "#\n" + return(return_str) + + def check_column_names(self, verbose=0): + # Check column names - case sensitive + meta_colnames = list(self.metadata['columns'].keys()) + ['n_mut','aa_ref','resi','aa_var'] + data_colnames = self.dataframe.keys()[1:] # first column is data specific and not in header + for cn in data_colnames: + if not cn in meta_colnames: + raise PrismFormatError("Could not find column name \'%s\' in header" % (cn)) + for cn in meta_colnames: + if not cn in data_colnames: + raise PrismFormatError("Could not find header column name \'%s\' in data" % (cn)) + if verbose > 0: + print("Column names checks ok: %s" % (",".join(self.dataframe.keys()))) + + +class VariantData(PrismData): + """Container class for protein variant data + + Parameters + ---------- + metadata : dict + Metadata for data set + dataframe : pandas.DataFrame + Data + + Examples + -------- + Get meta-data dictionary + >>> meta_data = VariantData.metadata + + Get sequence from header + >>> sequence = meta_data['protein']['sequence'] + + Get pandas.DataFrame + >>> data_frame = VariantData.dataframe + + Get all double mutants + >>> data_frame[data_frame['n_mut']==2] + + Get all variants with substitutions into Ala + >>> data_frame = VariantData.get_var_into_aa('A') + + Get all variants involving positions 30-45 (both included) + >>> data_frame = VariantData.get_var_at_pos(range(30,45+1)) + + Get a complete site-saturation (single mutant) library + >>> data_frame = VariantData.saturate_variants() + + Merge three VariantData files and retrieve the union of variants (merge=outer) + >>> merged_data = data1.merge([data2, data3], merge="outer") + """ + # To be implmented: + # Get all (multi-mutant) variants that involves D83V and L11P + # >>> subst2var = VariantData.make_subst2var_map() + # >>> data_frame = VariantData.dataframe[subst2var['D83V'] and subst2var['L11P']] + + def add_index_columns(self): + var_split = [var.split(":") if var!="WT" else [] for var in self.dataframe['variant']] + resi_ll = [[int(sub[1:-1]) for sub in var] if len(var)>0 else [] for var in var_split] + aa_ref_ll = [[mut[0] for mut in var] if len(var)>0 else [] for var in var_split] + aa_mut_ll = [[mut[-1] for mut in var] if len(var)>0 else [] for var in var_split] + self.dataframe['n_mut'] = [len(var) for var in var_split] + self.dataframe['aa_ref'] = aa_ref_ll + self.dataframe['resi'] = resi_ll + self.dataframe['aa_var'] = aa_mut_ll + self.index_column_names = ['n_mut','aa_ref','resi','aa_var'] + + def remove_index_columns(self): + """Remove the indexing columns, i.e. 
['n_mut','aa_ref','resi','aa_var']""" + if 'aa_ref' in self.dataframe.columns: + self.dataframe.drop(['n_mut','aa_ref','resi','aa_var'], axis=1, inplace=True) + self.index_column_names = None + + def get_var_into_aa(self, aa, multimutant_mode='any'): + """Get a data frame of variants with substitutions into a given amino acid + + Parameters + ---------- + aa : character + Amino acid in one-letter code + multimutant_mode : string + Mode of action for multi-mutants + - any : At least one substitution per variant into aa required + - all : All substitutions in variants must be into aa + - exclude : Only return single mutants + """ + # Check argument + if not multimutant_mode in ['any','all','exclude']: + raise ValueError("Function get_var_into_aa argument multimutant_mode must be \'any\', \'all\' or \'exclude\'") + # Make a series of boleans to select rows + if multimutant_mode=='any': + bool_mask = [aa in aa_list for aa_list in self.dataframe['aa_var']] + elif multimutant_mode=='all': + bool_mask = [np.all(np.array(aa_list)==aa) if len(aa_list)>0 else False for aa_list in self.dataframe['aa_var']] + elif multimutant_mode=='exclude': + bool_mask = [aa_list==[aa] for aa_list in self.dataframe['aa_var']] + else: + assert False + return(self.dataframe[bool_mask]) + + def get_var_from_aa(self, aa, multimutant_mode='any'): + # Exclude synonymous and nonsense? + """Get a data frame of variants with substitutions from a given amino acid + + Parameters + ---------- + aa : character + Amino acid in one-letter code + multimutant_mode : string + Mode of action for multi-mutants + - any : At least one substitution per variant from aa required + - all : All substitutions in variants must be from aa + - exclude : Only return single mutants + """ + # Check argument + if not multimutant_mode in ['any','all','exclude']: + raise ValueError("Function get_var_from_aa argument multimutant_mode must be \'any\', \'all\' or \'exclude\'") + # Make a series of boleans to select rows + if multimutant_mode=='any': + bool_mask = [aa in aa_list for aa_list in self.dataframe['aa_ref']] + elif multimutant_mode=='all': + bool_mask = [np.all(np.array(aa_list)==aa) if len(aa_list)>0 else False for aa_list in self.dataframe['aa_ref']] + elif multimutant_mode=='exclude': + bool_mask = [aa_list==[aa] for aa_list in self.dataframe['aa_ref']] + else: + assert False + return(self.dataframe[bool_mask]) + + def get_var_at_pos(self, target_resi, mode='any'): + # Exclude synonymous and nonsense? 
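+        # Illustrative calls (assuming `data` is a parsed VariantData instance):
+        #   data.get_var_at_pos(range(30, 46))          -> variants touching any of positions 30-45
+        #   data.get_var_at_pos([30, 31], mode='exact')  -> variants mutated at exactly positions 30 and 31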
+ """Get a data frame of variants with substitutions in one or more positions + + Parameters + ---------- + target_resi : int or iterable of int + Residue number as given in variants + mode : string + Mode of action for multi-mutants + - any : At least one substitution in one of the given position + - all : Substitutions at all given position + - exact : Substitutions at all given position and no others + """ + # Check argument mode + if not mode in ['any','all','exact']: + raise ValueError("Function get_var_from_aa argument mode must be \'any\', \'all\' or \'exact\'") + # Check argument target_resi + if isinstance(target_resi, int): + target_resi = [target_resi] + elif not isinstance(target_resi, list): + target_resi = list(target_resi) + target_resi = list(np.sort(target_resi)) + if mode == 'any': + bool_mask = [np.any([ti in resi_list for ti in target_resi]) if len(resi_list)>0 else False for resi_list in self.dataframe['resi']] + elif mode == 'all': + bool_mask = [np.all([ti in resi_list for ti in target_resi]) if len(resi_list)>0 else False for resi_list in self.dataframe['resi']] + elif mode=='exact': + bool_mask = [resi_list==target_resi for resi_list in self.dataframe['resi']] + else: + assert False + return(self.dataframe[bool_mask]) + + def get_subst(self, subst, single_only=True): + """Get a data frame of variants that all contain a given substitution + + Parameters + ---------- + subst : string + Substitution, e.g. M1A + single_only : bool + Only return single mutants. If false, all multi-mutants with the given substitution will be returned + """ + bool_mask = [subst in var.split(':') if var!='WT' else False for var in self.dataframe['variant']] + # Add requirement on number of mutations if requested + if single_only: + bool_mask = [a and b for a,b in zip(bool_mask,self.dataframe['n_mut']==1)] + # Return data frame slice + return(self.dataframe[bool_mask]) + + def order_variants(self, aa_order="CDEKRHNQAGSTVMLIFYWP=*~"): + """Order variant according to number of substitutions, residue number, and a specified order of + amino acids. Non-specified amino acids are keps in original order. Inserts are not supported. + """ + n_aa = len(aa_order) + if len(set(aa_order)) != n_aa: + raise ValueError("Argument aa_order to VariantData.order_variants must be unique characters") + aa_dict = dict(zip(aa_order,range(n_aa))) + aa_dict['WT'] = -1 + aa_list = [l[0] if len(l)>0 else 'WT' for l in self.dataframe['aa_var']] + self.dataframe['aa0'] = [aa_dict[aa] if aa in aa_dict.keys() else n_aa for aa in aa_list] + self.dataframe['resi0'] = [l[0] if len(l)>0 else 0 for l in self.dataframe['resi']] + self.dataframe.sort_values(by=['n_mut','resi0','aa0'], inplace=True) + self.dataframe.index = range(self.dataframe.shape[0]) + self.dataframe.drop(columns=['aa0','resi0'], inplace=True) + + def saturate_variants(self, aa_order="CDEKRHNQAGSTVMLIFYWP", verbose=0): + """Saturate variant list to make a full site-saturation library + + Order single mutants according to position and target amino acid and add NA + rows for missing single variants. Inserts are not supported. 
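+
+        A minimal usage sketch (assuming `data` is a parsed VariantData object):
+
+        >>> data.saturate_variants()
+        >>> data.dataframe  # 20 rows per position of the header sequence, NA where unmeasured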
+ + Parameters + ---------- + aa_order : string + Which amino acids to include and the order, default CDEKRHNQAGSTVMLIFYWP + verbose : int, optional + Level of output + """ + n_aa = len(aa_order) + if len(set(aa_order)) != n_aa: + raise ValueError("Argument aa_order to VariantData.saturate_variants must be unique characters") + seq = self.metadata['protein']['sequence'] + first_resn = int(self.metadata['protein']['first_residue_number']) if 'first_residue_number' in self.metadata['protein'].keys() else 1 + var_list = [seq[resi]+str(resi+first_resn)+var if seq[resi] != var else seq[resi]+str(resi+first_resn)+"=" for resi in range(len(seq)) for var in aa_order] + if verbose > 0: + remove_set = set(self.dataframe['variant'])-set(var_list) + if len(remove_set) > 0: + if len(remove_set) > 500: + rm_list = list(remove_set) + print("Saturate variants will remove the following %d variants:\n%s\n... [skip %d variants] ...\n%s" % + (len(rm_list), ",".join([str(i) for i in rm_list[0:25]]), len(rm_list)-50, ",".join([str(i) for i in rm_list[-25:]]))) + else: + print("Saturate variants will remove the following %d variants:\n%s" % (len(remove_set), ",".join([str(i) for i in remove_set]))) + new_dataframe = pd.DataFrame({'variant':var_list}) + new_dataframe = new_dataframe.merge(self.dataframe, on='variant', how='left', copy=False) + self.dataframe = new_dataframe + self.add_index_columns() + self.metadata['variants'] = self.calc_variants_metadata(verbose=verbose) + + def __missing_mask(self, verbose=0): + """Private helper function for clean_na_variants and others + + Returns a bool array that marks rows with NA only. Should be fast.""" + # pd.DataFrame.iterrows() is not vectorized and very slow, DataFrame.apply is not too good either + # return([np.all(row[1]) for row in self.dataframe.drop(columns=['variant','n_mut','aa_ref','resi','aa_var']).isnull().iterrows()]) + if 'aa_ref' in self.dataframe.columns: + return(np.all(self.dataframe.drop(columns=['variant','n_mut','aa_ref','resi','aa_var']).isnull(), axis=1)) + else: + return(np.all(self.dataframe.drop(columns='variant').isnull(), axis=1)) + + def clean_na_variants(self, verbose=0): + """Remove variants rows with all data missing""" + missing_mask = self.__missing_mask() + i = np.where(missing_mask)[0] + if len(i) > 0: + self.dataframe.drop(self.dataframe.index[i], axis=0, inplace=True) + self.dataframe.index = range(self.dataframe.shape[0]) + if verbose > 0: + print("Removed %d variant rows with all data missing" % (len(i))) + + def check_variants_metadata(self, no_overwrite=False, verbose=0): + recalc_variants = self.calc_variants_metadata(verbose=verbose) + if 'variants' in self.metadata: + # Number of variants + if 'number' in self.metadata['variants'].keys(): + if recalc_variants['number'] != self.metadata['variants']['number']: + s = "Header field \'variants:number\' is %d but %d variants (possible including WT) were found" % \ + (self.metadata['variants']['number'], recalc_variants['number']) + if no_overwrite: + raise PrismFormatError(s) + elif verbose > 0: + print("WARNING: "+s) + elif verbose>0 and not no_overwrite: + print("Will add variants:number: %d to header" % (recalc_variants['number'])) + + # Coverage + if 'coverage' in self.metadata['variants'].keys(): + if np.abs(recalc_variants['coverage'] - self.metadata['variants']['coverage']) > 0.01: + s = "Header field \'variants:coverage\' is %.2f but variants calculated to cover %.2f" % \ + (self.metadata['variants']['coverage'], recalc_variants['coverage']) + if no_overwrite: + raise 
PrismFormatError(s) + elif verbose > 0: + print("WARNING: "+s) + elif verbose>0 and not no_overwrite: + print("Will add variants:coverage: %.2f to header" % (recalc_variants['coverage'])) + + # Depth + if 'depth' in self.metadata['variants'].keys(): + if np.abs(recalc_variants['depth'] - self.metadata['variants']['depth']) > 0.01: + s = "Header field \'variants:depth\' is %.2f but variants calculated to depth %.2f" % \ + (self.metadata['variants']['depth'], recalc_variants['depth']) + if no_overwrite: + raise PrismFormatError(s) + elif verbose > 0: + print("WARNING: "+s) + elif verbose>0 and not no_overwrite: + print("Will add variants:depth: %.2f to header" % (recalc_variants['depth'])) + + # Helper function + def strip_all(s): + return(s.lower().replace('.', '').replace('-','').replace('+','').replace('/','').replace(' ','')) + + # Variant width + if 'width' in self.metadata['variants'].keys(): + if recalc_variants['width'] == "single mutants": + if not strip_all(self.metadata['variants']['width']) in ['singlemutants','singlemutant','singlemut','single','singleonly']: + s = "Header field \'variants:width\' is \'%s\' but only single mutants were found - use \'single mutants\'" % \ + (self.metadata['variants']['width']) + if no_overwrite: + raise PrismFormatError(s) + elif verbose > 0: + print("WARNING: "+s) + elif recalc_variants['width'] == "multi mutants": + if not strip_all(self.metadata['variants']['width']) in ['multimutants','multimutant','multimut','multi']: + # multimut = [var_list[i] for i in np.where(n_mut > 2)[0]] + s = "Header field \'variants:width\' is \'%s\' but the following multi-mutant were found - use \'multi mutants\'" % \ + (self.metadata['variants']['width']) + if no_overwrite: + raise PrismFormatError(s) + elif verbose > 0: + print("WARNING: "+s) + elif recalc_variants['width'] == "single and double mutants": + if not strip_all(self.metadata['variants']['width']) in ['singleanddoublemutants','singleanddoublemutant','singleanddouble']: + s = "Header field \'variants:width\' is \'%s\' but variants are single and double mutants - use \'single and double mutants\'" % \ + (self.metadata['variants']['width']) + if no_overwrite: + raise PrismFormatError(s) + elif verbose > 0: + print("WARNING: "+s) + else: + raise ValueError("Recalculated variants width has an unknown value \'%s\'" % (recalc_variants['width'])) + elif verbose>0 and not no_overwrite: + print("Will add variants:width: %s to header" % (recalc_variants['width'])) + else: + if no_overwrite: + raise PrismFormatError("No variants field in meta data") + + if not no_overwrite: + self.metadata['variants'] = recalc_variants + + def calc_variants_metadata(self, verbose=0, return_missing_mask=False): + ret = {} + + if verbose > 1: + print("===============================================================") + print("==== Calculate variants meta-data fields with timing ====") + start_time = time.process_time() + print("Calc number of variants (incl. missing_mask) ...", end=" ", flush=True) + + # Number of variants + missing_mask = self.__missing_mask(verbose=verbose) + ret['number'] = len(self.dataframe['variant']) - np.sum(missing_mask) + + if verbose > 1: + elapsed_time = time.process_time() - start_time + print("done. 
Elapsed time %.1f sec" % (elapsed_time)) + start_time = time.process_time() + print("Calc coverage and depth of variants ...", end=" ", flush=True) + + # Coverage and depth + # Positions with substitutions + non_missing_indices = np.where([not b for b in missing_mask])[0] + flat_resi_list = [resi for resi_list in self.dataframe['resi'][non_missing_indices] for resi in resi_list] + (resi_list,resi_counts) = np.unique(flat_resi_list, return_counts=True) + + # Coverage is positions with substitutions per residue in reference sequence + ret['coverage'] = len(resi_list)/len(self.metadata['protein']['sequence']) + # Depth is the average number of substitutions per position with substitutions + ret['depth'] = np.mean(resi_counts) + + if verbose > 1: + elapsed_time = time.process_time() - start_time + print("done. Elapsed time %.1f sec" % (elapsed_time)) + start_time = time.process_time() + print("Calc width of variants ...", end=" ", flush=True) + + # Width + n_mut = self.dataframe['n_mut'] + if np.all(n_mut <= 1): + ret['width'] = "single mutants" + elif np.max(n_mut) > 2: + ret['width'] = "multi mutants" + else: + ret['width'] = "single and double mutants" + + if verbose > 1: + elapsed_time = time.process_time() - start_time + print("done. Elapsed time %.1f sec" % (elapsed_time)) + print("===============================================================") + + if return_missing_mask: + return((ret,missing_mask)) + else: + return(ret) + + def check(self, no_overwrite_metadata=False, verbose=0): + """Perform all checks of data and meta data + + Checks formatting of all variants in the first column and sorts multimutants + and that the list is unique. Adds index columns. + + Consistancy checks of data and meta data includes column names, reference amino + acid sequence and all variants sub fields. + + Parameters + ---------- + no_overwrite_metadata : bool + Allow overwriting of the variants meta-data fields + verbose : int, optional + Level of output + """ + # Check and update variant column + if verbose > 1: + print("===============================================================") + print("==== Check data section variant list with timing ====") + start_time = time.process_time() + print("Make var_list ...", end=" ", flush=True) + + var_list = [s.upper() for s in self.dataframe['variant']] + + # At least 2 characters for 'WT' + if np.any([len(var) < 2 for var in var_list]): + il = np.where([len(var) for var in var_list] < 2)[0] + raise PrismFormatError("Bad variants: %s" % (str([var_list[i] for i in il]))) + + if verbose > 1: + elapsed_time = time.process_time() - start_time + print("done. Elapsed time %.1f sec" % (elapsed_time)) + start_time = time.process_time() + print("Sort variants ...", end=" ", flush=True) + + # Sort multi-mutants according to residue number + var_split_unordered = [var.split(":") if var!="WT" else [] for var in var_list] + resi_ll_unordered = [[int(subst[1:-1]) for subst in var] if len(var)>0 else [] for var in var_split_unordered] + + # Make a new variant list with ordered multi-mutants + var_split = [] + for ivar in range(len(var_split_unordered)): + if len(set(resi_ll_unordered[ivar])) < len(resi_ll_unordered[ivar]): + raise PrismFormatError("Variant %d %s has more substitutions on the same position" % (ivar+1, var_list[ivar])) + var_split.append( [var_split_unordered[ivar][i] for i in np.array(resi_ll_unordered[ivar]).argsort()] ) + + if verbose > 1: + elapsed_time = time.process_time() - start_time + print("done. 
Elapsed time %.1f sec" % (elapsed_time)) + start_time = time.process_time() + print("Split all substitutions ...", end=" ", flush=True) + + # Split substitutions in amino acids and residue number + resi_ll = [[int(sub[1:-1]) for sub in var] if len(var)>0 else [] for var in var_split] + aa_ref_ll = [[mut[0] for mut in var] if len(var)>0 else [] for var in var_split] + aa_mut_ll = [[mut[-1] for mut in var] if len(var)>0 else [] for var in var_split] + + if verbose > 1: + elapsed_time = time.process_time() - start_time + print("done. Elapsed time %.1f sec" % (elapsed_time)) + start_time = time.process_time() + print("Make new_var_list ...", end=" ", flush=True) + + # Check all substitutions and make new variant list + new_var_list = [] + for ivar in range(len(resi_ll)): + subst_list = [] + for isub in range(len(resi_ll[ivar])): + aa1 = aa_ref_ll[ivar][isub] + resi = resi_ll[ivar][isub] + aa2 = aa_mut_ll[ivar][isub] + if not PrismParser.is_aa_one_nat(None, aa2, additional="*=~"): + raise PrismFormatError("Substitution %s%d%s is not to a standard amino acid" % (aa1,resi,aa2)) + if aa1 == aa2: + if verbose > 0: + print("Synonymous substitution %s%d%s renamed to %s%d=" % (aa1,resi,aa2,aa1,resi)) + aa_mut_ll[ivar][isub] = '=' + subst_list.append("%s%d%s" % (aa_ref_ll[ivar][isub], resi_ll[ivar][isub], aa_mut_ll[ivar][isub])) + new_var_list.append(":".join(subst_list) if len(subst_list)>0 else "WT") + + # Check if variants are unique + (var_unique,var_counts) = np.unique(new_var_list, return_counts=True) + if np.any(var_counts > 1): + raise PrismFormatError("The following variants are listed multiple times:\n%s" % (var_unique[np.where(var_counts > 1)[0]])) + + # Update data frame + self.dataframe['variant'] = new_var_list + + # Update index columns now that we have them + self.dataframe['n_mut'] = [len(var) for var in var_split] + self.dataframe['aa_ref'] = aa_ref_ll + self.dataframe['resi'] = resi_ll + self.dataframe['aa_var'] = aa_mut_ll + self.index_column_names = ['n_mut','aa_ref','resi','aa_var'] + + if verbose > 1: + elapsed_time = time.process_time() - start_time + print("done. Elapsed time %.1f sec" % (elapsed_time)) + print("===============================================================") + + self.check_column_names(verbose=verbose) + self.check_header_and_data_sequence(verbose=verbose) + self.check_variants_metadata(no_overwrite=no_overwrite_metadata, verbose=verbose) + + def to_new_reference(self, target_seq=None, first_resn=None, verbose=0, + min_identity=.8, min_coverage=.1, mismatch="remove", allow_inserts=True, allow_deletions=True): + """Map variants from data file onto a new reference sequence + + This is the workhorse of data set merging + + Parameters + ---------- + target_seq : string + Amino acid sequence that variants are aligned to + first_resn: integer >= 0 + If None, the header value is used if present, else one. If given the header value is replaced + min_identity : float in [0.0, 1.0], optional + Raise exception if the sequence identity is less than this (in covered region) + min_coverage : float in [0.0, 1.0], optional + Raise exception if the fraction of matched positions (i.e. 
not indel/gap) is less than this + This is a symmetric kind of coverage: (N_res-N_indel)/N_res where N_res is the lengths of the + longest sequence + mismatch : string, optional + How to handle mismatches in the alignment + -convert: Change reference amino acid to match the new reference + -remove: Remove variants that involve mismatch positions + -not_allowed: Raise exception is a mismatch is incountered + allow_inserts : bool, optional + How to handle target sequence position missing in the sequence of the data + If true, the residue numbering will be ajusted accordingly + allow_deletions : bool, optional + How to handle data sequence positions missing in the target sequence + If true, the residue numbering will be ajusted accordingly + verbose : int, optional + Level of output + """ + + # Determine new sequence offset and how much to shift residue numbering + first_resn_meta = int(self.metadata['protein']['first_residue_number']) if 'first_residue_number' in self.metadata['protein'].keys() else 1 + if target_seq is None and first_resn is None: + # Nothing to do + return self + elif first_resn is None: + # Only alignment to target seq + first_resn = first_resn_meta + resi_shift_init = 0 + else: + # Shift residue numbering (and possible align to target sequence) + resi_shift_init = first_resn - first_resn_meta + + if first_resn < 0: + raise ValueError("Argument first_resn to VariantData.to_new_reference must be a non-negative integer") + + seq = self.metadata['protein']['sequence'] + n_res_data = len(seq) + + # Which original data residue indices (0-based) to change + resi_rm = [] + resi_shift = np.full(n_res_data,resi_shift_init) + aa_change = {} + if not target_seq is None: + if not PrismParser.is_aa_one_nat(None, target_seq, "X"): + raise ValueError("Argument target_seq to VariantData.to_new_reference must be a single-letter amino acid string (or None)") + n_res_target = len(target_seq) + # Align sequences + # MatrixInfo supports a wildcard 'X' so use upper case. Open and extend penalty 11 and 1 is blast default + # 2DO: Consider to write gap_function where X--XX is 2 openings and --XXX is only one, i.e. NT-gap is not an opening + align = pairwise2.align.globalds(target_seq.upper(), seq.upper(), MatrixInfo.blosum62, -3, -1) + # Check for alternative alignments + if len(align) > 1 and verbose > 0: + print("Found %d alignments" % (len(align))) + if verbose > 0: + # Dump verbose number of alignments + for ia in range(min([verbose,len(align)])): + print("==== alignment %d of %d ====" % (ia+1,len(align))) + print(pairwise2.format_alignment(*align[ia])) + # Check for equally good alignments, a good alignment has high score + if len(align) > 1: + align_scores = np.array([a[2] for a in align]) + if any(align_scores[0] - align_scores[1:] < 1e-3): + print("WARNING: There are alignments for %s with same or better scores (try verbose). 
Sores: %s" % + (self.metadata['protein']['name'],str(align_scores))) + # Use first alignment + align = align[0] + assert len(align[0]) == len(align[1]) + n_align = len(align[0]) + # Number to convert 0-based alignment index to 0-based data (original) residue numbering + ialign_shift = 0 + n_indel = n_match = n_mismatch = 0 + # Iterate over alignment, possible including gaps + for ialign in range(len(align[0])): + if align[0][ialign] == '-': + # Deletion: Position in data sequence not present in target sequence + if not allow_deletions: + raise PrismValueFail("Aborting alignment due to deletion at position %d which is not allowed" % (ialign+ialign_shift)) + # Remove all substitutions at this position + resi_rm.append(ialign+ialign_shift) + # Downstream position are shifted to the left + resi_shift[(ialign+ialign_shift):] -= 1 + n_indel += 1 + elif align[1][ialign] == '-': + # Insertion: Position in target sequence not present in data sequence + if not allow_inserts: + raise PrismValueFail("Aborting alignment due to insertion at position %d which is not allowed" % (ialign+ialign_shift)) + resi_shift[(ialign+ialign_shift):] += 1 + # Shift conversion number + ialign_shift -= 1 + n_indel += 1 + elif align[0][ialign] != align[1][ialign]: + # Mismatch of 'X' always allowed since it is always mismatched + # Variants matched to X are always removed because position is ambiguous if adjacent to gap and because X is used for numbering offset + if align[0][ialign] == 'X' or align[1][ialign] == 'X': + resi_rm.append(ialign+ialign_shift) + elif mismatch == "not_allowed": + raise PrismValueFail("Aborting alignment due to mismatch at position %d which is not allowed" % (ialign+ialign_shift)) + elif mismatch == "remove": + # Mark the substitutions at this position for removal + resi_rm.append(ialign+ialign_shift) + elif mismatch == "convert": + # Change reference amino acid + aa_change[ialign+ialign_shift] = align[0][ialign] + else: + raise ValueError("mismatch argument must be in [not_allowed,convert,remove]") + n_mismatch += 1 + else: + n_match += 1 + # Is alignment ok? + n_res = max(n_res_target, n_res_data) + coverage = (n_res-n_indel)/n_res + if coverage < min_coverage: + raise PrismValueFail("Sequence coverage %.3f (%d/%d) below threshold of %.3f" % (coverage, n_res-n_indel, n_res, min_coverage)) + identity = (n_res_target-n_mismatch)/n_res_target + if identity < min_identity: + raise PrismValueFail("Sequence identity %.3f below threshold of %.3f" % (identity, min_identity)) + # Update header if alignment is ok + self.metadata['protein']['sequence'] = target_seq + # Report actions + if verbose > 0: + print("Making new reference of %s with coverage %.3f and identity %.3f. 
Maximum shift in residue number is %d" % + (self.metadata['filename'], coverage, identity, max(abs(resi_shift)))) + if verbose > 0 and len(resi_rm) > 0: + print("New reference sequence will remove variants at %d positions (mismatch setting: %s; indels: %d):\n%s" % + (len(resi_rm), mismatch, n_indel, ",".join([str(resi+first_resn_meta) for resi in resi_rm]))) + if verbose > 0 and len(aa_change) > 0: + print("New reference sequence will change variants at %d positions (mismatch setting: %s; indels: %d):\n%s" % + (len(aa_change), mismatch, n_indel, ",".join([str(int(resi)+first_resn_meta) for resi in aa_change.keys()]))) + + if not first_resn is None: + self.metadata['protein']['first_residue_number'] = first_resn + assert int(self.metadata['protein']['first_residue_number']) == first_resn + + # Split variant in amino acid letters and residue numbers + aa_ref_ll = list(self.dataframe['aa_ref']) + resi_ll = list(self.dataframe['resi']) + aa_mut_ll = list(self.dataframe['aa_var']) + + # Update all variants + new_var_list = [] + ivar_rm = [] + for ivar in range(len(resi_ll)): + subst_list = [] + for isub in range(len(resi_ll[ivar])): + # 0-based index of original sequence without offset + resi_orig = resi_ll[ivar][isub] - first_resn_meta + # Is substitution marked for removal? + if resi_orig in resi_rm: + ivar_rm.append(ivar) + break + # Update residue number + resi_ll[ivar][isub] += resi_shift[resi_orig] + # Update reference amino acid + if resi_orig in aa_change.keys(): + if aa_change[resi_orig] == aa_mut_ll[ivar][isub]: + if verbose > 2: + print("Data contains a substitution, %s%d%s, into the new reference which will be removed" % + (aa_ref_ll[ivar][isub],resi_ll[ivar][isub],aa_mut_ll[ivar][isub])) + ivar_rm.append(ivar) + break + else: + if verbose > 2: + print("Convert variant due to mismatch: %s%d%s -> %s%d%s" % + (aa_ref_ll[ivar][isub], resi_ll[ivar][isub]-resi_shift[resi_orig], aa_mut_ll[ivar][isub], + aa_change[resi_orig], resi_ll[ivar][isub], aa_mut_ll[ivar][isub])) + aa_ref_ll[ivar][isub] = aa_change[resi_orig] + subst_list.append("%s%d%s" % (aa_ref_ll[ivar][isub], resi_ll[ivar][isub], aa_mut_ll[ivar][isub])) + if len(subst_list) > 0: + new_var_list.append(":".join(subst_list)) + else: + new_var_list.append("WT") + + if verbose > 0 and len(ivar_rm) > 0: + rm_list = [str(self.dataframe['variant'].iat[i]) for i in ivar_rm] + if len(rm_list) > 500: + print("New reference sequence will remove %d variants (mismatch setting: %s; indels: %d):\n%s\n... [skip %d variants] ...\n%s" % + (len(rm_list), mismatch, n_indel, ",".join(rm_list[0:25]), len(rm_list)-50, ",".join(rm_list[-25:]))) + else: + print("New reference sequence will remove %d variants (mismatch setting: %s; indels: %d):\n%s" % + (len(rm_list), mismatch, n_indel, ",".join(rm_list))) + + self.dataframe['variant'] = new_var_list + self.dataframe['aa_ref'] = aa_ref_ll + self.dataframe['resi'] = resi_ll + self.dataframe['aa_var'] = aa_mut_ll + # Remove rows containing variants with deleted substitutions + self.dataframe = self.dataframe.drop(self.dataframe.index[ivar_rm], axis=0) + self.dataframe.index = range(self.dataframe.shape[0]) + + def merge(self, data_list, target_seq=None, first_resn=None, merge='outer', **kwargs): + """Merge a list of data files into a single DataFrame and dump it in a file + + Parameters + ---------- + data_list : list of VariantData + The list of data sets to merge + target_seq : string + Amino acid sequence that all data sets are aligned to. If None, + the sequence of the first data set will be used. 
+ first_resn : Integer >= 0 + Residue number of first amino acid in target_seq, default one. If + target_seq is None, the header value will be used if present or + overwritten if an argument is given here. + merge : string + How to merge: + - outer: output the union of variants, default + - left: Only keep veriants that are present in the self data set + - inner: output the intersect of variants + **kwargs : keyword arguments + Passed to to_new_reference function + """ + if not merge in ['left', 'outer', 'inner']: + raise ValueError("Allowed merge arguments are left, outer or inner") + + merged_data = self.copy() + + if isinstance(data_list, VariantData): + data_list = [data_list] + elif len(data_list) < 1: + raise ValueError("List of data set to merge is empty") + + # Target sequence + if target_seq is None: + # Make from meta data, variant residue numbers will match the index of this + target_seq = self.metadata['protein']['sequence'] + if not first_resn is None: + raise ValueError("merge argument first_resn can only be set if target_seq != None\n" + \ + "Use VariantData.to_new_reference to only shift residue numbering") + first_resn = int(self.metadata['protein']['first_residue_number']) if 'first_residue_number' in self.metadata['protein'].keys() else 1 + else: + # Align self data set to target + if first_resn is None: + first_resn = 1 + else: + if first_resn < 0: + raise ValueError("Argument first_resn must be non-negative") + try: + merged_data.to_new_reference(target_seq, first_resn, **kwargs) + except PrismValueFail as msg: + print("Could not align data to target sequence: %s" % (msg)) + return(None) + + # Remove the index columns ['n_mut','aa_ref','resi','aa_mut'] before merging + merged_data.remove_index_columns() + # Add '_00' to the names of all but variant column + merged_data.dataframe.columns = [merged_data.dataframe.columns[0]] + [s+"_00" for s in merged_data.dataframe.columns[1:]] + + # Init new meta data dictionary + merged_metadata = {'version':0, 'protein':{}, 'merged':{}, 'variants':{}, 'columns':{}} + merged_metadata['protein'] = copy.deepcopy(self.metadata['protein']) + + merged_metadata['merged']["file_00"] = self.metadata['filename']+" (version "+str(self.metadata['version'])+")" + for key in self.metadata['columns'].keys(): + merged_metadata['columns'][key+"_00"] = self.metadata['columns'][key] + + # Align and merge other data sets + c = 0 + for data in data_list: + data_copy = data.copy() + # 2DO: Consider to check uniprot (others are human readable and may have typoes) of data to be merged + try: + data_copy.to_new_reference(target_seq, first_resn, **kwargs) + except PrismValueFail as msg: + print("Skip merge of file %s: %s" % (data_copy.metadata['filename'],msg)) + continue + data_copy.remove_index_columns() + suffix = "_%02d" % (c+1) + data_copy.dataframe.columns = [data_copy.dataframe.columns[0]] + [s+suffix for s in data_copy.dataframe.columns[1:]] + merged_data.dataframe = merged_data.dataframe.merge(data_copy.dataframe, on='variant', how=merge, suffixes=(False,False), copy=False) + if merged_metadata['protein']['uniprot'] != data.metadata['protein']['uniprot']: + print("WARNING: Uniprot mismatch on merge: Merging file %s uniprot %s with file %s of different uniprot %s" % ( + merged_metadata['merged']["file_00"], merged_metadata['protein']['uniprot'], + data.metadata['filename'], data.metadata['protein']['uniprot'])) + merged_metadata['merged']["file_%02d" % (c+1)] = data_copy.metadata['filename']+" (version "+str(data_copy.metadata['version'])+")" + for 
key in data_copy.metadata['columns'].keys(): + merged_metadata['columns'][key+"_%02d" % (c+1)] = data_copy.metadata['columns'][key] + c += 1 + + # Add the index columns ['n_mut','aa_ref','resi','aa_mut'] again + merged_data.add_index_columns() + + merged_metadata['protein']['sequence'] = target_seq + if first_resn != 1: + merged_metadata['protein']['first_residue_number'] = first_resn + merged_data.metadata = merged_metadata + merged_data.metadata['variants'] = merged_data.calc_variants_metadata() + return(merged_data) + + +# class ResidueData(PrismData): +# """Container class for per-residue data +# """ +# +# def _init(self): +# self.index_column_names = ['aa_ref'] +# +# def add_index_columns(self): +# pass +# +# def remove_index_columns(self): +# """Remove the indexing columns, i.e. 'aa_ref'""" +# self.dataframe.drop(['aa_ref'], axis=1, inplace=True) +# +# def check(self, verbose=0): +# # check ResidueData specific stuff +# check_status = None +# try: +# self.check_column_names(verbose=verbose) +# self.check_header_and_data_sequence(verbose=verbose) +# self.check_variants_metadata(no_overwrite=no_overwrite_metadata, verbose=verbose) +# except PrismFormatError as msg: +# raise PrismFormatError("VariantData.check failed. First fail message:\n%s" % (msg)) +# else: +# check_status = True +# +# return(check_status) +# +# def to_new_reference(self, target_seq=None, first_resn=None, verbose=0): +# pass + + +if __name__ == "__main__": + # Parse commandline arguments + import argparse,sys + arg_parser = argparse.ArgumentParser(description="PRISM data file processing and alignment") + # positional arguments + arg_parser.add_argument("files", nargs="+", metavar="DATAFILE", + help="Data files to process") + # keyword arguments + arg_parser.add_argument("--dump_header_csv", metavar="FILE", + help="Dump a .csv file containing an overview of file headers") + arg_parser.add_argument("--dump_fasta", metavar="FILE", + help="Dump all sequences in a fasta file") + arg_parser.add_argument("--parse", action='store_true', + help="Read data file(s), run all checks and write new data file(s)") + arg_parser.add_argument("--merge", metavar="FILE", + help="Merge data files concerning the (approximately) same target protein") + # A run of this main can only have a single target sequence so to parse more files with different sequence, run multiple times + arg_parser.add_argument("--target_seq", metavar="FILE_OR_SEQUENCE", + help="Align all output to this sequence, default is the sequence given in the first file") + arg_parser.add_argument("--saturate", action='store_true', + help="In the output data, add rows to cover all 20 substitutions at all considered positions") + arg_parser.add_argument("--clean_na", action='store_true', + help="In the output data, add rows to cover all 20 substitutions at all considered positions") + arg_parser.add_argument("--first_res_num", metavar="INT", + help="In the output data, assign this number to the first given amino acid of the sequence") + arg_parser.add_argument("-v","--verbose", action="count", default=0, + help="Level of output, default zero is no output") + args = arg_parser.parse_args() + if args.saturate and args.clean_na: + raise ValueError("Arguments saturate and clean_na cannot be used together") + + # Read data files + data_parser = PrismParser() + data_list = [] + for filename in args.files: + try: + data = None + if args.merge or args.parse: + if args.verbose > 0: + print("\nRead data file %s " % (filename),flush=True) + data = data_parser.read(filename, 
verbose=args.verbose) + elif args.dump_header_csv or args.dump_fasta: + # Only read header which is much faster for large files + if args.verbose > 0: + print("\nRead header of %s " % (filename),flush=True) + header = data_parser.read_header(filename, verbose=args.verbose) + data = PrismData(header, None) + else: + break + except PrismFormatError as error: + print("ERROR reading file %s:\nBad file format: %s\n" % (filename,error)) + except yaml.parser.ParserError as error: + print("ERROR reading file %s:\nBad YAML syntax (ParserError) in header:\n%s\n" % (filename,error)) + except yaml.scanner.ScannerError as error: + print("ERROR reading file %s:\nBad YAML syntax (ScannerError, perhaps a missing whitespace after colon?) in header:\n%s\n" % (filename,error)) + except FileNotFoundError: + print("ERROR reading file %s:\nCannot find file\n" % (filename)) + else: + # Only if all is good + data_list.append(data) + + # Check input + if len(data_list) < 1: + print("\nERROR: Missing data files or arguments") + arg_parser.print_usage() + sys.exit(2) + if args.verbose > 0: + print("Done loading %d file(s)\n" % (len(data_list))) + + # Dump overview csv if requested + if args.dump_header_csv: + if args.target_seq or args.first_res_num or args.saturate or args.clean_na: + print("WARNING: Arguments target_seq, first_res_num, saturate and clean_na have no effect on dump_header_csv") + data_parser.write_meta_file(data_list, args.dump_header_csv, output_type="csv") + print("Dumped header overveiw of %d files in %s" % (len(data_list),args.dump_header_csv)) + + # Dump amino acid sequences in fasta format if requested + if args.dump_fasta: + if args.target_seq or args.first_res_num or args.saturate or args.clean_na: + print("WARNING: Arguments target_seq, first_res_num, saturate and clean_na have no effect on dump_fasta") + data_parser.write_meta_file(data_list, args.dump_fasta, output_type="fasta") + print("Dumped %d sequences in %s" % (len(data_list),args.dump_fasta)) + + # Set target sequence and offset for all data files + target_seq = None + first_resn = 1 + if args.target_seq: + if args.target_seq[-6:].lower() == ".fasta" or args.target_seq[-4:].lower() == ".fas": + record = None + for r in SeqIO.parse(args.target_seq, "fasta"): + if not record is None: + # if args.verbose > 0: + print("WARNING: Only using the first sequence record in %s" % (args.target_seq)) + break + record = r + target_seq = str(record.seq) + print("Target sequence \'%s\' of %d residues from file %s" % (record.description, len(target_seq), args.target_seq)) + else: + target_seq = args.target_seq + if args.first_res_num: + first_resn = int(args.first_res_num) + else: + target_seq = (data_list[0]).metadata['protein']['sequence'] + if args.first_res_num: + # Overwrite offset given in header of first file + first_resn = int(args.first_res_num) + elif 'first_residue_number' in (data_list[0]).metadata['protein']: + # Only use offset of first file with that sequence (and not the commandline input) + first_resn = int((data_list[0]).metadata['protein']['first_residue_number']) + + if not data_parser.is_aa_one_nat(target_seq, "X"): + print("ERROR: Target sequence seems not to be one-letter amino acids or \'X\':") + print("Target sequence: %s" % (target_seq)) + sys.exit(2) + + # Parse & dump all files if requested + if args.parse: + filename_suffix = "_parsed" + for data in data_list: + if args.target_seq or first_resn: + data.to_new_reference(target_seq, first_resn, verbose=args.verbose, + min_identity=.9, min_coverage=.1, mismatch="remove", 
allow_inserts=True, allow_deletions=True) + if args.saturate: + data.saturate_variants() + elif args.clean_na: + data.clean_na_variants() + filename_parts = data.metadata['filename'].split("/")[-1].rsplit(".",1) + outfilename = filename_parts[0] + filename_suffix + "." + filename_parts[1] + data_parser.write(outfilename, data) + print("Parsed and wrote %d data file(s) with output filename suffix %s" % (len(data_list), filename_suffix)) + + # Dump a merged data file if requested + if args.merge: + merged_data = data_list[0].merge(data_list[1:], target_seq, first_resn, merge='outer', verbose=args.verbose, + min_identity=.9, min_coverage=.1, mismatch="remove", allow_inserts=True, allow_deletions=True) + if merged_data is None: + print("ERROR: Could not merge data") + sys.exit(2) + + if args.saturate: + merged_data.saturate_variants() + elif args.clean_na: + merged_data.clean_na_variants() + data_parser.write(args.merge, merged_data) + print("Dumped merge of %d data files in %s " % (len(merged_data.metadata['merged']), args.merge)) diff --git a/software/scripts/struc_select_sifts.py b/software/scripts/struc_select_sifts.py new file mode 100644 index 0000000..26d3c82 --- /dev/null +++ b/software/scripts/struc_select_sifts.py @@ -0,0 +1,1077 @@ +#!/usr/local/bin/python3 + +# Copyright (C) 2021 Kristoffer Enoe Johansson + +__version__ = 0.03 + +from argparse import ArgumentParser +import os,sys,argparse +import urllib +import json +import time + +from Bio import SeqIO +from Bio import Align +from Bio.Seq import Seq +from Bio.SeqRecord import SeqRecord +from io import StringIO + +import numpy as np +import pandas as pd +pd.set_option('display.max_colwidth', 20) + +# Global variables to ensure that servers are not pinged too often +__pdb_time = 0.0 +__uniprot_time = 0.0 +__sifts_time = 0.0 + + +def select_structures(uniprot_record, struc_map, n_select=1, min_cover_positions=40, identity_power=1.0, verbose=0): + """Select a set of structures that best represent a uniprot protein based on a structure map + + Assigns a score to each structure and selects the best scoring structure per position: + score = 2.5/method_score * coverage * identitiy**identity_power + where method_score is X-ray resolution, 3 for cryo-em, 4 for nmr and 10 for others + + Parameters + ---------- + uniprot_record : BioPythin SeqRecord object parsed using uniprot-xml format + Information on target protein + struc_map : Pandas DateFrame + Structure map as generated by url_strucmap function below + identity_power : float + In the score function, the fraction identity is raised to this power to achieve a steeper + decent from identity 1.0. Default 1.0 for linear scaling. + min_cover_positions : integer + The minimum number of position that a structure should be optimal for in order to + appear in the final selection. Default 40 thought as a minimum size of a folded domain. + verbose : integer + Default 0 is silent and higher integers increses level of terminal output + """ + uniprot_id = uniprot_record.id + uniprot_seq = str(uniprot_record.seq) + unp_nres = len(uniprot_seq) + nstruc = len(struc_map.index) + res_score_matrix = np.ones([nstruc, unp_nres]) * -1 + score_list = [-1.0]*nstruc + for i in range(nstruc): + pdb_str = "%s-%s" % (struc_map['pdb_id'][i], struc_map['chain_id'][i]) + seq = struc_map['aligned_resseq'][i] + # pdb_nres = struc_map['end'][i] - struc_map['start'][i] + 1 + + # # The indentity measure in the score function compeates with coverage because the coverage can be made larger at the expence + # # of identity. 
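+        # A minimal, self-contained sketch of the per-structure score computed further
+        # down in this loop; the example numbers are hypothetical, only the formula
+        # mirrors the code below:
+        #
+        #   import numpy as np
+        #   def structure_score(method_score, cover_obs, ident_score):
+        #       # sigmoid-weighted experimental quality plus identity-weighted coverage
+        #       return 1.0 / (1.0 + np.exp((method_score - 3.5) * 2.0)) + cover_obs * ident_score
+        #
+        #   structure_score(1.8, 0.95, 0.98)   # 1.8 A X-ray, high coverage -> ~1.90
+        #   structure_score(4.0, 0.95, 0.98)   # NMR (scored as 4.0)        -> ~1.20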
+ # ident = struc_map['ident'][i] / pdb_nres + + # Count identities among observed residues + ident_obs = sum( [aa1==aa2 and not aa2 in "-x" for aa1,aa2 in zip(uniprot_seq.lower(),struc_map['aligned_resseq_obs'][i].lower())] ) + # Number of positions that align, checked to be SIFTS coverage times len(uniprot_seq) + cover = struc_map['coverage'][i] + ident_score = ( ident_obs*1.0/cover )**identity_power + + # Number of oberserved positions + align_obs = struc_map['aligned_resseq_obs'][i].lower() + cover_obs = (unp_nres - align_obs.count('-') - align_obs.count('x')) / unp_nres + + # Assign a score to the method used to determine the structure + method = (struc_map['experimental_method'][i]).lower() + if 'x-ray' in method or 'xray' in method: + method_score = struc_map['resolution'][i] + elif 'cryoem' in method or 'cryo-em' in method: + # Cryo-EM structure same as 3Å X-ray + method_score = 3.0 + # Consider to look for EM resilution + elif 'nmr' in method: + # NMR structure same as 4Å X-ray + method_score = 4.0 + else: + # Assume model, only prefer when nothing else is available + method_score = 5.0 + + # Structure score used for selecting best structure for uniprot + score_list[i] = 1/(1+np.exp((method_score-3.5)*2)) + cover_obs * ident_score + # method_score: 1.8, 2.2, 2.5, 2.7, 3.0, 4.0, 5.0 + # sigmoid 0.968, 0.931, 0.881, 0.832, 0.731, 0.269, 0.047 + + # Structures of identical score is typically different but identical chains of the same asym unit, here argmax selects the first + if verbose > 0: + print("%s %s score %.2f (%s, method_score %.2f, cover_obs %.3f, ident_score = %d/%d^%.1f = %.3f)" % + (uniprot_id, pdb_str, score_list[i], method, method_score, cover_obs, ident_obs, cover, identity_power, ident_score)) + + # Assign structure score to positions that structure cover + seq_array = np.array(list(struc_map['aligned_resseq_obs'][i])) + pos_mask = np.logical_and(seq_array != '-', seq_array != 'x') + res_score_matrix[i,pos_mask] = score_list[i] + + struc_map['score'] = score_list + + # For each position (column in res_score_matrix), find the best (highest scoring) structure + best_struc_index = np.argmax(res_score_matrix, axis=0) + + # Count how many positions each structure best represent + best_struc_counts = np.bincount(best_struc_index) + + # Fill array with structure indices that are never optimal and store in structure map + if best_struc_counts.size < nstruc: + best_struc_counts = np.append(best_struc_counts, [0]*(nstruc-best_struc_counts.size)) + struc_map['selected_pos'] = best_struc_counts + + # Which structure(s) covers most positions + struc_rank = np.flip( np.argsort(best_struc_counts) ) + if verbose > 1: + # print("Matrix with indices of best structure per position") + # print(res_score_matrix) + print("Index of best scoring structure per position") + print(best_struc_index) + print("Number of positions that each structure (row) is selected for: %s" % (str(best_struc_counts))) + print("Ranked indices of the structure map raows: %s" % (str(struc_rank))) + + # Do final selection + selected_indices = [] + for rank in range(len(struc_rank)): + struc_index = struc_rank[rank] + if verbose > 0: + print("Rank %2d is index %2d %s-%s that covers %d positions" % + (rank, struc_index, struc_map['pdb_id'][struc_index],struc_map['chain_id'][struc_index], best_struc_counts[struc_index])) + # Check if structure is ok to use + if best_struc_counts[struc_index] > min_cover_positions: + # print("Using %s chain %s" % (struc_map['pdb_id'][struc_index],struc_map['chain_id'][struc_index])) + 
selected_indices.append(struc_index) + + # Report + if verbose > 0: + nsel = len(selected_indices) + if nsel == 1: + i = selected_indices[0] + print("%s: %s covers %d of %d residues" % (uniprot_id, pdb_str, struc_map['coverage'][i], len(uniprot_seq))) + elif nsel > 1: + struc_str = ", ".join( ["%s-%s cover %d" % (pdb,c,cover*100) for (pdb,c,cover) in + zip(struc_map['pdb_id'][selected_indices], + struc_map['chain_id'][selected_indices], + struc_map['coverage'][selected_indices])] ) + print("%s covered by %d structures: %s" % (uniprot_id, nsel, struc_str)) + + # Determine overlap of structures + overlap = np.zeros([nsel,nsel]) + for i in range(nsel-1): + for j in range(i+1,nsel): + overlap[i,j] = sum( np.logical_and(res_score_matrix[i,] > 0, res_score_matrix[j,] > 0) ) + if nsel < 3: + print("The two structures overlap at %d positions" % (overlap[1,1])) + else: + print("Overlaping positions matrix:") + print(overlap) + else: + print("ERROR: No structures found for %s" % (uniprot_id)) + + # Return the rows of stuc_map input that was selected + return(struc_map.iloc[selected_indices,:]) + + +def get_method_score(smap_row, verbose=0): + method = smap_row.method_exp.lower() + if 'x-ray' in method or 'xray' in method: + method_score = smap_row.method_res + if method_score is None: + if verbose > 0: + print("%s-%s: No resolution given for X-ray structure. Using 2.5" % (smap_row.pdb, smap_row.chain)) + method_score = 2.5 + elif 'electron microscopy' in method: + method_score = smap_row.method_res + if method_score is None: + if verbose > 0: + print(" %s-%s: No resolution given for EM structure. Using 3.5" % (smap_row.pdb, smap_row.chain)) + method_score = 3.5 + elif 'nmr' in method: + # NMR structure same as 4Å X-ray + method_score = 4.0 + else: + # Assume model, only prefer when nothing else is available + if verbose > 0: + print("%s-%s: Assuming method '%s' is model" % (smap_row.pdb, smap_row.chain, smap_row.method_exp)) + method_score = 5.0 + return(method_score) + + +def rank_structures(target_record, strucmap, verbose=0): + nres_target = len(target_record.seq) + + # Coverage score + # Conceptually, score terms are weighted in relation to coverage because this is available early on from SIFTS + # Cover_exp includes un-observed residues but not deletions (actually deleted from the molecules) + # Cover_obs includes mismatch and modified residues but not un-observed + cover_score = 0.0 + if 'cover_obs' in strucmap.columns: + cover_score = strucmap['cover_obs'] + elif 'cover_exp' in strucmap.columns: + cover_score = strucmap['cover_exp'] + + # Score to evaluate the experimental method by which the structure was determined + method_scores = 0.0 + if 'method_exp' in strucmap.columns: + method_scores = np.array( [get_method_score(row, verbose=verbose) for row in strucmap.itertuples(index=False)] ) + # method_scores 1.3, 1.5, 1.7, 1.9, 2.1, 2.2, 2.3, 2.5, 3.0, 4.0, 5.0 + # sigmoid 0.9955, 0.9933, 0.9900, 0.9852, 0.9781, 0.9734, 0.9677, 0.9526, 0.8808, 0.5000, 0.1192 + method_scores = 1/(1+np.exp((method_scores-3.8)*2)) + + # Score based on inserts in the structure (not present in the target sequence), see case P07550 + insert_score = 0 + if 'inserts' in strucmap.columns: + # Inserts are completely removed from the alignment so they need to be penalized + insert_score = -5.0 * strucmap['inserts'] / nres_target + + # Score based on chemically modified residues + mod_score = 0 + if 'modified' in strucmap.columns: + # If cover and identity is the same, prefer structure with least modified residues + # If 
a structure has one additional aligned residue, it counts 1.5/nres, and a modified sould be worse + # since this is typically in the folded part of the structure whereas the additionl residue likely to + # be in the terminals + mod_score = - 1.7 * strucmap['modified'] / nres_target + + # Mismatch score + mismatch_score = 0 + if 'mismatch' in strucmap.columns: + # A mismatch should weight more than 2 residue coverage because coverage is typically extended in + # terminals and a mismatch in the folded region + mismatch_score = -2.5 * strucmap['mismatch'] / nres_target + + # Everything else equal, prefer chain A + chain_score = 0.0 + if 'chain' in strucmap.columns: + chain_score = 1e-5 * (strucmap['chain']=='A') + + # Total score + scores = method_scores + cover_score + insert_score + mod_score + mismatch_score + chain_score + strucmap['score'] = scores + + # Reorder structure map to have highest scoring structures on top + strucmap = strucmap.iloc[ np.flip(np.argsort(scores)) ] + # Rename row to running indices and do not create a column with old indices + strucmap.reset_index(inplace=True, drop=True) + + return(strucmap) + + +def url_sifts_strucmap(target_record, min_ping_time=5.0, verbose=0): + target_id = target_record.id + target_seq = str(target_record.seq) + if verbose > 0: + # print("") + print("%s: Initial structure map from URL requests to SIFTS" % (target_id)) + # if verbose > 1: + # print("%s target sequence:" % (target_id)) + # print(target_seq) + + # Get best structure mappings from sifts + sifts_list = url_sifts(target_id, min_ping_time, verbose=verbose) + n_struc = len(sifts_list) + if verbose > 0: + print("%s: Recieved map of %d structures from SIFTS" % (target_id, n_struc)) + if verbose > 2: + print("%s: raw SIFTS record:" % (target_id)) + print(sifts_list) + + # SIFTS keys {'end': 101, 'chain_id': 'C', 'pdb_id': '6xog', 'start': 1, 'unp_end': 101, 'coverage': 1, 'unp_start': 1, 'resolution': 1.98, + # 'experimental_method': 'X-ray diffraction', 'tax_id': 9606} + columns = ['pdb', 'chain', 'score', 'select', 'method_exp', 'method_res', 'unp_start', 'unp_end', 'resseq_start', 'resseq_end', 'cover_exp'] + + sifts_dict = {'pdb': [d['pdb_id'] for d in sifts_list], 'chain': [d['chain_id'] for d in sifts_list], + 'score': [0.0]*n_struc, 'select': [1]*n_struc, + 'method_exp': [d['experimental_method'] for d in sifts_list], 'method_res': [d['resolution'] for d in sifts_list], + 'cover_exp': [d['coverage'] for d in sifts_list], + 'unp_start': [d['unp_start'] for d in sifts_list], 'unp_end': [d['unp_end'] for d in sifts_list], + 'resseq_start': [d['start'] for d in sifts_list], 'resseq_end': [d['end'] for d in sifts_list]} + + return( pd.DataFrame(sifts_dict, columns=columns) ) + + +def align_strucmap(target_record, strucmap, min_ping_time=5.0, verbose=0): + """Update a structure map with aligned RESSEQ from PDBe URL request""" + + target_id = target_record.id + target_seq = str(target_record.seq) + n_struc = len(strucmap.index) + + # New columns to add to strucmap + cover_obs_col = [] + ident_exp_col = [] + ident_obs_col = [] + inserts_col = [] + deletions_col = [] + mismatch_col = [] + non_obs_col = [] + modified_col = [] + resseq_col = [] + aligned_resseq_col = [] + aligned_resseq_obs_col = [] + + c = 0 + for smap in list(strucmap.itertuples(index=False))[c:]: + c += 1 + if verbose > 0: + print("=== Structure %d of %d for %s: %s-%s" % (c,n_struc,target_id,smap.pdb,smap.chain)) + + # A structure map entry must have pdb and chain fields + if not 'pdb' in smap._fields: + raise 
ValueError("ERROR: Structure map row with no pdb field: %s" % (str(smap))) + if not 'chain' in smap._fields: + raise ValueError("ERROR: Structure map row with no chain field: %s" % (str(smap))) + pdb_str = "%s-%s" % (smap.pdb, smap.chain) + + # Get RESSEQ sequence, i.e. the sequence of the protein actually used to determine structure (including residues not observed in the structure) + try: + pdb_resseq = url_pdb_resseq(smap.pdb, smap.chain, min_ping_time=min_ping_time, verbose=verbose) + except Exception as err: + print("ERROR: RESSEQ request for %s failed (%s)" % (pdb_str, err)) + cover_obs_col.append(np.nan) + ident_exp_col.append(np.nan) + ident_obs_col.append(np.nan) + inserts_col.append(np.nan) + deletions_col.append(np.nan) + mismatch_col.append(np.nan) + non_obs_col.append(np.nan) + modified_col.append(np.nan) + resseq_col.append(np.nan) + aligned_resseq_col.append(np.nan) + aligned_resseq_obs_col.append(np.nan) + continue + + # Look for modified residues + pdb_modified = url_pdb_modified(smap.pdb, smap.chain, min_ping_time=min_ping_time, verbose=verbose) + + # Request observed moities and residues in structure + pdb_obs = url_pdb_observed(smap.pdb, chain_id=None, min_ping_time=min_ping_time, verbose=verbose) + obs_chains = {} + n_macro_mol = len(pdb_obs["molecules"]) + for i_macro_mol in range(n_macro_mol): + # How many chains of macro molecule type (also has key 'entity_id' : integer) + mmol_id = pdb_obs["molecules"][i_macro_mol]["entity_id"] + n_mmol_chains = len(pdb_obs["molecules"][i_macro_mol]["chains"]) + for i_mmol_chain in range(n_mmol_chains): + # ID of this chain (also has key 'struct_asym_id' : char) + chain_id = pdb_obs["molecules"][i_macro_mol]["chains"][i_mmol_chain]["chain_id"] + # How many fragments are observed for chain + n_chain_frag = len(pdb_obs["molecules"][i_macro_mol]["chains"][i_mmol_chain]["observed"]) + frag_list = [] + for i_frag in range(n_chain_frag): + start_dict = pdb_obs["molecules"][i_macro_mol]["chains"][i_mmol_chain]["observed"][i_frag]["start"] + end_dict = pdb_obs["molecules"][i_macro_mol]["chains"][i_mmol_chain]["observed"][i_frag]["end"] + + # residue_number is 1-based RESSEQ index and author_residue_number is ATOM field numbers (both numbers observed) + frag_list.append((start_dict["residue_number"], end_dict["residue_number"], )) + + # also has keys 'author_insertion_code' (typically None) and 'struct_asym_id' (typically chain id) + if not start_dict["author_insertion_code"] is None or not end_dict["author_insertion_code"] is None: + print("WARNING: PDB %s (%s) observed fragment start and end author_insertion_code is %s and %s" % + (pdb_str, target_id, start_dict["author_insertion_code"], end_dict["author_insertion_code"])) + # # Remove struct_asym_id warning, from 6n13-B it only seems to be different from chain_id when ordering is not A,B,C,... 
+ # if start_dict["struct_asym_id"] != chain_id or end_dict["struct_asym_id"] != chain_id: + # print("WARNING: PDB %s (%s) observed fragment start and end struct_asym_id %s and %s is different from chain_id %s" % + # (pdb_str, target_id, start_dict["struct_asym_id"], end_dict["struct_asym_id"], chain_id)) + + if chain_id in obs_chains.keys(): + print(pdb_obs) + raise ValueError("ERROR: Chain id %s present multiple times in PDB observed responce" % (chain_id)) + obs_chains[chain_id] = (mmol_id, frag_list) + + obs = obs_chains[smap.chain][1] + nres = len(pdb_resseq) + pdb_resseq_obs = pdb_resseq + + # Check for non-observed N-terminal + if obs[0][0] > 1: + pdb_resseq_obs = "x"*(obs[0][0]-1) + pdb_resseq_obs[(obs[0][0]-1):] + if verbose > 0: + print("%s: Non-observed region in %s: %d-%s-%d (1-based RESSEQ index)" % + (target_id, pdb_str, 1, pdb_resseq[0:(obs[0][0]-1)], obs[0][0]-1)) + # Non-observed regions + if len(obs) > 1: + for i in range(1,len(obs)): + pdb_resseq_obs = pdb_resseq_obs[:obs[i-1][1]] + "x"*(obs[i][0]-obs[i-1][1]-1) + pdb_resseq_obs[(obs[i][0]-1):] + if verbose > 0: + print("%s: Non-observed region in %s: %d-%s-%d (1-based RESSEQ index)" % (target_id, pdb_str, obs[i-1][1]+1, + pdb_resseq[(obs[i-1][1]):(obs[i][0]-1)], obs[i][0]-1)) + # Check for non-observed C-terminal + if obs[-1][1] < nres: + pdb_resseq_obs = pdb_resseq_obs[:obs[-1][1]] + "x"*(nres-obs[-1][1]) + if verbose > 0: + print("%s: Non-observed region in %s: %d-%s-%d (1-based RESSEQ index)" % + (target_id, pdb_str, obs[-1][1]+1, pdb_resseq[obs[-1][1]:nres], nres)) + + # Modified residues + if len(pdb_modified) != 0: + for pdbmod in pdb_modified: + # This number should apply to the PDB RESSEQ sequence + resi = pdbmod['residue_number'] + aa = pdb_resseq[resi-1] + aa_obs = pdb_resseq_obs[resi-1] + if aa != aa_obs: + if verbose > 0: + print("%s: Modified residues %s%d-%s seems unobserved: %s" % (pdb_str, aa, resi, pdbmod['chem_comp_id'], aa_obs)) + if verbose > 1: + print("%s has modification %s%d-%s into %s (author_resi %d may be uniprot)" % + (pdb_str, aa, resi, pdbmod['chem_comp_id'], pdbmod['chem_comp_name'], pdbmod['author_residue_number'])) + # print("%s: %s entity_id %d, weight %.2f, alternate_conformers %s, author_insertion_code %s" % + # (target_id, pdb_str, pdbmod['entity_id'], pdbmod['weight'], pdbmod['alternate_conformers'], pdbmod['author_insertion_code'])) + # print("%s: %s modification description: %s" % (target_id, pdb_str, pdbmod['description'])) + + # Mark modified with capital X which should work in bolsum and I can still see what is gap and what is modified + pdb_resseq_obs = pdb_resseq_obs[:(resi-1)] + 'X' + pdb_resseq_obs[resi:] + + # Attempt SIFT alignment if present + identities = 0 + inserts = 0 + pdb_nres = 0.1 + if 'resseq_start' in smap._fields and 'unp_start' in smap._fields: + pdb_nres = smap.resseq_end - smap.resseq_start + 1 + unp_nres = smap.unp_end - smap.unp_start + 1 + + # Align PDB RESSEQ sequence to Uniprot sequence + if pdb_nres == unp_nres: + # Attempt align PDB to target sequence based on SIFTS + aligned_resseq = "-"*(smap.unp_start-1) + pdb_resseq[(smap.resseq_start-1):(smap.resseq_end)] + "-"*(len(target_seq)-smap.unp_end) + identities = sum( [aa1==aa2 and not aa2.lower() in "-xX" for aa1,aa2 in zip(target_seq.lower(),aligned_resseq.lower())] ) + aligned_resseq_obs = "-"*(smap.unp_start-1) + pdb_resseq_obs[(smap.resseq_start-1):(smap.resseq_end)] + \ + "-"*(len(target_seq)-smap.unp_end) + if verbose > 0: + if identities/pdb_nres < 0.9: + print("%s: poor SIFTS alignment of PDB 
%s with only %d of %d identical residues - attempt custom alignment" % + (target_id, pdb_str, identities, pdb_nres)) + else: + print("%s: using SIFTS alignment for %s" % (target_id,pdb_str)) + else: + if verbose > 0: + print("%s: SIFTS residue range for %s have different length for PDB (nres %d) and Target (nres %d) - attempt custom alignment" % + (target_id, pdb_str, pdb_nres, unp_nres)) + + # If SIFTS alignment failed run pairwise alignment + if identities/pdb_nres < 0.9: + (aligned_resseq,aligned_resseq_obs,inserts) = subalign(target_record, SeqRecord(Seq(pdb_resseq), id=pdb_str), + extra_seq=pdb_resseq_obs, verbose=verbose) + + # Count identities among residues present in the experiment (not necessarily observed) + identities = sum( [aa1==aa2 and not aa2 in "-xX" for aa1,aa2 in zip(target_seq.lower(),aligned_resseq.lower())] ) + + # Check if identity is within threshold + if len(aligned_resseq) != len(target_seq): + print(smap) + print(target_seq) + print(aligned_resseq) + raise ValueError("ERROR: %s %s alignment has length %d different from target sequence legnth %d" % + (target_id, pdb_str, len(aligned_resseq), len(target_seq))) + + if identities/pdb_nres < 0.9: + # Warn if PDB fragment has less than 90% identity + print("WARNING %s: Poor custom alignment of PDB %s with only %d of %d identical residues - attempt custom alignment" % + (target_id, pdb_str, identities, pdb_nres)) + + # Store stats and alignemnts in structure map, these should be extracted from aligned_resseq_obs or tested of consistance with this + # Number of inserted positions in PDB (should be removed from aligned sequence) + inserts_col.append(inserts) + + # Number of non-terminal positions deleted/missing in the experiment + deletions_col.append(aligned_resseq_obs.strip('-').count('-')) + + # Check consistant calculation of SIFTS coverage + cover_pos = len(aligned_resseq_obs.strip('-')) - deletions_col[-1] + cover_exp = cover_pos * 1.0/len(target_seq) + # Coverage should be the fraction of target positions that the structure covers including indels (uncertainty from 2-digit rounding) + if cover_exp - smap.cover_exp > 1e-3: #0.0051: + print("WARNING: %s %s re-calculated SIFTS coverage %.4f differ from cover_exp of %.4f (%d pos)" % + (target_id, pdb_str, cover_exp, smap.cover_exp, cover_pos)) + + # Coverage of observed residues, i.e. residues that are not deleted or unobserved. 
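+            # (A tiny illustration of the counting convention used here, with a made-up
+            #  aligned string; '-' = gap/deletion, 'x' = unobserved, 'X' = observed but modified:
+            #
+            #      s = "--MKVLXAGx--"
+            #      len(s) - s.count('-') - s.count('x')   # = 7 observed positions
+            #
+            #  str.count is case sensitive, so a modified 'X' still counts as covered.)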
+ # Mismatch and and observed modified are included in cover_obs + cover_obs_pos = len(aligned_resseq_obs) - aligned_resseq_obs.count('-') - aligned_resseq_obs.count('x') # - aligned_resseq_obs.count('X') + cover_obs = cover_obs_pos * 1.0/len(target_seq) + cover_obs_col.append(cover_obs) + + # Check indels for consistancy with SIFTS coverage ranges + # if pdb_nres - sifts['deletions'] + sifts['inserts'] - unp_nres > 0: + if pdb_nres - deletions_col[-1] + inserts - unp_nres > 0: + print("WARNING: %s %s SIFTS target range %d-%d (%d) and PDB range %d-%d (%d) is not consistant with %d deletions and %d insertions" % + (target_id, pdb_str, smap.unp_start, smap.unp_end, unp_nres, + smap.resseq_start, smap.resseq_end, pdb_nres, deletions_col[-1], inserts)) + + # How many of the covered target positions are identical + ident_exp_col.append(identities) + # For consistancy with cover_obs, a modified residue is counted as identical + # ident_obs_col.append( sum( [aa1==aa2 and not aa2.lower() in "-xX" for aa1,aa2 in zip(target_seq.lower(),aligned_resseq_obs.lower())] ) ) + ident_obs_col.append( sum( [aa1==aa2 and not aa2.lower() in "-x" for aa1,aa2 in zip(target_seq.lower(),aligned_resseq_obs.lower())] ) ) + + # How many covered and observe positions does not match + mismatch_col.append(sum([aa1!=aa2 and aa2 in "acdefghiklmnpqrstvwy" for aa1,aa2 in zip(target_seq.lower(),aligned_resseq_obs.lower())])) + + # How many og the covered residues are not observed + non_obs_col.append(aligned_resseq_obs.count('x')) + + # How many og the covered residues are not observed + modified_col.append(aligned_resseq_obs.count('X')) + + # # Removed, makes many warnings. + # if not sifts['non-obs'] == pdb_resseq_obs.lower().count('x'): + # print("WARNING: %s %s has %d non-observed positions that was never in the experiment (missing)" % + # (target_id, pdb_str, pdb_resseq_obs.lower().count('x')-sifts['non-obs'])) + + resseq_col.append(pdb_resseq) + aligned_resseq_col.append(aligned_resseq) + aligned_resseq_obs_col.append(aligned_resseq_obs) + + # Report alignment + print("%s: %s alignment: ident_exp %4d ident_obs %4d cover_exp %5.3f cover_obs %5.3f deletions %d inserts %d mismatch %d" % + (target_id, pdb_str, ident_exp_col[-1], ident_obs_col[-1], smap.cover_exp, cover_obs_col[-1], + deletions_col[-1], inserts_col[-1], mismatch_col[-1])) + if verbose > 0: + print("%s: Original %s RESSEQ:" % (target_id,pdb_str)) + print(pdb_resseq) + print("%s: Aligned %s sequence:" % (target_id,pdb_str)) + print(aligned_resseq) + print("%s: Aligned observed %s sequence (-: gap/deletion, x: unobserved (but present in experiment), X: modified):" % (target_id,pdb_str)) + print(aligned_resseq_obs) + + # Add columns to structure map and return updated strucmap + strucmap['cover_obs'] = cover_obs_col + strucmap['ident_exp'] = ident_exp_col + strucmap['ident_obs'] = ident_obs_col + strucmap['inserts'] = inserts_col + strucmap['deletions'] = deletions_col + strucmap['mismatch'] = mismatch_col + strucmap['non-obs'] = non_obs_col + strucmap['modified'] = modified_col + strucmap['resseq'] = resseq_col + strucmap['aligned_resseq'] = aligned_resseq_col + strucmap['aligned_resseq_obs'] = aligned_resseq_obs_col + return(strucmap) + + +def url_sifts(uniprot_id, min_ping_time=50, verbose=0): + # Go easy on servers and wait if necessary + global __sifts_time + time_since_last = time.time() - __sifts_time + if time_since_last < min_ping_time: + if verbose > 0: + print("Avoid frequent SIFTS requests and wait %.1f seconds" % (min_ping_time - time_since_last)) + 
time.sleep(min_ping_time - time_since_last) + __sifts_time = time.time() + + request_url = "https://www.ebi.ac.uk/pdbe/api/mappings/best_structures/%s" % (uniprot_id) + if verbose > 1: + print("Request EBI SIFTS URL: %s" %request_url ) + responce = urllib.request.urlopen(request_url) + # urllib does not decode, but this is rarely important + json_str = urllib.parse.unquote( responce.read().decode('utf-8') ) + # Return a list of dicts with info on each PDB + return( json.loads(json_str)[uniprot_id] ) + + +def url_uniprot(uniprot_id, min_ping_time=50, verbose=0): + """ + Returns a BioPython SeqRecord object with attributes seq, name, description and dbxrefs + """ + # Go easy on servers and wait if necessary + global __uniprot_time + time_since_last = time.time() - __uniprot_time + if time_since_last < min_ping_time: + if verbose > 0: + print("Avoid frequent Uniprot requests and wait %.1f seconds" % (min_ping_time - time_since_last)) + time.sleep(min_ping_time - time_since_last) + __uniprot_time = time.time() + + request_url = "https://www.uniprot.org/uniprot/%s.xml" % (uniprot_id) + if verbose > 1: + print("Request uniprot URL: %s" %request_url ) + responce = urllib.request.urlopen(request_url) + # if responce.status != 200: + # raise InternetError() + # Return a biopython uniprot record object + return( SeqIO.read(responce, "uniprot-xml") ) + + +def url_pdb_resseq(pdb_id, chain_id=None, min_ping_time=50, verbose=0): + # Go easy on servers and wait if necessary + global __pdb_time + time_since_last = time.time() - __pdb_time + if time_since_last < min_ping_time: + if verbose > 0: + print("Avoid frequent PDB requests and wait %.1f seconds" % (min_ping_time - time_since_last)) + time.sleep(min_ping_time - time_since_last) + __pdb_time = time.time() + + # chain_id is case sensitive, pdb_id is not + request_url = "https://www.ebi.ac.uk/pdbe/entry/pdb/%s/fasta" % (pdb_id) + if verbose > 1: + print("Request PDB URL: %s" %request_url ) + responce = urllib.request.urlopen(request_url) + fasta_str = urllib.parse.unquote( responce.read().decode('utf-8') ) + seq_records = SeqIO.parse(StringIO(fasta_str), "fasta") + + # Dict of seq per chain_id + seq_index_dict = {} + seq_list = [] + for sr in seq_records: + descrip_list = sr.description.split("|") + if descrip_list[0] != "pdb": + raise ValueError("ERROR: %s RESSEQ responce description does not start with \'pdb|\'" % (pdb_id)) + if descrip_list[1] != pdb_id.lower(): + raise ValueError("ERROR: %s RESSEQ responce description does not start with \'pdb|%s|\'" % (pdb_id, pdb_id.lower())) + + for c in descrip_list[2].split(): + seq_index_dict[c] = len(seq_list) + seq_list.append(str(sr.seq)) + # Change this when I know what to return as default, use first sequence for now + if chain_id is None: + return(seq_list[0]) + else: + if not chain_id in seq_index_dict.keys(): + raise ValueError("Error: Cannot get RESSEQ, no chain %s in PDB %s" % (chain_id,pdb_id)) + return(seq_list[seq_index_dict[chain_id]]) + + +def url_pdb_observed(pdb_id, chain_id=None, min_ping_time=50, verbose=0): + # Example of returnced list element + # https://www.ebi.ac.uk/pdbe/api/pdb/entry/polymer_coverage/6uyz + + # Apparantly, observed says nothing about why the non-observed positions are not observed, e.g if they were never in + # the experiment or just not observed + + # Go easy on servers and wait if necessary + global __pdb_time + time_since_last = time.time() - __pdb_time + if time_since_last < min_ping_time: + if verbose > 0: + print("Avoid frequent PDB requests and wait %.1f 
seconds" % (min_ping_time - time_since_last)) + time.sleep(min_ping_time - time_since_last) + __pdb_time = time.time() + + # chain_id is case sensitive, pdb_id is not + if chain_id is None: + request_url = "https://www.ebi.ac.uk/pdbe/api/pdb/entry/polymer_coverage/%s" % (pdb_id) + else: + request_url = "https://www.ebi.ac.uk/pdbe/api/pdb/entry/polymer_coverage/%s/chain/%s" % (pdb_id,chain_id) + if verbose > 1: + print("Request PDB URL: %s" %request_url ) + responce = urllib.request.urlopen(request_url) + json_str = urllib.parse.unquote( responce.read().decode('utf-8') ) + # Return a dicts with info on each PDB + # if chain is given fragments are in dict['molecules'][0]['chains'][0]['observed'] + return( json.loads(json_str)[pdb_id.lower()] ) + + +def url_pdb_modified(pdb_id, chain_id=None, min_ping_time=50, verbose=0): + """URL lookup of modified residues in PDB entry""" + + # Example of returned list element + # {'chem_comp_name': 'N(6)-ACETYLLYSINE', 'entity_id': 1, 'description': 'Small ubiquitin-related modifier 1', 'weight': 9567.801, + # 'residue_number': 32, 'author_residue_number': 46, 'chain_id': 'A', 'alternate_conformers': 0, 'author_insertion_code': '', + # 'chem_comp_id': 'ALY', 'struct_asym_id': 'A'} + + # Go easy on servers and wait if necessary + global __pdb_time + time_since_last = time.time() - __pdb_time + if time_since_last < min_ping_time: + if verbose > 0: + print("Avoid frequent PDB requests and wait %.1f seconds" % (min_ping_time - time_since_last)) + time.sleep(min_ping_time - time_since_last) + __pdb_time = time.time() + + # chain_id is case sensitive, pdb_id is not + request_url = "https://www.ebi.ac.uk/pdbe/api/pdb/entry/modified_AA_or_NA/%s" % (pdb_id) + if verbose > 1: + print("Request PDB URL: %s" %request_url ) + try: + responce = urllib.request.urlopen(request_url) + except urllib.error.HTTPError as e: + if e.code == 404: + # URL error 'page not found' means that there are no modified residues annotated for this PDB + return({}) + else: + # All other errors are raised normally + raise urllib.error.HTTPError(e) + + json_str = urllib.parse.unquote( responce.read().decode('utf-8') ) + modified_dict = json.loads(json_str) + if not pdb_id.lower() in modified_dict.keys(): + raise ValueError("ERROR: PDB modified responce does not seem to cover %s" % (pdb_id)) + modified_dict = modified_dict[pdb_id.lower()] + + if chain_id is None: + return(modified_dict) + + modified_chain = {} + for ent in modified_dict: + if ent['chain_id'] != ent['struct_asym_id'] and verbose > 1: + print("%s: modified residue record has chain_id %s but struct_asym_id %s, using chain_id" % + (pdb_id, ent['chain_id'], ent['struct_asym_id'])) + + # Check if chain already has a record of residue modification + if ent['chain_id'] in modified_chain.keys(): + modified_chain[ent['chain_id']].append(ent) + else: + modified_chain[ent['chain_id']] = [ent] + + if chain_id in modified_chain.keys(): + return(modified_chain[chain_id]) + else: + # No annotations for requested chain + return({}) + + +def subalign(target_seqrecord, query_seqrecord, extra_seq=None, verbose=0): + """Align query sequence to target sequence and return aligned query sequence with gaps inserted + + Query sequence is assumed to be a subsequence of target and a warning is generated if inserts + are removed from the query + + """ + target_id = target_seqrecord.id + target_seq = str(target_seqrecord.seq) + target_nres = len(target_seq) + query_id = query_seqrecord.id + query_seq = str(query_seqrecord.seq) + + # Setup aligner + aligner = 
Align.PairwiseAligner() + aligner.mode = 'global' + aligner.substitution_matrix = Align.substitution_matrices.load("BLOSUM62") + aligner.open_gap_score = -4.0 # Default value for '*' mutation in BLOSUM62 + aligner.extend_gap_score = -1.0 + aligner.end_open_gap_score = -1.0 # end gap open is like extension since terminal may be viewed as a gap + score_uncertainty = 1 + # print(aligner) + # print(Align.substitution_matrices.load("BLOSUM62")) + + alignments = sorted( aligner.align(target_seq, query_seq) ) + if len(alignments) > 1: + if alignments[0].score - alignments[1].score < score_uncertainty: + print("WARNING: Second alignment of %s versus %s is within uncertainty %.1f" % + (target_id, query_id, score_uncertainty)) + print("%s: Alignment scores: %.1f and %.1f" % (target_id, alignments[0].score,alignments[1].score)) + + if verbose > 1: + print("%s: Best alignment of %d score %.1f:" % (target_id, len(alignments), alignments[0].score)) + print(alignments[0]) + if len(alignments) > 1 and verbose > 2: + print("%s: Second best alignment of %d score %.1f:" % (target_id, len(alignments), alignments[1].score)) + print(alignments[1]) + + # Get target and query alignment paths in Bio.Align.PairwiseAlignment tuple format + a = alignments[0].aligned; ta = a[0]; qa = a[1] + inserts = 0 + aligned_query_seq = "" + if extra_seq is None: + aligned_extra_seq = None + else: + aligned_extra_seq = "" + + # If there's a missing range before first target fragment, these are gaps in the query sequence + aligned_query_seq += "-"*ta[0][0] + if not extra_seq is None: + aligned_extra_seq += "-"*ta[0][0] + + # Add first query fragment + aligned_query_seq += query_seq[qa[0][0]:qa[0][1]] + if not extra_seq is None: + aligned_extra_seq += extra_seq[qa[0][0]:qa[0][1]] + + # Process following path fragments + for i in range(len(qa)-1): + # Check that end-position of query fragment i, is start position of query fragment i+1 + if qa[i][1] != qa[i+1][0]: + inserts += qa[i+1][0]-qa[i][1] + if verbose > 0: + print("%s: alignment of %s will remove an insert of %d residues: %d-%s-%d" % + (target_id, query_id, qa[i+1][0]-qa[i][1], qa[i][1], query_seq[qa[i][1]:qa[i+1][0]], qa[i+1][0])) + + # If there's a missing range in target path, these are gaps in the query sequence + aligned_query_seq += "-"*(ta[i+1][0]-ta[i][1]) + if not extra_seq is None: + aligned_extra_seq += "-"*(ta[i+1][0]-ta[i][1]) + + # Add next query fragment + aligned_query_seq += query_seq[qa[i+1][0]:qa[i+1][1]] + if not extra_seq is None: + aligned_extra_seq += extra_seq[qa[i+1][0]:qa[i+1][1]] + if verbose > 0 and ta[i][1] != ta[i+1][0]: + print("%s: query sequence %s has a deletion from (and including) target position %d to %d (%d residues)" % + (target_id, query_id, ta[i][1], ta[i+1][0]+1, ta[i+1][0]-ta[i][1])) + + # If final target fragment is not last target position, add gaps untill it is + aligned_query_seq += "-"*(target_nres-ta[0-1][1]) + if not extra_seq is None: + aligned_extra_seq += "-"*(target_nres-ta[0-1][1]) + + return((aligned_query_seq, aligned_extra_seq, inserts)) + + +def strucmap_non_redundant(strucmap, verbose=0): + # list of pdb-chain identifiers + id_list = ["%s-%s" % (pdb,chain) for pdb,chain in zip(strucmap['pdb'], strucmap['chain'])] + + # make a list of unique pdb-chain identifiers + (unq_id,unq_index,unq_counts) = np.unique(id_list, return_index=True, return_counts=True) + + remove_rows = [] + for unqi in np.where(unq_counts > 1)[0]: + # find structure map index of this structure + smapi = np.where(np.array(id_list) == unq_id[unqi])[0] 
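+        # For reference, a minimal standalone example of the duplicate-detection pattern
+        # used in this loop (the identifiers are made up):
+        #
+        #   import numpy as np
+        #   ids = ["1abc-A", "1abc-A", "2xyz-B"]
+        #   unq_id, unq_index, unq_counts = np.unique(ids, return_index=True, return_counts=True)
+        #   # unq_id -> ['1abc-A', '2xyz-B'], unq_counts -> [2, 1]
+        #   np.where(np.array(ids) == unq_id[0])[0]   # -> array([0, 1]), rows of the duplicate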
+ + # just checking... + if len(smapi) != unq_counts[unqi]: + raise ValueError("ERROR: %s found to have %d mappings but only %d indices found" % (unq_id[unqi], unq_counts[unqi], len(smapi))) + + col_nonid = [] + for coli in range(len(strucmap.columns)): + # values in column of the rows with identical pdb,chain + col_val = list( strucmap.iloc[smapi,coli] ) + # column is OK if values are identical and not nan + col_ok = False + if type(col_val[0]) is float: + if np.isnan(col_val[0]): + # check that all elements are nan + col_ok = np.all(np.isnan(col_val)) + else: + # check that all values are within machine precision + col_ok = np.all(np.array(col_val)-col_val[0] < 1e-10) + else: + # compare all others as identities + col_ok = np.all(np.array(col_val) == col_val[0]) or col_val[0] is np.nan + + # store column indices that are not identical + if not col_ok: + col_nonid.append(coli) + print("WARNING: Multiple maps of %s has different values in column \"%s\": %s" % + (unq_id[unqi], strucmap.columns[coli], str(col_val))) + + # raise error if columns are not OK + if len(col_nonid) > 0: + # print columns that are not identical together with pdb and chain id + print("WARNING: Structure %s occurs %d times in structures map with non-identical mapping in column(s): %s:" % + (unq_id[unqi], unq_counts[unqi], ",".join(strucmap.columns[col_nonid]))) + print(strucmap.iloc[smapi, [0,1]+col_nonid]) + # raise ValueError("ERROR: Found %d identical %s structures with non-identical mapping in columns %s:" % + # (unq_counts[unqi],unq_id[unqi],",".join(strucmap.columns[col_nonid]))) + elif verbose > 0: + print("WARNING: Structure %s occurs %d times in structure map, will only keep first occurence" % (unq_id[unqi], unq_counts[unqi])) + if verbose > 1: + print(strucmap.iloc[smapi]) + + # all entities except first occurence marked for deletion + remove_rows.extend(list(smapi[1:])) + + # remove marked rows and make new index (row labels) + strucmap.drop(remove_rows, axis=0, inplace=True) + strucmap.reset_index(inplace=True, drop=True) + + # return non-redundant structure map + return(strucmap) + + +def parse_args(): + """ + Argument parser function + """ + + parser = ArgumentParser( description="" ) + + # Initiating/setup command line arguments + parser.add_argument( 'uniprot_ids', + type=str, + help="Input uniprot IDs. Several should be separated by ;" + ) + + args = parser.parse_args() + + args.uniprot_ids = args.uniprot_ids.split(';') + + return args + +if __name__ == '__main__': + """Find the best PDB structure for a Uniprot ID based on the SIFTS database + """ + # Verbose levels: 0: Print results only, 1: Print hvor result was obtained, 2: Debug information + verbose = 2 + + overwrite_data_files = False + + # data_dir = "/Users/kristofferjohansson/work/project_human_structome/struc_select/data" + data_dir = "./data" + # struc_dir = "/Users/kristofferjohansson/work/project_human_structome/struc_select/struc" + + # filename = '/Users/kristofferjohansson/work/project_human_structome/anders_uniprot_to_pdb/TorbensGenes.csv' + # if not os.path.isdir(data_dir): + # try: + # os.mkdir(data_dir) + # except FileNotFoundError: + # print("ERROR: Could not create data directory (parent dir missing?): %s" % (data_dir)) + # sys.exit(2) + + # Test directory excistance + + if overwrite_data_files: + print("WARNING: Will overwrite all data in %s" % (data_dir)) + + min_ping_time = 1.0 + # From https://www.ebi.ac.uk/pdbe/pdbe-rest-api + # How many API calls am I allowed? 
At present we rely on user restraint and have not set any hard limits. But in + # future, depending on load on our servers, we may want to implement rate limiting. + + + # get user input arguments + args = parse_args() + + + # # Torbens genes + # gene_list_df = pd.read_csv(filename) + # uniprot_id_list = list(gene_list_df['Entry']) + + # Rosetta test set: PTEN CALM1 SUMO1 UBE2I TPK1 P53 MAPK1 TPMT GAL4 UBI4 + uniprot_id_list = ['P60484', 'P0DP23', 'P63165', 'P63279', 'Q9H3S4', 'P04637', 'P28482', 'P51580', 'P04386', 'P0CG63'] + # what we used 1D5R 4DJC_A 1WYW_B 2GRO 3S4Y_A 6RZ3_A 2OJG 2BZG 3COQ_A 3OLM_D + # Rosetta test set: MAPK1 TPMT GAL4 UBI4 BRCA1 ADRB2 HRas + uniprot_id_list += ['P28482', 'P51580', 'P04386', 'P0CG63', 'P38398', 'P07550', 'P01112'] + # what we used 2OJG 2BZG 3COQ_A 3OLM_D 1JM7_A 3KJ6_A 4Q21_A + + # uniprot_id_list = ['P60484','P40692','O60260','P12931','P09651','P31483','P04035'] + # uniprot_id_list = ['P01112'] + + uniprot_id_list = args.uniprot_ids + + if verbose > 0: + print("Find structures for %d uniprot entities" % (len(uniprot_id_list))) + + for uniprot_id in uniprot_id_list: + # Get information on target protein either from file or url request + uniprot_filename = "uniprot_" + uniprot_id + ".xml" + has_uniprot_file = os.path.isfile(data_dir + "/" + uniprot_filename) + if not has_uniprot_file or overwrite_data_files: + if verbose > 0: + print("%s: Getting uniprot information from URL request" % (uniprot_id)) + uniprot_record = url_uniprot(uniprot_id, min_ping_time=min_ping_time, verbose=verbose) + SeqIO.write(uniprot_record, data_dir + "/" + uniprot_filename, "seqxml") + else: + if verbose > 0: + print("%s: Getting uniprot information from data file %s" % (uniprot_id, uniprot_filename)) + # BUG: SeqIO write and read in seqxml format seems to loose the name attribute but I could not find another + # format that worked with the name attribute. Closest is 'pir' format the puts the name attribute in the + # beginning of the description attribute but looses all dbxrefs. + uniprot_record = list( SeqIO.parse(data_dir + "/" + uniprot_filename, "seqxml") )[0] + + nres_uniprot = len(uniprot_record.seq) + print("%s: Name: %s, description: %s, residues: %s" % + (uniprot_record.id, uniprot_record.name, uniprot_record.description, nres_uniprot)) + if verbose > 0: + print(str(uniprot_record.seq)) + + # Get structure map + strucmap_filename = "strucmap_" + uniprot_id + ".csv" + has_strucmap_file = os.path.isfile(data_dir + "/" + strucmap_filename) + if not has_strucmap_file or overwrite_data_files: + print("%s: Building structure map from URL requests" % (uniprot_id)) + + # Initial structure map from SIFTS URL request + try: + strucmap = url_sifts_strucmap(uniprot_record, min_ping_time=min_ping_time, verbose=verbose) + except Exception as err: + print("ERROR: %s SIFTS request failed (%s) skip uniprot entry" % (uniprot_id, err)) + continue + + n_struc_raw = len(strucmap.index) + if n_struc_raw < 1: + print("ERROR: No structures assigned in SIFTS for %s, skip uniprot entry", uniprot_id) + continue + + # Check if any PDB+chain id occurs multiple times + strucmap = strucmap_non_redundant(strucmap, verbose=verbose) + n_struc_nr = len(strucmap.index) + + # Require a minimum number of residues in the structure in order to apply Rosetta + # For P53_P04637, min_res 30 made 6rz3 go up from rank 364 to 277. 
Further min_res 40 to rank 22
+            min_res = 40
+            l = list( np.where(strucmap['cover_exp'] * nres_uniprot >= min_res)[0] )
+            if len(l) == 0:
+                print("WARNING: No structures of %s cover %d or more residues. Continuing with poor structures" % (uniprot_id, min_res))
+            elif len(l) < n_struc_nr:
+                strucmap = strucmap.iloc[l]
+                # if verbose > 0:
+                #     print("%s: Reducing structure map from %d to %d structures with %d or more residues covered (SIFT coverage)" %
+                #           (uniprot_id, n_struc_nr, len(l), min_res))
+                strucmap.reset_index(inplace=True, drop=True)
+            else:
+                if verbose > 0:
+                    print("%s: No structures with coverage < %d found" % (uniprot_id,min_res))
+            n_struc_minres = len(strucmap.index)
+
+            # Shortlist structure map based on SIFTS assignments of experimental method and coverage
+            n_short = 1000
+            if n_struc_minres > n_short:
+                if verbose > 0:
+                    print("%s: Reducing list of structures from %d to top %d based on SIFTS assignments alone" %
+                          (uniprot_id, n_struc_minres, n_short))
+                strucmap = rank_structures(uniprot_record, strucmap, verbose=verbose)
+                strucmap = strucmap.iloc[range(n_short)]
+                strucmap.reset_index(inplace=True, drop=True)
+            else:
+                if verbose > 0:
+                    print("%s: No shortlisting applied" % (uniprot_id))
+            n_struc_sl = len(strucmap.index)
+
+            # # Find a given PDB in the strucmap
+            # i = np.where(strucmap['pdb']=="6rz3")[0][0]
+            # print(i)
+            # print(strucmap.iloc[range(max(0,i-20),i+10)])
+
+            print("%s: Number of mapped structures: sifts_raw: %d non-redundant: %d min_res_%d: %d shortlist: %d" %
+                  (uniprot_id, n_struc_raw, n_struc_nr, min_res, n_struc_minres, n_struc_sl))
+
+            # Update structure map with observation of residues based on PDBe URL requests
+            try:
+                strucmap = align_strucmap(uniprot_record, strucmap, min_ping_time=min_ping_time, verbose=verbose)
+            except Exception as err:
+                print("ERROR: %s PDB alignments failed (%s)" % (uniprot_id,err))
+                continue
+
+        else:
+            print("%s: Getting structure map from data file %s" % (uniprot_id, strucmap_filename))
+            # Read structure map from disk
+            strucmap = pd.read_csv(data_dir + "/" + strucmap_filename, sep=';')
+
+        # Columns for pretty-printing
+        pcol = ["pdb", "chain", "score", "select", "method_exp", "method_res", "cover_exp", "cover_obs", "ident_exp", "ident_obs",
+                "inserts", "deletions", "mismatch", "non-obs", "modified"]
+
+        # Maybe I can use the shortlist here? Call it "select_structures(uniprot_record, strucmap, 20, verbose=verbose)"
+        # Select structures based on the SIFTS mapping
+        strucmap = rank_structures(uniprot_record, strucmap, verbose=verbose)
+        print("%s: best structure %s-%s method: %s cover_obs: %.3f deletions: %d inserts: %d mismatch: %d modified: %d" %
+              (uniprot_id, strucmap.loc[0,'pdb'], strucmap.loc[0,'chain'], strucmap.loc[0,'method_exp'], strucmap.loc[0,'cover_obs'],
+               strucmap.loc[0,'deletions'], strucmap.loc[0,'inserts'], strucmap.loc[0,'mismatch'], strucmap.loc[0,'modified']))
+        if verbose > 0:
+            ntop = 20
+            print("%s top %d structures" % (uniprot_id,ntop))
+            if len(strucmap.index) < ntop:
+                print(strucmap[pcol])
+            else:
+                print(strucmap[pcol].iloc[range(ntop)])
+
+        # Write ranked structure map to disk
+        strucmap.to_csv(data_dir + "/" + strucmap_filename, index=False, sep=';')
+
+        # Structure with highest cover_obs and cover_exp (secondary score)
+        if verbose > 0:
+            i_max_cover_obs = strucmap.sort_values(by=['cover_obs','score'], ascending=False).index.values[0]
+            i_max_cover_exp = strucmap.sort_values(by=['cover_exp','score'], ascending=False).index.values[0]
+            print("%s best structure and structures with highest cover_obs and cover_exp" % (uniprot_id))
+            print(strucmap[pcol].iloc[[0,i_max_cover_obs,i_max_cover_exp]])
+
+        # # Determine the optimal number of structures that cover target protein
+        # n_select = number_of_covering_structures(uniprot_record, strucmap, verbose=verbose)
+
+        # Write aligned sequences to FASTA file
+        seqrec_list = [SeqRecord(uniprot_record.seq, id=uniprot_record.id, description=uniprot_record.description)]
+        seqrec_list += [SeqRecord(Seq(seq_str), id="%s-%s" % (pdb,c), description="score=%.3f cover_obs=%.3f" % (score,cover_obs))
+                        for (seq_str,pdb,c,score,cover_obs) in
+                        zip(strucmap['aligned_resseq_obs'], strucmap['pdb'], strucmap['chain'], strucmap['score'], strucmap['cover_obs'])]
+        SeqIO.write(seqrec_list, data_dir + "/alignments_" + uniprot_id + ".fasta", "fasta")
+
+        print("")
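+
+# Example invocation (a sketch; the uniprot IDs below are placeholders from the test list
+# above, and the ./data directory is expected to exist since the mkdir code is commented out):
+#
+#   python3 struc_select_sifts.py "P60484;P0DP23"
+#
+# For each ID this writes data/uniprot_<ID>.xml, a ranked ';'-separated structure map in
+# data/strucmap_<ID>.csv and the aligned sequences in data/alignments_<ID>.fasta.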