Source code for drsip

"""
========================================================
DR-SIP package (:mod:`drsip`)
========================================================

Package contains filters and protocols for predicting homo-oligomeric
transmembrane proteins (HoTPs).

Functions
---------

.. autofunction:: DR_SIP
.. autofunction:: load_data

Classes
-------

.. autoclass:: DR_SIP_Base
    :members:

.. autoclass:: DR_SIP_Membrane
    :members:

.. autoclass:: DR_SIP_Soluble
    :members:

"""


import pandas as pd
import numpy as np
import os
import string
from collections import Counter
import docking_eval
import MDAnalysis
import umsgpack
import zlib
import Cn_symm_criterion
import dist_restraints_filter
import struct_clustering
import tilt_angle_criterion
import save_load
import drsip_common as common
import readers


[docs]def load_data(filename, reader_obj=None): """ Load saved DR_SIP data from file and return DR_SIP instance. Parameters ---------- filename : str Full path to save file. Returns ------- drsip_inst Instance of DR_SIP with data from save file. """ with open(filename, 'rb') as input_file: temp_storage_dict = umsgpack.unpackb( zlib.decompress(input_file.read())) if temp_storage_dict['protocol'] == 'soluble': soluble_state = True elif temp_storage_dict['protocol'] == 'membrane': soluble_state = False else: raise ValueError('Invalid protocol type: {0}'.format( temp_storage_dict['protocol'])) if temp_storage_dict['has_dist_rest_data']: save_load.load_StrIO(temp_storage_dict, 'dist_rest_file') dist_rest_file_input = temp_storage_dict.pop('dist_rest_file') save_load.load_pd_table( temp_storage_dict, 'dist_rest_conform_dist', pop=True) else: dist_rest_file_input = '' if reader_obj is None: if ('reader' in temp_storage_dict) and \ (temp_storage_dict['reader'] in readers.readers_dict): reader_class = readers.readers_dict[temp_storage_dict['reader']] reader_obj = reader_class.load_data(temp_storage_dict) if not soluble_state: transmem_helix_sel_strs = temp_storage_dict.pop('transmem_helix_sel_strs') else: transmem_helix_sel_strs = [] drsip_inst = DR_SIP(reader_obj, dist_rest_file_input, transmem_helix_sel_strs, soluble_state ) for pd_var in ['filter_data', 'filter_pass_results', 'filtered_poses_data', 'final_results']: save_load.load_pd_table(temp_storage_dict, pd_var, pop=True) if 'poses_CAPRI_class_val' in temp_storage_dict: save_load.load_pd_table( temp_storage_dict, 'poses_CAPRI_class', pop=True) if not soluble_state: temp_storage_dict['static_orient_unit_vec'] = np.array( temp_storage_dict['static_orient_unit_vec'], dtype=np.float32) temp_storage_dict['trans_vect'] = np.array( temp_storage_dict['trans_vect'], dtype=np.float32) temp_storage_dict['axis_of_rot'] = np.array( temp_storage_dict['axis_of_rot'], dtype=np.float32) temp_storage_dict['cylindrical_axis'] = np.array( temp_storage_dict['cylindrical_axis'], dtype=np.float32) drsip_inst.__dict__.update(temp_storage_dict) return drsip_inst
[docs]def DR_SIP(reader_obj=None, dist_rest_file='', transmem_helix_sel_strs=[], soluble=False): """ Initialize the correct docking protocol. The default protocol is the membrane protocol. To select the soluble protocol, set soluble to True. Currently, only supports docking results from ZDOCK 3.0.2. Parameters ---------- static_pdb, mobile_pdb : str Full path to the PDB files containing the static and mobile monomers. zdock_output_file : str Full path to the ZDOCK 3.0.2 generated output file. dist_rest_file : str, optional Full path to the distance restraints file, defaults to an empty string. When empty, the distance restraints filter is not used. Required when soluble is set to True. File is formatted such that each line corresponds to a distance restraint (tab delimited):: chain_id_1 res_id_1 chain_id_2 res_id_2 distance transmem_helix_sel_strs : lists of strings, optional Define which residues are the transmembrane helices. Each string should select for 1 transmembrane helix using the MDAnalysis' `selection language <https://www.mdanalysis.org/docs/documentation_pages/selections.html>`_. Required if soluble is False. soluble : bool, optional Defaults to False, which uses DR-SIP's membrane protocol. When set to True, uses the membrane protocol. """ if soluble: # Soluble protocol if dist_rest_file != '': return DR_SIP_Soluble(reader_obj, dist_rest_file) else: raise IOError( 'No distance restraints file') else: # Membrane protocol # Check if transmem_helix_sel_strs is provided if len(transmem_helix_sel_strs) > 0: return DR_SIP_Membrane(reader_obj, transmem_helix_sel_strs, dist_rest_file ) else: raise ValueError("Transmembrane helix selection strings " "(transmem_helix_sel_strs) are required to " "perform the membrane protocol." )
[docs]class DR_SIP_Base(object): """ DR_SIP base class. For subclassing only. Parameters ---------- static_pdb, mobile_pdb : str Full path to the PDB files containing the static and mobile monomers. zdock_output_file : str Full path to the ZDOCK 3.0.2 generated output file. dist_rest_file : str, optional Full path to the distance restraints file, defaults to an empty string. File is formatted such that each line corresponds to a distance restraint (tab delimited):: chain_id_1 res_id_1 chain_id_2 res_id_2 distance chain_id_1 res_id_1 chain_id_2 res_id_2 distance """ def __init__(self, dist_rest_file=''): if dist_rest_file != '': self.dist_rest_file = dist_rest_file self.run_status = False def _parse_dist_res_input(self): """Parses the distance restraints input file into a Pandas table.""" self.dist_rest_table = pd.read_csv( self.dist_rest_file, delimiter='\t', header=None, index_col=None) self.dist_rest_table.columns = [ 'Segid 1', 'Resid 1', 'Segid 2', 'Resid 2', 'Dist'] # Swap columns of rows to make sure that the segid 1 corresponds to # the static monomer swap_rows = np.where( ~self.dist_rest_table['Segid 1'].isin(self.static_segids))[0] self.dist_rest_table.iloc[swap_rows, 0:4] = \ self.dist_rest_table.iloc[swap_rows][['Segid 2', 'Resid 2', 'Segid 1', 'Resid 1']].values # Make sure each column's dtype is correct self.dist_rest_table = self.dist_rest_table.astype( dtype={'Segid 1': 'string', 'Resid 1': 'int32', 'Segid 2': 'string', 'Resid 2': 'int32', 'Dist': 'float32'}) # Make sure the segids are valid segids segids_only = self.dist_rest_table.loc[:, [ 'Segid 1', 'Segid 2']] num_unknown_segids = np.where(~segids_only.isin( np.unique(self.static_segids + self.mobile_segids)))[0].shape[0] if num_unknown_segids > 0: raise Exception( 'Invalid chain IDs in the distance restraints file:' ' Must be from the static/mobile structure' ) # Check and see if there are pairs whose segids are identical num_identical_segids = np.where( segids_only.iloc[:, 0] == segids_only.iloc[:, 1])[0].shape[0] if num_identical_segids > 0: raise Exception( 'Distance restraints file contains pairs of residues' ' with the same chain ID' ) # Select of residues involved in the the distance restraints sel_str_format = '(segid {0} and resid {1})'.format static_sel_list = [sel_str_format(*resid) for resid in self.dist_rest_table[['Segid 1', 'Resid 1']].values ] self.dist_res_static_sel_str = ' or '.join( static_sel_list) + ' and name CA' self.dist_res_static_sel = self.static_sel.select_atoms( self.dist_res_static_sel_str) mobile_sel_list = [sel_str_format(*resid) for resid in self.dist_rest_table[['Segid 2', 'Resid 2']].values ] self.dist_res_mobile_sel_str = ' or '.join( mobile_sel_list) + ' and name CA' self.dist_res_mobile_sel = self.poses_uni.select_atoms( self.dist_res_mobile_sel_str) seg_res_1_pairs_ordered = zip( self.dist_res_static_sel.segids, self.dist_res_static_sel.resids) seg_res_2_pairs_ordered = zip( self.dist_res_mobile_sel.segids, self.dist_res_mobile_sel.resids) # Indices for extracting the relevent residue pairs from the distance # matrix self.dist_mat_idx = [ [seg_res_1_pairs_ordered.index( tuple(x[1].tolist()) ) for x in self.dist_rest_table.loc[:, ['Segid 1', 'Resid 1']].iterrows() ], [seg_res_2_pairs_ordered.index( tuple(x[1].tolist()) ) for x in self.dist_rest_table.loc[:, ['Segid 2', 'Resid 2']].iterrows() ] ] # Distance restraints data self.dist_rest = self.dist_rest_table.loc[:, 'Dist'].values # Initialize table to store the distances of the residue pairs for # each pose col_name = ['{0}-{1}-{2}-{3}'.format( *self.dist_rest_table.iloc[idx, 0:4]) for idx in xrange(self.dist_rest_table.shape[0])] self.dist_rest_conform_dist = pd.DataFrame(np.zeros( (self.total_num_poses, self.dist_rest_table.shape[0])), columns=col_name, index=range(1, self.total_num_poses + 1) )
[docs] def write_results_file(self, filename): """ Write the top results to a CSV file. Parameters ---------- filename : str Full path to save the comma-seperated (CSV) file. """ raise NotImplementedError
[docs] def build_final_results_table(self, clusters, filtered_poses_data): """ Returns the final results table. Table contains the final ranking of the docking poses. Needs to be implemented by the child class. Parameters ---------- clusters : list of int List containing the cluster number that each pose is in. filtered_poses_data : pandas.Dataframe Table containing the filter data and cluster number of each pose. Returns ------- results_table : pandas.Dataframe Final results table. """ raise NotImplementedError
[docs] def get_final_results_table(self): """Returns the final results table. Returns ------- pandas.Dataframe Table of the final results. """ if not self.run_status: self.run() return self.final_results
[docs] def calc_CAPRI_class(self, ref_pdb_file, static_sel_str, mobile_sel_str): """ Classify each docking pose to one of four CAPRI classes. Each docking pose is compared to the reference pose in ref_pdb_file and assigned to one of the following classes: 'Incorrect', 'Acceptable', 'Medium' or 'High'. Uses the CAPRI criteria to classify the poses. See :py:class:`docking_eval.CAPRI_Criteria`. Needs to be implemented by the child class. Parameters ---------- ref_pdb_file : str Full file path to PDB file containing the reference pose. static_sel_str, mobile_sel_str : str MDAnalysis style selection string to select the atoms in ref_pdb_file that corresponds to the static and mobile monomers, respectively. Returns ------- CAPRI_class : pandas.Dataframe Table of the docking poses and their respective classifications. """ raise NotImplementedError
[docs] def get_CAPRI_class_table(self): """Returns the CAPRI classification table. Returns a table containing the CAPRI classification of each docking pose. Run the calc_CAPRI_class (:py:meth:`membrane <drsip.DR_SIP_Membrane.calc_CAPRI_class>` or :py:meth:`soluble <drsip.DR_SIP_Soluble.calc_CAPRI_class>`) method before calling this method. Returns ------- pandas.Dataframe Table of the docking poses with their respective classifications, % Native Contacts and iRMSD. """ if not self.CAPRI_assign_status: raise Exception('Run the calc_CAPRI_class method before ' 'running this method' ) return self.poses_CAPRI_class
[docs] def run(self): """ Executes the docking protocol. Needs to be implemented by the child class. """ raise NotImplementedError
def not_available(self): raise NotImplementedError
[docs]class DR_SIP_Membrane(DR_SIP_Base): """ DR-SIP's membrane protocol. Protocol applies the C\ :sub:`n` symmetry, tilt angle and distance restraints (optional) filters on docking poses. Currently, only supports docking results from ZDOCK 3.0.2. Parameters ---------- static_pdb, mobile_pdb : str Full path to the PDB files containing the static and mobile monomers. zdock_output_file : str Full path to the ZDOCK 3.0.2 generated output file. transmem_helix_sel_strs : lists of strings Define which residues are the transmembrane helices. Each string should select for 1 transmembrane helix using the MDAnalysis' `selection language <https://www.mdanalysis.org/docs/documentation_pages/selections.html>`_. dist_rest_file : str, optional Full path to the distance restraints file, defaults to an empty string. File is formatted such that each line corresponds to a distance restraint (tab delimited):: chain_1 resid_1 chain_2 resid_2 distance """ def __init__(self, reader_obj=None, transmem_helix_sel_strs=[], dist_rest_file=''): super(DR_SIP_Membrane, self).__init__(dist_rest_file) if reader_obj: self.reader_obj = reader_obj self.total_num_poses = self.reader_obj.total_num_poses self.static_segids = self.reader_obj.static_segids self.mobile_segids = self.reader_obj.mobile_segids self.static_sel = self.reader_obj.static_sel self.mobile_sel = self.reader_obj.mobile_sel self.poses_uni = self.reader_obj.poses_uni if dist_rest_file != '': self._parse_dist_res_input() self.has_dist_rest_data = True else: self.has_dist_rest_data = False if not transmem_helix_sel_strs: raise Exception('Transmembrane helix selection ' 'strings are required' ) self.transmem_helix_sel_strs = transmem_helix_sel_strs else: self.run = self.not_available self.calc_CAPRI_class = self.not_available self.write_topN_poses = self.not_available self.write_pose = self.not_available
[docs] def build_final_results_table(self, clusters, filtered_poses_data): """ Returns the final results table. Table contains the representative poses of each cluster ranked based on the size of the clusters (in descending order). For each representative pose the Cluster ID, Order of Symmetry, C\ :sub:`n` symmetry RMSD, Tilt Angle, Pearson and Spearman correlations are included in the table. The representative pose of each cluster are chosen based on 2 criteria: 1. The consensus order of symmetry within a cluster. Only poses that have the consensus order of symmetry are chosen. 2. The pose with the lowest C\ :sub:`n` symmetry RMSD. Use :py:meth:`get_final_results_table <drsip.DR_SIP_Soluble.get_final_results_table>` to return the table. Parameters ---------- clusters : list of int List containing the cluster number that each pose is in. filtered_poses_data : pandas.Dataframe Table containing the filter data and cluster number of each pose. """ cluster_counts = Counter(clusters) final_table = pd.DataFrame() for cluster_num, counts in cluster_counts.most_common(): if counts < 3: break cluster_members = filtered_poses_data[ filtered_poses_data['Cluster ID'] == cluster_num ].copy() consensus_order_of_symm = \ cluster_members['Order of Symmetry'].value_counts( ).index.values[0] consensus_cluster_members = cluster_members[ cluster_members['Order of Symmetry'] == consensus_order_of_symm ].copy() consensus_cluster_members['C-n Symm Rank'] = np.argsort( consensus_cluster_members['Cn Symm RMSD'].values) final_table = final_table.append( consensus_cluster_members.sort_values('C-n Symm Rank').iloc[0].copy(), sort=True ) if final_table.shape[0] != 0: final_table.drop('C-n Symm Rank', axis=1, inplace=True) else: final_table = self.filtered_poses_data.copy() final_table['DRSIP Rank'] = range(1, final_table.shape[0]+1) final_table.index.name = 'ZDOCK Rank' self.final_results = final_table.copy()
[docs] def calc_CAPRI_class(self, ref_pdb_file, static_sel_str, mobile_sel_str, num_mers): """ Classify each docking pose to one of four CAPRI classes. Each docking pose is compared to the reference pose in ref_pdb_file and assigned to one of the following classes: 'Incorrect', 'Acceptable', 'Medium' or 'High'. Uses the modified CAPRI criteria for HoTPs to classify the poses. See :py:meth:`docking_eval.assign_Cn_symm_CAPRI_class <docking_eval.assign_Cn_symm_CAPRI_class>`. The modified CAPRI criteria includes an additional criterion where the predicted size ("n") of the pose must be equal the size of the reference complex (num_mers). Parameters ---------- ref_pdb_file : str Full file path to PDB file containing the reference pose. static_sel_str, mobile_sel_str : str MDAnalysis style selection string to select the atoms in ref_pdb_file that corresponds to the static and mobile monomer, respectively. num_mers : int The size of the original C\ :sub:`n` symmetric HoTP complex. Returns ------- CAPRI_class : pandas.Dataframe Table of the docking poses with their respective classifications, % Native Contacts and iRMSD. """ if not self.run_status: self.run() universe = MDAnalysis.Universe(ref_pdb_file) self.poses_uni.trajectory[0] self.poses_CAPRI_class = docking_eval.assign_Cn_symm_CAPRI_class( universe.atoms, static_sel_str, mobile_sel_str, self.poses_uni.atoms, self.static_segids, self.mobile_segids, num_mers, self.filter_data['Order of Symmetry'].values) self.CAPRI_assign_status = True
[docs] def run(self, Cn_rmsd_cutoff=2.0, dist_rest_cutoff=0.3, tilt_angle_cutoff=0.541052, cluster_RMSD_cutoff=12.0, num_mers=None): """ Executes the membrane protocol. Parameters ---------- Cn_rmsd_cutoff : float, optional The C\ :sub:`n` symmetry RMSD filter cutoff value (in Angstrom). Default is 2 Angstrom. dist_rest_cutoff : float, optional The distance restraints filter cutoff value (correlation). Default correlation 0.3. tilt_angle_cutoff : float, optional The tilt angle filter cutoff value (radians). Default angle is 0.541052 rad (31 degrees). cluster_RMSD_cutoff : float, optional Cutoff used to cluster the docking poses. Default cutoff is 12 Angstrom. """ # Get the selection, coordinates and COM of the static monomer self.static_CA_sel = self.static_sel.select_atoms( 'segid ' + ' '.join(self.static_segids) + ' and name CA') static_CA_coord = self.static_CA_sel.positions.astype('float64') static_CA_com = static_CA_coord.mean(axis=0) static_CA_ori_coord = static_CA_coord - static_CA_com # Get the selection of the mobile monomer self.mobile_CA_sel = self.poses_uni.select_atoms( 'segid ' + ' '.join(self.mobile_segids) + ' and name CA') # Initialize the temp arrays to store the filter results and # the filter_data and filter_pass_results tables if self.has_dist_rest_data: filter_data_num_cols = 6 filter_pass_results_num_cols = 4 filter_data_col_names = ['DR Spearman Corr', 'DR Pearson Corr', 'Order of Symmetry', 'Cn Symm RMSD', 'Tilt Angle', 'Filter Pass Status' ] filter_pass_status_col_names = ['DR Pass Status', 'Cn Symm Pass Status', 'Tilt Angle Pass Status', 'All Pass Status' ] static_dist_rest_CA_coord = self.dist_res_static_sel.positions else: filter_data_num_cols = 4 filter_pass_results_num_cols = 3 filter_data_col_names = ['Order of Symmetry', 'Cn Symm RMSD', 'Tilt Angle', 'Filter Pass Status' ] filter_pass_status_col_names = ['Cn Symm Pass Status', 'Tilt Angle Pass Status', 'All Pass Status' ] if num_mers: filter_data_num_cols += 1 filter_pass_results_num_cols += 1 filter_data_col_names.insert(-1, 'Num Mers Pass Status') filter_pass_status_col_names.insert(-1, 'Num Mers Pass Status') tmp_filter_status = np.zeros( (self.total_num_poses, filter_pass_results_num_cols), dtype='bool' ) tmp_filter_results = np.zeros( (self.total_num_poses, filter_data_num_cols), dtype='float32' ) self.filter_data = pd.DataFrame(np.zeros( (self.total_num_poses, filter_data_num_cols), dtype='float32' ), index=range( 1, self.total_num_poses + 1), columns=filter_data_col_names ) self.filter_data.index.name = 'Original Rank' self.filter_pass_results = pd.DataFrame(np.zeros( (self.total_num_poses, filter_pass_results_num_cols), dtype='bool' ), index=range( 1, self.total_num_poses + 1), columns=filter_pass_status_col_names ) self.filter_pass_results.index.name = 'Original Rank' self.trans_vect = np.zeros( (self.total_num_poses, 3), dtype='float32') self.axis_of_rot = np.zeros( (self.total_num_poses, 3), dtype='float32') self.cylindrical_axis = np.zeros(3, dtype='float32') self.cylindrical_axis[:] = tilt_angle_criterion \ .get_cylindrical_axis( self.static_CA_sel, self.transmem_helix_sel_strs ) self.static_orient_unit_vec = static_CA_ori_coord[0] / \ np.linalg.norm(static_CA_ori_coord[0]) # Temp arrays to store mobile coords that have passed the filters poses_mobile_coords = np.zeros( (self.total_num_poses, static_CA_coord.shape[0], 3), dtype='float32') poses_mobile_coords_idx = 0 for pose_idx, ts in enumerate(self.poses_uni.trajectory): filter_results_idx = 0 filter_status_idx = 0 # Distance Restraints (DR) filter: if self.has_dist_rest_data: mobile_dist_rest_CA_coord = self.dist_res_mobile_sel.positions (current_distances, spearman_correl, pearson_correl, dist_rest_pass_status ) = dist_restraints_filter \ .dist_restraints_filter( static_dist_rest_CA_coord, mobile_dist_rest_CA_coord, self.dist_mat_idx, self.dist_rest, cutoff=dist_rest_cutoff ) self.dist_rest_conform_dist.values[pose_idx] = \ current_distances tmp_filter_results[pose_idx, filter_results_idx] = \ spearman_correl tmp_filter_results[pose_idx, filter_results_idx+1] = \ pearson_correl tmp_filter_status[pose_idx, filter_status_idx] = \ dist_rest_pass_status filter_results_idx += 2 filter_status_idx += 1 # Get the selection, coordinates and COM of the mobile monomer mobile_CA_coord = self.mobile_CA_sel.positions.astype( 'float64') mobile_CA_com = mobile_CA_coord.mean(axis=0) mobile_CA_ori_coord = mobile_CA_coord - mobile_CA_com diff_vect = mobile_CA_com - static_CA_com # SIP filter: # Cn symmetry criterion (axis_of_rot, static_trans_vect, pred_num_mers, RMSD, C_n_pass_status ) = Cn_symm_criterion.Cn_symm_criterion( static_CA_ori_coord, mobile_CA_ori_coord, diff_vect, Cn_rmsd_cutoff ) # Tilt angle criterion self.axis_of_rot[pose_idx] = axis_of_rot self.trans_vect[pose_idx] = static_trans_vect tilt_angle, tilt_angle_pass_status = tilt_angle_criterion \ .tilt_angle_criterion( self.cylindrical_axis, axis_of_rot, cutoff=tilt_angle_cutoff ) # Num of mers filter if num_mers: num_mers_status = pred_num_mers == num_mers else: num_mers_status = True # Store filter results tmp_filter_results[pose_idx, filter_results_idx] = pred_num_mers tmp_filter_results[pose_idx, filter_results_idx+1] = RMSD tmp_filter_status[pose_idx, filter_status_idx] = C_n_pass_status tmp_filter_results[pose_idx, filter_results_idx+2] = np.degrees( tilt_angle ) tmp_filter_status[pose_idx, filter_status_idx+1] = \ tilt_angle_pass_status if num_mers: tmp_filter_status[pose_idx, filter_status_idx+2] = \ num_mers_status tmp_filter_results[pose_idx, filter_results_idx+3] = \ num_mers_status filter_status_idx += 1 filter_results_idx += 1 if self.has_dist_rest_data: tmp_filter_status[pose_idx, filter_status_idx+2] = \ dist_rest_pass_status * \ C_n_pass_status * \ tilt_angle_pass_status * \ num_mers_status else: tmp_filter_status[pose_idx, filter_status_idx+2] = \ C_n_pass_status * \ tilt_angle_pass_status * \ num_mers_status tmp_filter_results[pose_idx, filter_results_idx+3] = \ tmp_filter_status[pose_idx, filter_status_idx+2] if tmp_filter_status[pose_idx, filter_status_idx+2]: poses_mobile_coords[poses_mobile_coords_idx] = mobile_CA_coord poses_mobile_coords_idx += 1 self.filter_data.values[:] = tmp_filter_results self.filter_pass_results.values[:] = tmp_filter_status self.filtered_poses_data = self.filter_data[ self.filter_pass_results['All Pass Status'].values ].copy().drop('Filter Pass Status', axis=1) if poses_mobile_coords_idx < 2: self.final_results = self.filtered_poses_data.copy() self.final_results['DRSIP Rank'] = range( 1, self.final_results.shape[0]+1 ) self.final_results.index.name = 'Original Rank' else: # Cluster remaining poses filtered_poses_dist_mat = struct_clustering \ .cal_rmsd_between_poses_membrane( static_CA_coord.astype('float32'), poses_mobile_coords[:poses_mobile_coords_idx] ) clusters = struct_clustering.cluster_poses( filtered_poses_dist_mat, cutoff=cluster_RMSD_cutoff) self.filtered_poses_data['Cluster ID'] = clusters # Select cluster representatives, then rank the clusters and # build the final results table self.build_final_results_table( clusters, self.filtered_poses_data) self.run_status = True
[docs] def write_results_file(self, filename): """ Write the top results to a CSV file. Parameters ---------- filename : str Full path to save the comma-seperated (CSV) file. """ if not self.run_status: self.run() common.makedir(filename) self.final_results.to_csv(filename)
[docs] def save_data(self, filename): """Save current session/instance to file. Parameters ---------- filename : str Full file path to save to """ if not self.run_status: self.run() temp_storage_dict = {} temp_storage_dict = self.reader_obj.save_data(temp_storage_dict) temp_storage_dict.update( {'has_dist_rest_data': self.has_dist_rest_data, 'static_orient_unit_vec': self.static_orient_unit_vec.tolist(), 'trans_vect': self.trans_vect.tolist(), 'axis_of_rot': self.axis_of_rot.tolist(), 'cylindrical_axis': self.cylindrical_axis.tolist(), 'protocol': 'membrane', 'transmem_helix_sel_strs': self.transmem_helix_sel_strs, 'run_status': self.run_status } ) save_load.save_pd_table( temp_storage_dict, self.filter_data, 'filter_data') save_load.save_pd_table( temp_storage_dict, self.filter_pass_results, 'filter_pass_results') save_load.save_pd_table( temp_storage_dict, self.filtered_poses_data, 'filtered_poses_data') save_load.save_pd_table( temp_storage_dict, self.final_results, 'final_results') if self.has_dist_rest_data: temp_storage_dict['dist_rest_file'] = \ save_load.convert_StrIO_or_file_to_str( self.dist_rest_file) save_load.save_pd_table( temp_storage_dict, self.dist_rest_conform_dist, 'dist_rest_conform_dist' ) if hasattr(self, 'poses_CAPRI_class'): save_load.save_pd_table( temp_storage_dict, self.poses_CAPRI_class, 'poses_CAPRI_class') common.makedir(filename) with open(filename, 'wb') as output_file: output_file.write(zlib.compress( umsgpack.packb(temp_storage_dict), 9))
[docs] def write_pose(self, pose_num, filename): """Write a C\ :sub:`n` symmetric complex to a PDB file. Given the original rank of a ZDOCK pose, write out the closest ideal C\ :sub:`n` symmetric complex to a PDB file. Parameters ---------- pose_num : int The pose's original ZDOCK rank. filename : str Filename to store the complex in PDB format. """ pose_idx = pose_num - 1 order_of_symm = np.int32( self.filter_data['Order of Symmetry'].values[pose_idx]) rot_mat = Cn_symm_criterion.get_rot_mat_arbitrary_axis( 2 * np.pi / order_of_symm, self.axis_of_rot[pose_idx]) current_pos = self.static_sel.positions - \ self.static_sel.center_of_mass() + \ self.trans_vect[pose_idx] num_atoms = self.static_sel.n_atoms complex_uni = MDAnalysis.Merge( *([self.static_sel] * order_of_symm)) current_select = complex_uni.select_atoms( 'bynum 1:%d' % (num_atoms * 1)) current_select.segments.segids = 'A' current_select.positions = current_pos for current_subunit_id in range(2, order_of_symm + 1): prev_subunit_id = current_subunit_id - 1 current_pos = current_pos.dot(rot_mat.T) current_select = complex_uni.select_atoms('bynum %d:%d' % ( num_atoms * prev_subunit_id + 1, num_atoms * current_subunit_id ) ) current_select.positions = current_pos current_select.segments.segids = string.uppercase[prev_subunit_id] common.makedir(filename) complex_uni.atoms.write(filename)
[docs] def write_topN_poses(self, folder, num_poses=20): """Write the top-N C\ :sub:`n` symmetric complexes to PDB files. The top-N complexes are from the DR-SIP final results table. Parameters ---------- folder : str Folder to write the PDB files of the top-N complexes Files are written out as Pose_1.pdb, Pose_2.pdb, ..., Pose_N.pdb. num_poses : int, optional The number of top-N poses to write out. Default: 20 """ if not self.run_status: self.run() if self.final_results.shape[0] < num_poses: num_poses = self.final_results.shape[0] for idx in xrange(num_poses): pose_num = self.final_results.index.values[idx] self.write_pose(pose_num, os.path.join( folder, 'Pose_%d.pdb' % (idx + 1)))
[docs]class DR_SIP_Soluble(DR_SIP_Base): """ DR-SIP's soluble protocol. Protocol applies the distance restraints filter on docking poses. Currently, only supports docking results from ZDOCK 3.0.2. Parameters ---------- static_pdb, mobile_pdb : str Full path to the PDB files containing the static and mobile monomers. zdock_output_file : str Full path to the ZDOCK 3.0.2 generated output file. dist_rest_file : str Full path to the distance restraints file. File is formatted such that each line corresponds to a distance restraint (tab delimited):: chain_1 resid_1 chain_2 resid_2 distance """ def __init__(self, reader_obj, dist_rest_file): super(DR_SIP_Soluble, self).__init__(dist_rest_file) if reader_obj: self.reader_obj = reader_obj self.total_num_poses = self.reader_obj.total_num_poses self.static_segids = self.reader_obj.static_segids self.mobile_segids = self.reader_obj.mobile_segids self.static_sel = self.reader_obj.static_sel self.mobile_sel = self.reader_obj.mobile_sel self.poses_uni = self.reader_obj.poses_uni if dist_rest_file != '': self._parse_dist_res_input() self.has_dist_rest_data = True else: self.has_dist_rest_data = False else: self.run = self.not_available self.calc_CAPRI_class = self.not_available self.write_topN_poses = self.not_available self.write_pose = self.not_available
[docs] def run(self, dist_rest_cutoff=0.3, cluster_RMSD_cutoff=12.0): """ Executes the soluble protocol. Parameters ---------- dist_rest_cutoff : float, optional The distance restraints filter cutoff value (correlation). Default correlation 0.3. cluster_RMSD_cutoff : float, optional Cutoff used to cluster the docking poses. Default cutoff is 12 Angstrom. """ # Get the selection, coordinates and COM of the static monomer self.static_CA_sel = self.static_sel.select_atoms( 'name CA') static_CA_coord = self.static_CA_sel.positions static_dist_rest_CA_coord = self.dist_res_static_sel.positions # Temp arrays to store the filter results tmp_filter_status = np.zeros( (self.total_num_poses, 1), dtype='bool') tmp_filter_results = np.zeros( (self.total_num_poses, 3), dtype='float32') self.filter_data = pd.DataFrame(np.zeros( (self.total_num_poses, 3), dtype='float32' ), index=range(1, self.total_num_poses + 1), columns=['DR Spearman Corr', 'DR Pearson Corr', 'Filter Pass Status' ] ) self.filter_data.index.name = 'Original Rank' self.filter_pass_results = pd.DataFrame(np.zeros( (self.total_num_poses), dtype='bool'), index=range(1, self.total_num_poses + 1), columns=['DR Pass Status'] ) self.filter_pass_results.index.name = 'Original Rank' # Temp arrays to store mobile coords that have passed the filters poses_mobile_coords = np.zeros( (self.total_num_poses, self.mobile_sel.select_atoms('name CA') .positions.shape[0], 3 ), dtype='float32') poses_mobile_coords_idx = 0 self.mobile_CA_sel = self.poses_uni.select_atoms( 'segid ' + ' '.join(self.mobile_segids) + ' and name CA') for pose_idx, ts in enumerate(self.poses_uni.trajectory): # Get the coordinates and COM of the mobile monomer mobile_CA_coord = self.mobile_CA_sel.positions.astype( 'float64') mobile_dist_rest_CA_coord = self.dist_res_mobile_sel.positions # Distance Restraints (DR) filter: (current_distances, spearman_correl, pearson_correl, dist_rest_pass_status ) = dist_restraints_filter.dist_restraints_filter( static_dist_rest_CA_coord, mobile_dist_rest_CA_coord, self.dist_mat_idx, self.dist_rest, cutoff=dist_rest_cutoff ) # Store filter results self.dist_rest_conform_dist.values[pose_idx] = current_distances tmp_filter_results[pose_idx, 0] = spearman_correl tmp_filter_results[pose_idx, 1] = pearson_correl tmp_filter_results[pose_idx, 2] = dist_rest_pass_status tmp_filter_status[pose_idx] = dist_rest_pass_status if tmp_filter_status[pose_idx]: poses_mobile_coords[poses_mobile_coords_idx] = mobile_CA_coord poses_mobile_coords_idx += 1 self.filter_data.values[:] = tmp_filter_results self.filter_pass_results.values[:] = tmp_filter_status self.filtered_poses_data = self.filter_data[ self.filter_pass_results['DR Pass Status'].values] \ .copy().drop('Filter Pass Status', axis=1) # Cluster remaining poses self.filtered_poses_dist_mat = \ struct_clustering.cal_rmsd_between_poses_soluble( static_CA_coord, poses_mobile_coords[:poses_mobile_coords_idx] ) clusters = struct_clustering.cluster_poses( self.filtered_poses_dist_mat, cutoff=cluster_RMSD_cutoff) self.filtered_poses_data['Cluster ID'] = clusters # Select cluster representatives, rank the clusters and build # the final results table self.build_final_results_table( clusters, self.filtered_poses_data) self.run_status = True
[docs] def write_results_file(self, filename): """ Write the top results to a CSV file. Parameters ---------- filename : str Full path to save the comma-seperated (CSV) file. """ if not self.run_status: self.run() common.makedir(filename) self.final_results.to_csv(filename)
[docs] def build_final_results_table(self, clusters, filtered_poses_data): """ Build the final results table. Table contains the representative poses of each cluster ranked based on the size of the clusters (in descending order). For each representative pose the Cluster ID, the Pearson and Spearman correlations are included in the table. The representative pose of each cluster has the highest Spearman correlation. If there are more than one pose with the same highest correlation, the one with the highest Pearson correlation is chosen. Use :py:meth:`get_final_results_table <drsip.DR_SIP_Soluble.get_final_results_table>` to return the table. Parameters ---------- clusters : list of int List containing the cluster number that each pose is in. filtered_poses_data : pandas.Dataframe Table containing the filter data and cluster number of each pose. """ cluster_counts = Counter(clusters) final_table = pd.DataFrame() for cluster_num, counts in cluster_counts.most_common(): if counts < 3: break cluster_members = \ filtered_poses_data[filtered_poses_data['Cluster ID'] == cluster_num].copy() cluster_members_max_Spearman = cluster_members[ np.abs( cluster_members['DR Spearman Corr'] - cluster_members['DR Spearman Corr'].max() ) < 10**(-5) ] if cluster_members_max_Spearman.shape[0] == 1: final_table = final_table.append( cluster_members_max_Spearman, sort=True) else: final_table = final_table.append( cluster_members_max_Spearman.loc[ cluster_members_max_Spearman['DR Pearson Corr'] .idxmax() ], sort=True) if final_table.shape[0] != 0: final_table['DRSIP Rank'] = range(1, final_table.shape[0]+1) final_table.index.name = 'Original Rank' self.final_results = final_table.copy()
[docs] def calc_CAPRI_class(self, ref_pdb_file, static_sel_str, mobile_sel_str): """ Classify each docking pose to one of four CAPRI classes. Each docking pose is compared to the reference pose in ref_pdb_file and assigned to one of the following classes: 'Incorrect', 'Acceptable', 'Medium' or 'High'. Uses the CAPRI criteria to classify the poses. See :py:meth:`docking_eval.assign_soluble_CAPRI_class <docking_eval.assign_soluble_CAPRI_class>`. Parameters ---------- ref_pdb_file : str Full file path to PDB file containing the reference pose. static_sel_str, mobile_sel_str : str MDAnalysis style selection string to select the atoms in ref_pdb_file that corresponds to the static and mobile monomer, respectively. Returns ------- CAPRI_class : pandas.Dataframe Table of the docking poses with their respective classifications, % Native Contacts and iRMSD. """ universe = MDAnalysis.Universe(ref_pdb_file) self.poses_uni.trajectory[0] self.poses_CAPRI_class = docking_eval.assign_soluble_CAPRI_class( universe.atoms, static_sel_str, mobile_sel_str, self.poses_uni.atoms ) self.CAPRI_assign_status = True return self.poses_CAPRI_class
[docs] def save_data(self, filename): """ Save current session to file. Parameters ---------- filename : str Full file path to save to """ if not self.run_status: self.run() temp_storage_dict = {} temp_storage_dict = self.reader_obj.save_data(temp_storage_dict) temp_storage_dict.update( {'has_dist_rest_data': self.has_dist_rest_data, 'protocol': 'soluble', 'run_status': self.run_status } ) save_load.save_pd_table( temp_storage_dict, self.filter_data, 'filter_data') save_load.save_pd_table( temp_storage_dict, self.filter_pass_results, 'filter_pass_results') save_load.save_pd_table( temp_storage_dict, self.filtered_poses_data, 'filtered_poses_data') save_load.save_pd_table( temp_storage_dict, self.final_results, 'final_results') temp_storage_dict['dist_rest_file'] = \ save_load.convert_StrIO_or_file_to_str(self.dist_rest_file) save_load.save_pd_table( temp_storage_dict, self.dist_rest_conform_dist, 'dist_rest_conform_dist' ) if hasattr(self, 'poses_CAPRI_class'): save_load.save_pd_table( temp_storage_dict, self.poses_CAPRI_class, 'poses_CAPRI_class') common.makedir(filename) with open(filename, 'wb') as output_file: output_file.write(zlib.compress( umsgpack.packb(temp_storage_dict), 9))
[docs] def write_pose(self, pose_num, filename): """ Write a pose to a PDB file. Parameters ---------- pose_num : int The pose's original ZDOCK rank. filename : str Filename of PDB file to store the docking pose. """ common.makedir(filename) self.poses_uni.trajectory[pose_num-1] self.poses_uni.atoms.write(filename)
[docs] def write_topN_poses(self, folder, num_poses=20): """ Write the top-N docking poses to PDB. Parameters ---------- folder : str Folder to write the PDB files of the top-N docking poses ranked by DR-SIP. Files are written out as Pose_1.pdb, Pose_2.pdb, ..., Pose_N.pdb. num_poses : int, optional The number of top poses to write out. Default: 20 """ if not self.run_status: self.run() if self.final_results.shape[0] < num_poses: num_poses = self.final_results.shape[0] for idx in xrange(num_poses): pose_num = self.final_results.index.values[idx] self.write_pose(pose_num, os.path.join( folder, 'Pose_%d.pdb' % (idx + 1)))