"""
========================================================
DR-SIP package (:mod:`drsip`)
========================================================
Package contains filters and protocols for predicting homo-oligomeric
transmembrane proteins (HoTPs).
Functions
---------
.. autofunction:: DR_SIP
.. autofunction:: load_data
Classes
-------
.. autoclass:: DR_SIP_Base
:members:
.. autoclass:: DR_SIP_Membrane
:members:
.. autoclass:: DR_SIP_Soluble
:members:
"""
import pandas as pd
import numpy as np
import os
import string
from collections import Counter
import docking_eval
import MDAnalysis
import umsgpack
import zlib
import Cn_symm_criterion
import dist_restraints_filter
import struct_clustering
import tilt_angle_criterion
import save_load
import drsip_common as common
import readers
[docs]def load_data(filename, reader_obj=None):
"""
Load saved DR_SIP data from file and return DR_SIP instance.
Parameters
----------
filename : str
Full path to save file.
Returns
-------
drsip_inst
Instance of DR_SIP with data from save file.
"""
with open(filename, 'rb') as input_file:
temp_storage_dict = umsgpack.unpackb(
zlib.decompress(input_file.read()))
if temp_storage_dict['protocol'] == 'soluble':
soluble_state = True
elif temp_storage_dict['protocol'] == 'membrane':
soluble_state = False
else:
raise ValueError('Invalid protocol type: {0}'.format(
temp_storage_dict['protocol']))
if temp_storage_dict['has_dist_rest_data']:
save_load.load_StrIO(temp_storage_dict, 'dist_rest_file')
dist_rest_file_input = temp_storage_dict.pop('dist_rest_file')
save_load.load_pd_table(
temp_storage_dict, 'dist_rest_conform_dist', pop=True)
else:
dist_rest_file_input = ''
if reader_obj is None:
if ('reader' in temp_storage_dict) and \
(temp_storage_dict['reader'] in readers.readers_dict):
reader_class = readers.readers_dict[temp_storage_dict['reader']]
reader_obj = reader_class.load_data(temp_storage_dict)
if not soluble_state:
transmem_helix_sel_strs = temp_storage_dict.pop('transmem_helix_sel_strs')
else:
transmem_helix_sel_strs = []
drsip_inst = DR_SIP(reader_obj,
dist_rest_file_input,
transmem_helix_sel_strs,
soluble_state
)
for pd_var in ['filter_data', 'filter_pass_results',
'filtered_poses_data', 'final_results']:
save_load.load_pd_table(temp_storage_dict, pd_var, pop=True)
if 'poses_CAPRI_class_val' in temp_storage_dict:
save_load.load_pd_table(
temp_storage_dict, 'poses_CAPRI_class', pop=True)
if not soluble_state:
temp_storage_dict['static_orient_unit_vec'] = np.array(
temp_storage_dict['static_orient_unit_vec'], dtype=np.float32)
temp_storage_dict['trans_vect'] = np.array(
temp_storage_dict['trans_vect'], dtype=np.float32)
temp_storage_dict['axis_of_rot'] = np.array(
temp_storage_dict['axis_of_rot'], dtype=np.float32)
temp_storage_dict['cylindrical_axis'] = np.array(
temp_storage_dict['cylindrical_axis'], dtype=np.float32)
drsip_inst.__dict__.update(temp_storage_dict)
return drsip_inst
[docs]def DR_SIP(reader_obj=None, dist_rest_file='', transmem_helix_sel_strs=[], soluble=False):
"""
Initialize the correct docking protocol.
The default protocol is the membrane protocol. To select the
soluble protocol, set soluble to True.
Currently, only supports docking results from ZDOCK 3.0.2.
Parameters
----------
static_pdb, mobile_pdb : str
Full path to the PDB files containing the static and mobile
monomers.
zdock_output_file : str
Full path to the ZDOCK 3.0.2 generated output file.
dist_rest_file : str, optional
Full path to the distance restraints file, defaults to an empty
string. When empty, the distance restraints filter is not used.
Required when soluble is set to True.
File is formatted such that each line corresponds to a distance
restraint (tab delimited)::
chain_id_1 res_id_1 chain_id_2 res_id_2 distance
transmem_helix_sel_strs : lists of strings, optional
Define which residues are the transmembrane helices. Each
string should select for 1 transmembrane helix using the
MDAnalysis' `selection language <https://www.mdanalysis.org/docs/documentation_pages/selections.html>`_. Required if soluble is False.
soluble : bool, optional
Defaults to False, which uses DR-SIP's membrane protocol. When
set to True, uses the membrane protocol.
"""
if soluble: # Soluble protocol
if dist_rest_file != '':
return DR_SIP_Soluble(reader_obj, dist_rest_file)
else:
raise IOError(
'No distance restraints file')
else: # Membrane protocol
# Check if transmem_helix_sel_strs is provided
if len(transmem_helix_sel_strs) > 0:
return DR_SIP_Membrane(reader_obj,
transmem_helix_sel_strs,
dist_rest_file
)
else:
raise ValueError("Transmembrane helix selection strings "
"(transmem_helix_sel_strs) are required to "
"perform the membrane protocol."
)
[docs]class DR_SIP_Base(object):
"""
DR_SIP base class.
For subclassing only.
Parameters
----------
static_pdb, mobile_pdb : str
Full path to the PDB files containing the static and mobile
monomers.
zdock_output_file : str
Full path to the ZDOCK 3.0.2 generated output file.
dist_rest_file : str, optional
Full path to the distance restraints file, defaults to an empty
string.
File is formatted such that each line corresponds to a distance
restraint (tab delimited)::
chain_id_1 res_id_1 chain_id_2 res_id_2 distance
chain_id_1 res_id_1 chain_id_2 res_id_2 distance
"""
def __init__(self, dist_rest_file=''):
if dist_rest_file != '':
self.dist_rest_file = dist_rest_file
self.run_status = False
def _parse_dist_res_input(self):
"""Parses the distance restraints input file into a Pandas table."""
self.dist_rest_table = pd.read_csv(
self.dist_rest_file, delimiter='\t', header=None, index_col=None)
self.dist_rest_table.columns = [
'Segid 1', 'Resid 1', 'Segid 2', 'Resid 2', 'Dist']
# Swap columns of rows to make sure that the segid 1 corresponds to
# the static monomer
swap_rows = np.where(
~self.dist_rest_table['Segid 1'].isin(self.static_segids))[0]
self.dist_rest_table.iloc[swap_rows, 0:4] = \
self.dist_rest_table.iloc[swap_rows][['Segid 2', 'Resid 2',
'Segid 1', 'Resid 1']].values
# Make sure each column's dtype is correct
self.dist_rest_table = self.dist_rest_table.astype(
dtype={'Segid 1': 'string',
'Resid 1': 'int32',
'Segid 2': 'string',
'Resid 2': 'int32',
'Dist': 'float32'})
# Make sure the segids are valid segids
segids_only = self.dist_rest_table.loc[:, [
'Segid 1', 'Segid 2']]
num_unknown_segids = np.where(~segids_only.isin(
np.unique(self.static_segids + self.mobile_segids)))[0].shape[0]
if num_unknown_segids > 0:
raise Exception(
'Invalid chain IDs in the distance restraints file:'
' Must be from the static/mobile structure'
)
# Check and see if there are pairs whose segids are identical
num_identical_segids = np.where(
segids_only.iloc[:, 0] == segids_only.iloc[:, 1])[0].shape[0]
if num_identical_segids > 0:
raise Exception(
'Distance restraints file contains pairs of residues'
' with the same chain ID'
)
# Select of residues involved in the the distance restraints
sel_str_format = '(segid {0} and resid {1})'.format
static_sel_list = [sel_str_format(*resid) for resid
in self.dist_rest_table[['Segid 1',
'Resid 1']].values
]
self.dist_res_static_sel_str = ' or '.join(
static_sel_list) + ' and name CA'
self.dist_res_static_sel = self.static_sel.select_atoms(
self.dist_res_static_sel_str)
mobile_sel_list = [sel_str_format(*resid) for resid
in self.dist_rest_table[['Segid 2',
'Resid 2']].values
]
self.dist_res_mobile_sel_str = ' or '.join(
mobile_sel_list) + ' and name CA'
self.dist_res_mobile_sel = self.poses_uni.select_atoms(
self.dist_res_mobile_sel_str)
seg_res_1_pairs_ordered = zip(
self.dist_res_static_sel.segids, self.dist_res_static_sel.resids)
seg_res_2_pairs_ordered = zip(
self.dist_res_mobile_sel.segids, self.dist_res_mobile_sel.resids)
# Indices for extracting the relevent residue pairs from the distance
# matrix
self.dist_mat_idx = [
[seg_res_1_pairs_ordered.index(
tuple(x[1].tolist())
)
for x
in self.dist_rest_table.loc[:, ['Segid 1', 'Resid 1']].iterrows()
],
[seg_res_2_pairs_ordered.index(
tuple(x[1].tolist())
)
for x
in self.dist_rest_table.loc[:, ['Segid 2', 'Resid 2']].iterrows()
]
]
# Distance restraints data
self.dist_rest = self.dist_rest_table.loc[:, 'Dist'].values
# Initialize table to store the distances of the residue pairs for
# each pose
col_name = ['{0}-{1}-{2}-{3}'.format(
*self.dist_rest_table.iloc[idx, 0:4])
for idx
in xrange(self.dist_rest_table.shape[0])]
self.dist_rest_conform_dist = pd.DataFrame(np.zeros(
(self.total_num_poses, self.dist_rest_table.shape[0])),
columns=col_name,
index=range(1, self.total_num_poses + 1)
)
[docs] def write_results_file(self, filename):
"""
Write the top results to a CSV file.
Parameters
----------
filename : str
Full path to save the comma-seperated (CSV) file.
"""
raise NotImplementedError
[docs] def build_final_results_table(self, clusters, filtered_poses_data):
"""
Returns the final results table.
Table contains the final ranking of the docking poses.
Needs to be implemented by the child class.
Parameters
----------
clusters : list of int
List containing the cluster number that each pose is in.
filtered_poses_data : pandas.Dataframe
Table containing the filter data and cluster number of
each pose.
Returns
-------
results_table : pandas.Dataframe
Final results table.
"""
raise NotImplementedError
[docs] def get_final_results_table(self):
"""Returns the final results table.
Returns
-------
pandas.Dataframe
Table of the final results.
"""
if not self.run_status:
self.run()
return self.final_results
[docs] def calc_CAPRI_class(self, ref_pdb_file, static_sel_str, mobile_sel_str):
"""
Classify each docking pose to one of four CAPRI classes.
Each docking pose is compared to the reference pose in
ref_pdb_file and assigned to one of the following classes:
'Incorrect', 'Acceptable', 'Medium' or 'High'.
Uses the CAPRI criteria to classify the poses. See
:py:class:`docking_eval.CAPRI_Criteria`.
Needs to be implemented by the child class.
Parameters
----------
ref_pdb_file : str
Full file path to PDB file containing the reference pose.
static_sel_str, mobile_sel_str : str
MDAnalysis style selection string to select the atoms in
ref_pdb_file that corresponds to the static and mobile
monomers, respectively.
Returns
-------
CAPRI_class : pandas.Dataframe
Table of the docking poses and their respective
classifications.
"""
raise NotImplementedError
[docs] def get_CAPRI_class_table(self):
"""Returns the CAPRI classification table.
Returns a table containing the CAPRI classification of each
docking pose.
Run the calc_CAPRI_class (:py:meth:`membrane <drsip.DR_SIP_Membrane.calc_CAPRI_class>` or :py:meth:`soluble <drsip.DR_SIP_Soluble.calc_CAPRI_class>`) method before
calling this method.
Returns
-------
pandas.Dataframe
Table of the docking poses with their respective
classifications, % Native Contacts and iRMSD.
"""
if not self.CAPRI_assign_status:
raise Exception('Run the calc_CAPRI_class method before '
'running this method'
)
return self.poses_CAPRI_class
[docs] def run(self):
"""
Executes the docking protocol.
Needs to be implemented by the child class.
"""
raise NotImplementedError
def not_available(self):
raise NotImplementedError
[docs]class DR_SIP_Membrane(DR_SIP_Base):
"""
DR-SIP's membrane protocol.
Protocol applies the C\ :sub:`n` symmetry, tilt angle and
distance restraints (optional) filters on docking poses.
Currently, only supports docking results from ZDOCK 3.0.2.
Parameters
----------
static_pdb, mobile_pdb : str
Full path to the PDB files containing the static and mobile
monomers.
zdock_output_file : str
Full path to the ZDOCK 3.0.2 generated output file.
transmem_helix_sel_strs : lists of strings
Define which residues are the transmembrane helices. Each
string should select for 1 transmembrane helix using the
MDAnalysis' `selection language <https://www.mdanalysis.org/docs/documentation_pages/selections.html>`_.
dist_rest_file : str, optional
Full path to the distance restraints file, defaults to an empty
string.
File is formatted such that each line corresponds to a distance
restraint (tab delimited)::
chain_1 resid_1 chain_2 resid_2 distance
"""
def __init__(self, reader_obj=None, transmem_helix_sel_strs=[], dist_rest_file=''):
super(DR_SIP_Membrane, self).__init__(dist_rest_file)
if reader_obj:
self.reader_obj = reader_obj
self.total_num_poses = self.reader_obj.total_num_poses
self.static_segids = self.reader_obj.static_segids
self.mobile_segids = self.reader_obj.mobile_segids
self.static_sel = self.reader_obj.static_sel
self.mobile_sel = self.reader_obj.mobile_sel
self.poses_uni = self.reader_obj.poses_uni
if dist_rest_file != '':
self._parse_dist_res_input()
self.has_dist_rest_data = True
else:
self.has_dist_rest_data = False
if not transmem_helix_sel_strs:
raise Exception('Transmembrane helix selection '
'strings are required'
)
self.transmem_helix_sel_strs = transmem_helix_sel_strs
else:
self.run = self.not_available
self.calc_CAPRI_class = self.not_available
self.write_topN_poses = self.not_available
self.write_pose = self.not_available
[docs] def build_final_results_table(self, clusters, filtered_poses_data):
"""
Returns the final results table.
Table contains the representative poses of each cluster ranked
based on the size of the clusters (in descending order). For each
representative pose the Cluster ID, Order of Symmetry,
C\ :sub:`n` symmetry RMSD, Tilt Angle, Pearson and Spearman
correlations are included in the table.
The representative pose of each cluster are chosen based on 2
criteria:
1. The consensus order of symmetry within a cluster. Only
poses that have the consensus order of symmetry are chosen.
2. The pose with the lowest C\ :sub:`n` symmetry RMSD.
Use :py:meth:`get_final_results_table <drsip.DR_SIP_Soluble.get_final_results_table>` to return the table.
Parameters
----------
clusters : list of int
List containing the cluster number that each pose is in.
filtered_poses_data : pandas.Dataframe
Table containing the filter data and cluster number of
each pose.
"""
cluster_counts = Counter(clusters)
final_table = pd.DataFrame()
for cluster_num, counts in cluster_counts.most_common():
if counts < 3:
break
cluster_members = filtered_poses_data[
filtered_poses_data['Cluster ID'] ==
cluster_num
].copy()
consensus_order_of_symm = \
cluster_members['Order of Symmetry'].value_counts(
).index.values[0]
consensus_cluster_members = cluster_members[
cluster_members['Order of Symmetry'] ==
consensus_order_of_symm
].copy()
consensus_cluster_members['C-n Symm Rank'] = np.argsort(
consensus_cluster_members['Cn Symm RMSD'].values)
final_table = final_table.append(
consensus_cluster_members.sort_values('C-n Symm Rank').iloc[0].copy(),
sort=True
)
if final_table.shape[0] != 0:
final_table.drop('C-n Symm Rank', axis=1, inplace=True)
else:
final_table = self.filtered_poses_data.copy()
final_table['DRSIP Rank'] = range(1, final_table.shape[0]+1)
final_table.index.name = 'ZDOCK Rank'
self.final_results = final_table.copy()
[docs] def calc_CAPRI_class(self, ref_pdb_file, static_sel_str, mobile_sel_str, num_mers):
"""
Classify each docking pose to one of four CAPRI classes.
Each docking pose is compared to the reference pose in
ref_pdb_file and assigned to one of the following classes:
'Incorrect', 'Acceptable', 'Medium' or 'High'.
Uses the modified CAPRI criteria for HoTPs to classify the
poses. See :py:meth:`docking_eval.assign_Cn_symm_CAPRI_class <docking_eval.assign_Cn_symm_CAPRI_class>`.
The modified CAPRI criteria includes an additional criterion
where the predicted size ("n") of the pose must be equal the
size of the reference complex (num_mers).
Parameters
----------
ref_pdb_file : str
Full file path to PDB file containing the reference pose.
static_sel_str, mobile_sel_str : str
MDAnalysis style selection string to select the atoms in
ref_pdb_file that corresponds to the static and mobile
monomer, respectively.
num_mers : int
The size of the original C\ :sub:`n` symmetric HoTP complex.
Returns
-------
CAPRI_class : pandas.Dataframe
Table of the docking poses with their respective
classifications, % Native Contacts and iRMSD.
"""
if not self.run_status:
self.run()
universe = MDAnalysis.Universe(ref_pdb_file)
self.poses_uni.trajectory[0]
self.poses_CAPRI_class = docking_eval.assign_Cn_symm_CAPRI_class(
universe.atoms, static_sel_str, mobile_sel_str,
self.poses_uni.atoms,
self.static_segids,
self.mobile_segids,
num_mers, self.filter_data['Order of Symmetry'].values)
self.CAPRI_assign_status = True
[docs] def run(self, Cn_rmsd_cutoff=2.0, dist_rest_cutoff=0.3, tilt_angle_cutoff=0.541052, cluster_RMSD_cutoff=12.0, num_mers=None):
"""
Executes the membrane protocol.
Parameters
----------
Cn_rmsd_cutoff : float, optional
The C\ :sub:`n` symmetry RMSD filter cutoff value (in
Angstrom). Default is 2 Angstrom.
dist_rest_cutoff : float, optional
The distance restraints filter cutoff value (correlation).
Default correlation 0.3.
tilt_angle_cutoff : float, optional
The tilt angle filter cutoff value (radians). Default
angle is 0.541052 rad (31 degrees).
cluster_RMSD_cutoff : float, optional
Cutoff used to cluster the docking poses. Default cutoff is
12 Angstrom.
"""
# Get the selection, coordinates and COM of the static monomer
self.static_CA_sel = self.static_sel.select_atoms(
'segid ' + ' '.join(self.static_segids) + ' and name CA')
static_CA_coord = self.static_CA_sel.positions.astype('float64')
static_CA_com = static_CA_coord.mean(axis=0)
static_CA_ori_coord = static_CA_coord - static_CA_com
# Get the selection of the mobile monomer
self.mobile_CA_sel = self.poses_uni.select_atoms(
'segid ' + ' '.join(self.mobile_segids) + ' and name CA')
# Initialize the temp arrays to store the filter results and
# the filter_data and filter_pass_results tables
if self.has_dist_rest_data:
filter_data_num_cols = 6
filter_pass_results_num_cols = 4
filter_data_col_names = ['DR Spearman Corr',
'DR Pearson Corr',
'Order of Symmetry',
'Cn Symm RMSD',
'Tilt Angle',
'Filter Pass Status'
]
filter_pass_status_col_names = ['DR Pass Status',
'Cn Symm Pass Status',
'Tilt Angle Pass Status',
'All Pass Status'
]
static_dist_rest_CA_coord = self.dist_res_static_sel.positions
else:
filter_data_num_cols = 4
filter_pass_results_num_cols = 3
filter_data_col_names = ['Order of Symmetry',
'Cn Symm RMSD',
'Tilt Angle',
'Filter Pass Status'
]
filter_pass_status_col_names = ['Cn Symm Pass Status',
'Tilt Angle Pass Status',
'All Pass Status'
]
if num_mers:
filter_data_num_cols += 1
filter_pass_results_num_cols += 1
filter_data_col_names.insert(-1, 'Num Mers Pass Status')
filter_pass_status_col_names.insert(-1, 'Num Mers Pass Status')
tmp_filter_status = np.zeros(
(self.total_num_poses,
filter_pass_results_num_cols),
dtype='bool'
)
tmp_filter_results = np.zeros(
(self.total_num_poses,
filter_data_num_cols),
dtype='float32'
)
self.filter_data = pd.DataFrame(np.zeros(
(self.total_num_poses,
filter_data_num_cols),
dtype='float32'
),
index=range(
1, self.total_num_poses + 1),
columns=filter_data_col_names
)
self.filter_data.index.name = 'Original Rank'
self.filter_pass_results = pd.DataFrame(np.zeros(
(self.total_num_poses,
filter_pass_results_num_cols),
dtype='bool'
),
index=range(
1, self.total_num_poses + 1),
columns=filter_pass_status_col_names
)
self.filter_pass_results.index.name = 'Original Rank'
self.trans_vect = np.zeros(
(self.total_num_poses, 3), dtype='float32')
self.axis_of_rot = np.zeros(
(self.total_num_poses, 3), dtype='float32')
self.cylindrical_axis = np.zeros(3, dtype='float32')
self.cylindrical_axis[:] = tilt_angle_criterion \
.get_cylindrical_axis(
self.static_CA_sel,
self.transmem_helix_sel_strs
)
self.static_orient_unit_vec = static_CA_ori_coord[0] / \
np.linalg.norm(static_CA_ori_coord[0])
# Temp arrays to store mobile coords that have passed the filters
poses_mobile_coords = np.zeros(
(self.total_num_poses, static_CA_coord.shape[0], 3),
dtype='float32')
poses_mobile_coords_idx = 0
for pose_idx, ts in enumerate(self.poses_uni.trajectory):
filter_results_idx = 0
filter_status_idx = 0
# Distance Restraints (DR) filter:
if self.has_dist_rest_data:
mobile_dist_rest_CA_coord = self.dist_res_mobile_sel.positions
(current_distances,
spearman_correl,
pearson_correl,
dist_rest_pass_status
) = dist_restraints_filter \
.dist_restraints_filter(
static_dist_rest_CA_coord,
mobile_dist_rest_CA_coord,
self.dist_mat_idx,
self.dist_rest,
cutoff=dist_rest_cutoff
)
self.dist_rest_conform_dist.values[pose_idx] = \
current_distances
tmp_filter_results[pose_idx, filter_results_idx] = \
spearman_correl
tmp_filter_results[pose_idx, filter_results_idx+1] = \
pearson_correl
tmp_filter_status[pose_idx, filter_status_idx] = \
dist_rest_pass_status
filter_results_idx += 2
filter_status_idx += 1
# Get the selection, coordinates and COM of the mobile monomer
mobile_CA_coord = self.mobile_CA_sel.positions.astype(
'float64')
mobile_CA_com = mobile_CA_coord.mean(axis=0)
mobile_CA_ori_coord = mobile_CA_coord - mobile_CA_com
diff_vect = mobile_CA_com - static_CA_com
# SIP filter:
# Cn symmetry criterion
(axis_of_rot,
static_trans_vect,
pred_num_mers,
RMSD,
C_n_pass_status
) = Cn_symm_criterion.Cn_symm_criterion(
static_CA_ori_coord,
mobile_CA_ori_coord,
diff_vect,
Cn_rmsd_cutoff
)
# Tilt angle criterion
self.axis_of_rot[pose_idx] = axis_of_rot
self.trans_vect[pose_idx] = static_trans_vect
tilt_angle, tilt_angle_pass_status = tilt_angle_criterion \
.tilt_angle_criterion(
self.cylindrical_axis,
axis_of_rot,
cutoff=tilt_angle_cutoff
)
# Num of mers filter
if num_mers:
num_mers_status = pred_num_mers == num_mers
else:
num_mers_status = True
# Store filter results
tmp_filter_results[pose_idx, filter_results_idx] = pred_num_mers
tmp_filter_results[pose_idx, filter_results_idx+1] = RMSD
tmp_filter_status[pose_idx, filter_status_idx] = C_n_pass_status
tmp_filter_results[pose_idx, filter_results_idx+2] = np.degrees(
tilt_angle
)
tmp_filter_status[pose_idx, filter_status_idx+1] = \
tilt_angle_pass_status
if num_mers:
tmp_filter_status[pose_idx, filter_status_idx+2] = \
num_mers_status
tmp_filter_results[pose_idx, filter_results_idx+3] = \
num_mers_status
filter_status_idx += 1
filter_results_idx += 1
if self.has_dist_rest_data:
tmp_filter_status[pose_idx, filter_status_idx+2] = \
dist_rest_pass_status * \
C_n_pass_status * \
tilt_angle_pass_status * \
num_mers_status
else:
tmp_filter_status[pose_idx, filter_status_idx+2] = \
C_n_pass_status * \
tilt_angle_pass_status * \
num_mers_status
tmp_filter_results[pose_idx, filter_results_idx+3] = \
tmp_filter_status[pose_idx, filter_status_idx+2]
if tmp_filter_status[pose_idx, filter_status_idx+2]:
poses_mobile_coords[poses_mobile_coords_idx] = mobile_CA_coord
poses_mobile_coords_idx += 1
self.filter_data.values[:] = tmp_filter_results
self.filter_pass_results.values[:] = tmp_filter_status
self.filtered_poses_data = self.filter_data[
self.filter_pass_results['All Pass Status'].values
].copy().drop('Filter Pass Status', axis=1)
if poses_mobile_coords_idx < 2:
self.final_results = self.filtered_poses_data.copy()
self.final_results['DRSIP Rank'] = range(
1,
self.final_results.shape[0]+1
)
self.final_results.index.name = 'Original Rank'
else:
# Cluster remaining poses
filtered_poses_dist_mat = struct_clustering \
.cal_rmsd_between_poses_membrane(
static_CA_coord.astype('float32'),
poses_mobile_coords[:poses_mobile_coords_idx]
)
clusters = struct_clustering.cluster_poses(
filtered_poses_dist_mat, cutoff=cluster_RMSD_cutoff)
self.filtered_poses_data['Cluster ID'] = clusters
# Select cluster representatives, then rank the clusters and
# build the final results table
self.build_final_results_table(
clusters, self.filtered_poses_data)
self.run_status = True
[docs] def write_results_file(self, filename):
"""
Write the top results to a CSV file.
Parameters
----------
filename : str
Full path to save the comma-seperated (CSV) file.
"""
if not self.run_status:
self.run()
common.makedir(filename)
self.final_results.to_csv(filename)
[docs] def save_data(self, filename):
"""Save current session/instance to file.
Parameters
----------
filename : str
Full file path to save to
"""
if not self.run_status:
self.run()
temp_storage_dict = {}
temp_storage_dict = self.reader_obj.save_data(temp_storage_dict)
temp_storage_dict.update(
{'has_dist_rest_data': self.has_dist_rest_data,
'static_orient_unit_vec':
self.static_orient_unit_vec.tolist(),
'trans_vect': self.trans_vect.tolist(),
'axis_of_rot': self.axis_of_rot.tolist(),
'cylindrical_axis': self.cylindrical_axis.tolist(),
'protocol': 'membrane',
'transmem_helix_sel_strs': self.transmem_helix_sel_strs,
'run_status': self.run_status
}
)
save_load.save_pd_table(
temp_storage_dict, self.filter_data, 'filter_data')
save_load.save_pd_table(
temp_storage_dict, self.filter_pass_results, 'filter_pass_results')
save_load.save_pd_table(
temp_storage_dict, self.filtered_poses_data, 'filtered_poses_data')
save_load.save_pd_table(
temp_storage_dict, self.final_results, 'final_results')
if self.has_dist_rest_data:
temp_storage_dict['dist_rest_file'] = \
save_load.convert_StrIO_or_file_to_str(
self.dist_rest_file)
save_load.save_pd_table(
temp_storage_dict,
self.dist_rest_conform_dist,
'dist_rest_conform_dist'
)
if hasattr(self, 'poses_CAPRI_class'):
save_load.save_pd_table(
temp_storage_dict, self.poses_CAPRI_class, 'poses_CAPRI_class')
common.makedir(filename)
with open(filename, 'wb') as output_file:
output_file.write(zlib.compress(
umsgpack.packb(temp_storage_dict), 9))
[docs] def write_pose(self, pose_num, filename):
"""Write a C\ :sub:`n` symmetric complex to a PDB file.
Given the original rank of a ZDOCK pose, write out the closest
ideal C\ :sub:`n` symmetric complex to a PDB file.
Parameters
----------
pose_num : int
The pose's original ZDOCK rank.
filename : str
Filename to store the complex in PDB format.
"""
pose_idx = pose_num - 1
order_of_symm = np.int32(
self.filter_data['Order of Symmetry'].values[pose_idx])
rot_mat = Cn_symm_criterion.get_rot_mat_arbitrary_axis(
2 * np.pi / order_of_symm, self.axis_of_rot[pose_idx])
current_pos = self.static_sel.positions - \
self.static_sel.center_of_mass() + \
self.trans_vect[pose_idx]
num_atoms = self.static_sel.n_atoms
complex_uni = MDAnalysis.Merge(
*([self.static_sel] * order_of_symm))
current_select = complex_uni.select_atoms(
'bynum 1:%d' % (num_atoms * 1))
current_select.segments.segids = 'A'
current_select.positions = current_pos
for current_subunit_id in range(2, order_of_symm + 1):
prev_subunit_id = current_subunit_id - 1
current_pos = current_pos.dot(rot_mat.T)
current_select = complex_uni.select_atoms('bynum %d:%d' % (
num_atoms * prev_subunit_id + 1,
num_atoms * current_subunit_id
)
)
current_select.positions = current_pos
current_select.segments.segids = string.uppercase[prev_subunit_id]
common.makedir(filename)
complex_uni.atoms.write(filename)
[docs] def write_topN_poses(self, folder, num_poses=20):
"""Write the top-N C\ :sub:`n` symmetric complexes to PDB files.
The top-N complexes are from the DR-SIP final results table.
Parameters
----------
folder : str
Folder to write the PDB files of the top-N complexes
Files are written out as Pose_1.pdb, Pose_2.pdb, ...,
Pose_N.pdb.
num_poses : int, optional
The number of top-N poses to write out. Default: 20
"""
if not self.run_status:
self.run()
if self.final_results.shape[0] < num_poses:
num_poses = self.final_results.shape[0]
for idx in xrange(num_poses):
pose_num = self.final_results.index.values[idx]
self.write_pose(pose_num, os.path.join(
folder, 'Pose_%d.pdb' % (idx + 1)))
[docs]class DR_SIP_Soluble(DR_SIP_Base):
"""
DR-SIP's soluble protocol.
Protocol applies the distance restraints filter on docking poses.
Currently, only supports docking results from ZDOCK 3.0.2.
Parameters
----------
static_pdb, mobile_pdb : str
Full path to the PDB files containing the static and mobile
monomers.
zdock_output_file : str
Full path to the ZDOCK 3.0.2 generated output file.
dist_rest_file : str
Full path to the distance restraints file.
File is formatted such that each line corresponds to a distance
restraint (tab delimited)::
chain_1 resid_1 chain_2 resid_2 distance
"""
def __init__(self, reader_obj, dist_rest_file):
super(DR_SIP_Soluble, self).__init__(dist_rest_file)
if reader_obj:
self.reader_obj = reader_obj
self.total_num_poses = self.reader_obj.total_num_poses
self.static_segids = self.reader_obj.static_segids
self.mobile_segids = self.reader_obj.mobile_segids
self.static_sel = self.reader_obj.static_sel
self.mobile_sel = self.reader_obj.mobile_sel
self.poses_uni = self.reader_obj.poses_uni
if dist_rest_file != '':
self._parse_dist_res_input()
self.has_dist_rest_data = True
else:
self.has_dist_rest_data = False
else:
self.run = self.not_available
self.calc_CAPRI_class = self.not_available
self.write_topN_poses = self.not_available
self.write_pose = self.not_available
[docs] def run(self, dist_rest_cutoff=0.3, cluster_RMSD_cutoff=12.0):
"""
Executes the soluble protocol.
Parameters
----------
dist_rest_cutoff : float, optional
The distance restraints filter cutoff value (correlation).
Default correlation 0.3.
cluster_RMSD_cutoff : float, optional
Cutoff used to cluster the docking poses. Default cutoff is
12 Angstrom.
"""
# Get the selection, coordinates and COM of the static monomer
self.static_CA_sel = self.static_sel.select_atoms(
'name CA')
static_CA_coord = self.static_CA_sel.positions
static_dist_rest_CA_coord = self.dist_res_static_sel.positions
# Temp arrays to store the filter results
tmp_filter_status = np.zeros(
(self.total_num_poses, 1), dtype='bool')
tmp_filter_results = np.zeros(
(self.total_num_poses, 3), dtype='float32')
self.filter_data = pd.DataFrame(np.zeros(
(self.total_num_poses, 3),
dtype='float32'
),
index=range(1, self.total_num_poses + 1),
columns=['DR Spearman Corr',
'DR Pearson Corr',
'Filter Pass Status'
]
)
self.filter_data.index.name = 'Original Rank'
self.filter_pass_results = pd.DataFrame(np.zeros(
(self.total_num_poses),
dtype='bool'),
index=range(1, self.total_num_poses + 1),
columns=['DR Pass Status']
)
self.filter_pass_results.index.name = 'Original Rank'
# Temp arrays to store mobile coords that have passed the filters
poses_mobile_coords = np.zeros(
(self.total_num_poses,
self.mobile_sel.select_atoms('name CA')
.positions.shape[0],
3
),
dtype='float32')
poses_mobile_coords_idx = 0
self.mobile_CA_sel = self.poses_uni.select_atoms(
'segid ' + ' '.join(self.mobile_segids) + ' and name CA')
for pose_idx, ts in enumerate(self.poses_uni.trajectory):
# Get the coordinates and COM of the mobile monomer
mobile_CA_coord = self.mobile_CA_sel.positions.astype(
'float64')
mobile_dist_rest_CA_coord = self.dist_res_mobile_sel.positions
# Distance Restraints (DR) filter:
(current_distances,
spearman_correl,
pearson_correl,
dist_rest_pass_status
) = dist_restraints_filter.dist_restraints_filter(
static_dist_rest_CA_coord,
mobile_dist_rest_CA_coord,
self.dist_mat_idx,
self.dist_rest,
cutoff=dist_rest_cutoff
)
# Store filter results
self.dist_rest_conform_dist.values[pose_idx] = current_distances
tmp_filter_results[pose_idx, 0] = spearman_correl
tmp_filter_results[pose_idx, 1] = pearson_correl
tmp_filter_results[pose_idx, 2] = dist_rest_pass_status
tmp_filter_status[pose_idx] = dist_rest_pass_status
if tmp_filter_status[pose_idx]:
poses_mobile_coords[poses_mobile_coords_idx] = mobile_CA_coord
poses_mobile_coords_idx += 1
self.filter_data.values[:] = tmp_filter_results
self.filter_pass_results.values[:] = tmp_filter_status
self.filtered_poses_data = self.filter_data[
self.filter_pass_results['DR Pass Status'].values] \
.copy().drop('Filter Pass Status', axis=1)
# Cluster remaining poses
self.filtered_poses_dist_mat = \
struct_clustering.cal_rmsd_between_poses_soluble(
static_CA_coord,
poses_mobile_coords[:poses_mobile_coords_idx]
)
clusters = struct_clustering.cluster_poses(
self.filtered_poses_dist_mat, cutoff=cluster_RMSD_cutoff)
self.filtered_poses_data['Cluster ID'] = clusters
# Select cluster representatives, rank the clusters and build
# the final results table
self.build_final_results_table(
clusters, self.filtered_poses_data)
self.run_status = True
[docs] def write_results_file(self, filename):
"""
Write the top results to a CSV file.
Parameters
----------
filename : str
Full path to save the comma-seperated (CSV) file.
"""
if not self.run_status:
self.run()
common.makedir(filename)
self.final_results.to_csv(filename)
[docs] def build_final_results_table(self, clusters, filtered_poses_data):
"""
Build the final results table.
Table contains the representative poses of each cluster ranked
based on the size of the clusters (in descending order). For each
representative pose the Cluster ID, the Pearson and Spearman
correlations are included in the table.
The representative pose of each cluster has the highest Spearman
correlation. If there are more than one pose with the same
highest correlation, the one with the highest Pearson
correlation is chosen.
Use :py:meth:`get_final_results_table <drsip.DR_SIP_Soluble.get_final_results_table>` to return the table.
Parameters
----------
clusters : list of int
List containing the cluster number that each pose is in.
filtered_poses_data : pandas.Dataframe
Table containing the filter data and cluster number of
each pose.
"""
cluster_counts = Counter(clusters)
final_table = pd.DataFrame()
for cluster_num, counts in cluster_counts.most_common():
if counts < 3:
break
cluster_members = \
filtered_poses_data[filtered_poses_data['Cluster ID'] ==
cluster_num].copy()
cluster_members_max_Spearman = cluster_members[
np.abs(
cluster_members['DR Spearman Corr'] -
cluster_members['DR Spearman Corr'].max()
) < 10**(-5)
]
if cluster_members_max_Spearman.shape[0] == 1:
final_table = final_table.append(
cluster_members_max_Spearman, sort=True)
else:
final_table = final_table.append(
cluster_members_max_Spearman.loc[
cluster_members_max_Spearman['DR Pearson Corr']
.idxmax()
],
sort=True)
if final_table.shape[0] != 0:
final_table['DRSIP Rank'] = range(1, final_table.shape[0]+1)
final_table.index.name = 'Original Rank'
self.final_results = final_table.copy()
[docs] def calc_CAPRI_class(self, ref_pdb_file, static_sel_str, mobile_sel_str):
"""
Classify each docking pose to one of four CAPRI classes.
Each docking pose is compared to the reference pose in
ref_pdb_file and assigned to one of the following classes:
'Incorrect', 'Acceptable', 'Medium' or 'High'.
Uses the CAPRI criteria to classify the poses. See
:py:meth:`docking_eval.assign_soluble_CAPRI_class <docking_eval.assign_soluble_CAPRI_class>`.
Parameters
----------
ref_pdb_file : str
Full file path to PDB file containing the reference pose.
static_sel_str, mobile_sel_str : str
MDAnalysis style selection string to select the atoms in
ref_pdb_file that corresponds to the static and mobile
monomer, respectively.
Returns
-------
CAPRI_class : pandas.Dataframe
Table of the docking poses with their respective
classifications, % Native Contacts and iRMSD.
"""
universe = MDAnalysis.Universe(ref_pdb_file)
self.poses_uni.trajectory[0]
self.poses_CAPRI_class = docking_eval.assign_soluble_CAPRI_class(
universe.atoms,
static_sel_str,
mobile_sel_str,
self.poses_uni.atoms
)
self.CAPRI_assign_status = True
return self.poses_CAPRI_class
[docs] def save_data(self, filename):
"""
Save current session to file.
Parameters
----------
filename : str
Full file path to save to
"""
if not self.run_status:
self.run()
temp_storage_dict = {}
temp_storage_dict = self.reader_obj.save_data(temp_storage_dict)
temp_storage_dict.update(
{'has_dist_rest_data': self.has_dist_rest_data,
'protocol': 'soluble',
'run_status': self.run_status
}
)
save_load.save_pd_table(
temp_storage_dict, self.filter_data, 'filter_data')
save_load.save_pd_table(
temp_storage_dict, self.filter_pass_results, 'filter_pass_results')
save_load.save_pd_table(
temp_storage_dict, self.filtered_poses_data, 'filtered_poses_data')
save_load.save_pd_table(
temp_storage_dict, self.final_results, 'final_results')
temp_storage_dict['dist_rest_file'] = \
save_load.convert_StrIO_or_file_to_str(self.dist_rest_file)
save_load.save_pd_table(
temp_storage_dict,
self.dist_rest_conform_dist,
'dist_rest_conform_dist'
)
if hasattr(self, 'poses_CAPRI_class'):
save_load.save_pd_table(
temp_storage_dict, self.poses_CAPRI_class, 'poses_CAPRI_class')
common.makedir(filename)
with open(filename, 'wb') as output_file:
output_file.write(zlib.compress(
umsgpack.packb(temp_storage_dict), 9))
[docs] def write_pose(self, pose_num, filename):
"""
Write a pose to a PDB file.
Parameters
----------
pose_num : int
The pose's original ZDOCK rank.
filename : str
Filename of PDB file to store the docking pose.
"""
common.makedir(filename)
self.poses_uni.trajectory[pose_num-1]
self.poses_uni.atoms.write(filename)
[docs] def write_topN_poses(self, folder, num_poses=20):
"""
Write the top-N docking poses to PDB.
Parameters
----------
folder : str
Folder to write the PDB files of the top-N docking poses
ranked by DR-SIP. Files are written out as Pose_1.pdb,
Pose_2.pdb, ..., Pose_N.pdb.
num_poses : int, optional
The number of top poses to write out. Default: 20
"""
if not self.run_status:
self.run()
if self.final_results.shape[0] < num_poses:
num_poses = self.final_results.shape[0]
for idx in xrange(num_poses):
pose_num = self.final_results.index.values[idx]
self.write_pose(pose_num, os.path.join(
folder, 'Pose_%d.pdb' % (idx + 1)))