Source code for drsip.struct_clustering

"""
=====================================================
Structure Clustering (:mod:`drsip.struct_clustering`)
=====================================================

Module contains functions implementing the structure clustering part of
the protocol.

Functions
---------

.. autofunction:: min_dist
.. autofunction:: calc_MSD
.. autofunction:: calc_MSD_mob_static
.. autofunction:: cluster_poses
.. autofunction:: calc_rot_static_coord
.. autofunction:: cal_rmsd_between_poses_membrane
.. autofunction:: cal_rmsd_between_poses_soluble
"""

from numba import jit, float32
import scipy as sp
import scipy.cluster
import scipy.spatial.distance
import numpy as np
import drsip_common


[docs]def min_dist(dist_mat_1, dist_mat_2): """Compare the elements of 2 matrices and keep the minimum values. Values are compared element-wise and the minimum of the 2 values is returned. Parameters ---------- dist_mat_1, dist_mat_2 : np.array NxN distance matrices between the 2 monomers in the docking pose. Where N are the number of atoms. Returns ------- np.array Returns a new distance matrix containing the minimum values for each element. """ temp_dist_mat = dist_mat_1.copy() swap_idx = np.where(dist_mat_1 > dist_mat_2) temp_dist_mat[swap_idx] = dist_mat_2[swap_idx] return temp_dist_mat
[docs]@jit(float32[:](float32[:, :], float32[:, :, :]), nopython=True) def calc_MSD(ref_coord, mobile_coords): """Compute the mean squared deviation (MSD). MSDs are computed between the reference and the mobile monomers from different poses. Parameters ---------- ref_coord : np.array Nx3 coordinate matrix of the reference monomer with N atoms. mobile_coords : np.array MxNx3 coordinate matrices of mobile monomers with N atoms from M poses. Returns ------- np.array M-dim vector containing the MSDs between the static and mobile monomers. """ num_coords = mobile_coords.shape[0] num_atoms = mobile_coords.shape[1] MSDs = np.zeros(num_coords, dtype=np.float32) for coord_idx in xrange(num_coords): MSDs[coord_idx] = np.sum( (mobile_coords[coord_idx] - ref_coord)**2) return MSDs/num_atoms
[docs]@jit(float32[:, :](float32[:, :, :], float32[:, :, :]), nopython=True) def calc_MSD_mob_static(rotated_static_coords, mobile_coords): """ Compute the MSD between mobile and static monomers. Parameters ---------- rotated_static_coords : np.array MxNx3 coordinate matrix of the static monomers with N atoms from the M poses after superimposing the mobile monomer to the static monomer. mobile_coords : np.array MxNx3 coordinate matrices of the mobile monomers with N atoms from M poses. Returns ------- np.array MxM matrix of the MSDs between all pairs of poses. """ num_poses = rotated_static_coords.shape[0] MSDs = np.zeros((num_poses, num_poses), dtype=np.float32) # Compute the first and last rows of the MSD MSDs[0, 1:] = calc_MSD(rotated_static_coords[0], mobile_coords[1:]) MSDs[num_poses-1, 0:num_poses - 1] = calc_MSD(rotated_static_coords[num_poses-1], mobile_coords[:num_poses-1]) # For the other rows: for row_idx in xrange(1, num_poses-1): # To avoid comparing the same pose (diagonal), split the # calculation into two parts: # Lower half of the MSD MSDs[row_idx, 0:row_idx] = calc_MSD( rotated_static_coords[row_idx], mobile_coords[:row_idx]) # Upper half of the MSDs MSDs[row_idx, row_idx + 1:] = calc_MSD(rotated_static_coords[row_idx], mobile_coords[row_idx+1:]) return MSDs
[docs]def cluster_poses(dist_mat, cutoff=12.0, criterion='distance'): """Cluster docking poses Docking poses are clustered with hierarchical clustering and average linkage criteria. The distance matrix contains the RMSD between the poses. See :py:meth:`cal_rmsd_between_poses_membrane <drsip.struct_clustering.cal_rmsd_between_poses_membrane>` for HoTPs or :py:meth:`cal_rmsd_between_poses_soluble <drsip.struct_clustering.cal_rmsd_between_poses_soluble>` for soluble proteins. Clustering is implemented by `SciPy <https://docs.scipy.org/doc/scipy-0.14.0/reference/cluster.hierarchy.html>`_. Parameters ---------- dist_mat : np.array MxM distance matrix containing the RMSD between all pairs of poses. Where M is the number of poses. cutoff : float, optional The cutoff used to determine when to stop combining clusters. This happens when the shortest distance between all pairs of clusters is >cutoff. Default cutoff is 12.0 Angstroms. criterion : str, optional Default criterion is 'distance' which uses the cophenetic distance to compute distances between members from different clusters. The cophenetic distance is the distance between the 2 clusters. See `SciPy documentation <https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.cluster.hierarchy.fcluster.html#scipy.cluster.hierarchy.fcluster>`_ for details. Returns ------- np.array Returns an array of length M where each element in the array corresponds to cluster number that the m'th pose has been assigned to. """ average_tree = sp.cluster.hierarchy.average( sp.spatial.distance.squareform(dist_mat)) return sp.cluster.hierarchy.fcluster(average_tree, cutoff, criterion=criterion)
[docs]def calc_rot_static_coord(static_coord, mobile_com, static_com, rot_mat): """Compute coordinate of the static monomer after superimposition. After the mobile monomer is superimposed onto one of the monomers from the other pose, the static monomer moves with the mobile monomer. This function computes the coordinates of this static monomer. Parameters ---------- static_coord : np.array Nx3 coordinate matrix of the static monomer with N atoms. mobile_com : np.array Vector pointing from the origin to the mobile monomer's center of mass (COM). COM of the mobile monomer in the pose before superimposition. static_com : np.array Vector pointing from the origin to the static monomer's center of mass (COM). COM of the static monomer in the pose before superimposition. rot_mat : np.array 3x3 rotation matrix used to superimpose the mobile monomer to the monomer on the other pose. Returns ------- np.array Nx3 coordinate matrix of the static monomer after superimposition of the mobile monomer. """ return (static_coord - mobile_com).dot(rot_mat.T) + static_com
[docs]def cal_rmsd_between_poses_membrane(static_coord, mobile_coords): """Compute the RMSD/distance between the docking poses of HoTPs. Each pose contains 2 monomers, the static monomer whose coordinates are fixed and the mobile monomer that is translated and rotated during docking. There are 4 ways to compute the RMSD between any 2 monomers, each chosen from one of the 2 poses: 1. Static monomers from both poses are superimposed and RMSD is computed for the mobile monomers. No superimposition is required for ZDOCK generated poses since the static monomers are fixed. 2. Mobile monomers from both poses are superimposed and RMSD is computed for the static monomers. 3. A mobile monomer from one pose is superimposed onto the static monomer in the second pose. The RMSD of monomers that were not superimposed are computed. 4. The monomers used to compute the RMSD in (3) is superimposed and while the other two monomers are used to compute the RMSD. This function computes all 4 RMSDs and picks the smallest RMSD to represent the distance between these 2 poses. Parameters ---------- static_coord : np.array Nx3 coordinate matrix of the static monomer with N atoms. mobile_coords : np.array MxNx3 coordinate matrices of the mobile monomers with N atoms from M poses. Returns ------- np.array MxM matrix containing the RMSDs between the poses. """ num_poses = mobile_coords.shape[0] num_atoms = mobile_coords.shape[1] static_com = static_coord.mean(axis=0) static_ori_coord = static_coord - static_com poses_mobile_com = mobile_coords.mean(axis=1) poses_mobile_ori_coords = mobile_coords - \ poses_mobile_com[:, None, :] aln_mob_to_stat_static_coords = np.zeros( (num_poses, num_atoms, 3), dtype=np.float32) aln_mob_to_mob_static_coord = np.zeros((1, num_atoms, 3), dtype=np.float32) # Superimpose 2 poses at the static monomer and compute the MSD # between the 2 mobile monomers. Store them in this matrix: stat_stat_msd = np.zeros((num_poses, num_poses), dtype=np.float32) # Superimpose 2 poses at the mobile monomer and compute the MSD # between the 2 static monomers. Store them in this matrix: mob_mob_msd = np.zeros((num_poses, num_poses), dtype=np.float32) # Superimpose 2 poses at the mobile monomer and static monomer and # compute the MSD between the 2 remaining monomers. Store them in # this matrix: mob_stat_msd = np.zeros((num_poses, num_poses), dtype=np.float32) for row_idx in range(num_poses): # Each mobile pose vs the other mobile poses. # MSD matrix is symmetric therefore computing only the upper # half of the matrix. stat_stat_msd[row_idx, row_idx+1:] = calc_MSD( mobile_coords[row_idx], mobile_coords[row_idx+1:]) # Superimpose the mobile monomer (pose 1) to the static monomer # (pose 2) and store the coordinates of the static monomer # (pose 1) after the superimposition. rot_mat_mobile_to_static = drsip_common.get_best_fit_rot_mat( poses_mobile_ori_coords[row_idx], static_ori_coord).astype('float32') aln_mob_to_stat_static_coords[row_idx, :] = calc_rot_static_coord( static_coord, poses_mobile_com[row_idx], static_com, rot_mat_mobile_to_static) for col_idx in range(row_idx+1, num_poses): # Superimpose the 2 poses at the mobile monomers and store # the static monomer after superimposition. rot_mat_mobile_to_mobile = drsip_common.get_best_fit_rot_mat( poses_mobile_ori_coords[row_idx], poses_mobile_ori_coords[col_idx]).astype('float32') aln_mob_to_mob_static_coord[0, :] = calc_rot_static_coord( static_coord, poses_mobile_com[row_idx], poses_mobile_com[col_idx], rot_mat_mobile_to_mobile) # Compute MSD between the 2 poses's static monomers for the # case where the mobile monomers are used for # superimposition. Symmetric matrix therefore we compute # only the upper half. mob_mob_msd[row_idx, col_idx] = calc_MSD( static_coord, aln_mob_to_mob_static_coord) # Compute the MSD between the mobile (pose 1) and static (pose 2) # monomer from the 2 poses after superimposition of the static # (pose 1) and mobile (pose 2) monomer and vice versa. # This matrix is not symmetric! mob_stat_msd[:] = calc_MSD_mob_static( aln_mob_to_stat_static_coords, mobile_coords) # Identify the minimum MSD between each pair of poses current_min_msd = min_dist(stat_stat_msd, mob_mob_msd) current_min_msd = min_dist(current_min_msd, np.triu(mob_stat_msd)) current_min_msd = min_dist(current_min_msd, np.triu(mob_stat_msd.T)) return np.sqrt(current_min_msd + current_min_msd.T)
[docs]def cal_rmsd_between_poses_soluble(static_coord, poses_mobile_coords): """Compute the RMSD between poses of soluble proteins. Parameters ---------- static_coord : np.array Nx3 coordinate matrix of the static monomer with N atoms. mobile_coords : np.array MxNx3 coordinate matrices of the mobile monomers with N atoms from M poses. Returns ------- np.array MxM matrix containing the RMSDs between the poses. """ num_poses = poses_mobile_coords.shape[0] msd_between_conf = np.zeros((num_poses, num_poses), dtype=np.float32) for row_idx in xrange(num_poses): row_mobile_coord = poses_mobile_coords[row_idx] msd_between_conf[row_idx, row_idx + 1:] = calc_MSD(row_mobile_coord, poses_mobile_coords[row_idx+1:]) return np.sqrt((msd_between_conf.T + msd_between_conf))