Speaker
Description
Recent computational methods predict the binding affinity between pairs of proteins and ligands and are often applied to evaluate how well a candidate drug might attach to a target protein. In complex-based protein-ligand affinity prediction, binding strength is inferred from the known 3-dimensional structures of protein-ligand complexes. State-of-the-art methods employ neural networks, but certain geometric structures remain difficult to encode as learnable features. To address this challenge, we apply persistent cohomology (PCH)\cite{pch}, a mathematical technique that identifies “holes” in data across spatial dimensions. In the protein-ligand complex, PCH identifies 0-dimensional holes, representing clusters of nearby atoms; 1-dimensional holes, representing ring-like atomic structures; and 2-dimensional holes, representing cavities in the binding pocket. To represent PCH features in a learnable format, we introduce the spatially-aware topological graph. We locate spatial representatives of each n-dimensional PCH feature and compute the shape-preserving Wasserstein distance between each pair. These distances induce the spatially-aware topological graph, in which nodes represent n-dimensional holes and edges connect holes sharing a Wasserstein-neighborhood. Embedding nodes and edges with known physicochemical properties, we train a graph neural network on the PDBbind dataset\cite{pdb} of known protein-ligand complexes, achieving accuracy comparable to existing high-end methods.
Bibliography
% Persistent cohomology (PCH) reference, cited as \cite{pch} in the description.
% The DOI is stored bare in `doi`; `url` carries the resolver link.
@article{pch,
  author   = {Cang, Zixuan and Wei, Guo-Wei},
  title    = {Persistent Cohomology for Data With Multicomponent Heterogeneous Information},
  journal  = {SIAM Journal on Mathematics of Data Science},
  volume   = {2},
  number   = {2},
  pages    = {396--418},
  year     = {2020},
  doi      = {10.1137/19M1272226},
  url      = {https://doi.org/10.1137/19M1272226},
  abstract = {Persistent homology is a powerful tool for characterizing the topology of a data set at various geometric scales. When applied to the description of molecular structures, persistent homology can capture the multiscale geometric features and reveal certain interaction patterns in terms of topological invariants. However, in addition to the geometric information, there is a wide variety of nongeometric information of molecular structures, such as element types, atomic partial charges, atomic pairwise interactions, and electrostatic potential functions, that is not described by persistent homology. Although element-specific homology and electrostatic persistent homology can encode some nongeometric information into geometry based topological invariants, it is desirable to have a mathematical paradigm to systematically embed both geometric and nongeometric information, i.e., multicomponent heterogeneous information, into unified topological representations. To this end, we propose a persistent cohomology based framework for the enriched representation of data. In our framework, nongeometric information can either be distributed globally or reside locally on the datasets in the geometric sense and can be properly defined on topological spaces, i.e., simplicial complexes. Using the proposed persistent cohomology based framework, enriched barcodes are extracted from datasets to represent heterogeneous information. We consider a variety of datasets to validate the present formulation and illustrate the usefulness of the proposed method based on persistent cohomology. It is found that the proposed framework outperforms or at least matches the state-of-the-art methods in the protein-ligand binding affinity prediction from massive biomolecular datasets without resorting to any deep learning formulation.}
}
% PDBbind database reference, cited as \cite{pdb} in the description.
% `month` uses the predefined BibTeX macro (unquoted) so styles can localise it;
% {PDBbind} is braced so sentence-casing styles keep its capitalisation.
@article{pdb,
  author     = {Wang, Renxiao and Fang, Xueliang and Lu, Yipin and Wang, Shaomeng},
  title      = {The {PDBbind} Database: Collection of Binding Affinities for Protein-Ligand Complexes with Known Three-Dimensional Structures},
  journal    = {Journal of Medicinal Chemistry},
  volume     = {47},
  number     = {12},
  pages      = {2977--2980},
  month      = jun,
  year       = {2004},
  issn       = {0022-2623},
  doi        = {10.1021/jm030580l},
  url        = {https://doi.org/10.1021/jm030580l},
  abstract   = {We have screened the entire Protein Data Bank (Release No. 103, January 2003) and identified 5671 protein-ligand complexes out of 19 621 experimental structures. A systematic examination of the primary references of these entries has led to a collection of binding affinity data (K(d), K(i), and IC(50)) for a total of 1359 complexes. The outcomes of this project have been organized into a Web-accessible database named the PDBbind database.},
  bdsk-url-1 = {https://doi.org/10.1021/jm030580l}
}