Speaker
Description
Computational, data-driven predictors of protein-protein interactions have become an active research area in interaction proteomics. Interestingly, high-performing baseline sequence-based models designed to predict interacting protein pairs were recently found to have been trained on datasets of high similarity, resulting in data leakage during model training. In this study, we analyze protein-protein interaction networks (PPINs) and their biological and topological implications for computational models. We experiment with structure-aware machine learning models using various message-passing graph neural networks (MP-GNNs) paired with protein language models on different PPINs—an unprocessed dataset from \cite{pan} and a clustered dataset from \cite{clus}. We found that MP-GNN models trained on the clustered dataset and paired with the protein language model ProtTrans perform best. Moreover, we found that model performance changed noticeably when models were trained on an unprocessed dataset and evaluated on an unseen clustered dataset, and that models trained on clustered datasets predict interactions more reliably than those trained on unprocessed ones. In essence, our findings highlight that the framework can effectively learn interaction patterns, but its predictive power is still constrained by the quality and composition of the training data. Finally, we discuss the limitations of MP-GNNs as computational models for enhancing PPINs and, in turn, how they should be used as effective predictive models.
Bibliography
% Cited in the abstract (key: pan) as the source of the unprocessed PPIN dataset.
% The key is kept unchanged so the existing citation keeps resolving.
@article{pan,
  author    = {Pan, Xiao-Yong and Zhang, Ya-Nan and Shen, Hong-Bin},
  title     = {Large-Scale Prediction of Human Protein--Protein Interactions from Amino Acid Sequence Based on Latent Topic Features},
  journal   = {Journal of Proteome Research},
  volume    = {9},
  number    = {10},
  pages     = {4992--5001},
  year      = {2010},
  publisher = {ACS Publications}
}
% Cited in the abstract (key: clus) as the source of the clustered PPIN dataset.
% The key is kept unchanged so the existing citation keeps resolving.
% NOTE(review): pages appears to hold an article identifier rather than a
% page range; confirm against the publisher record.
@article{clus,
  author    = {Bernett, Judith and Blumenthal, David B. and List, Markus},
  title     = {Cracking the black box of deep sequence-based protein--protein interaction prediction},
  journal   = {Briefings in Bioinformatics},
  volume    = {25},
  number    = {2},
  pages     = {bbae076},
  year      = {2024},
  publisher = {Oxford University Press}
}
% Review article on machine-learning pitfalls in genomics.
% NOTE(review): no citation of this key is visible in the abstract text;
% confirm it is still needed.
@article{whalen2022navigating,
  author    = {Whalen, Sean and Schreiber, Jacob and Noble, William S. and Pollard, Katherine S.},
  title     = {Navigating the pitfalls of applying machine learning in genomics},
  journal   = {Nature Reviews Genetics},
  volume    = {23},
  number    = {3},
  pages     = {169--181},
  year      = {2022},
  publisher = {Nature Publishing Group UK},
  address   = {London}
}