% Journal article (MDPI) deposited in an institutional repository; `url` is the
% repository handle, `doi` identifies the published version.
% NOTE(review): journal/volume/number/pages are derived from the MDPI DOI
% pattern (ijms + 26 + 10 + 4741); confirm against the publisher page.
% Funding sources are kept in one `organization` field and keywords in one
% `keywords` field, because BibTeX processors keep only one value per field name.
@article{10481/105012,
  author       = {Suárez-Martín, Ignacio and Risso, Valeria Alejandra and Romero-Zaliz, Rocío and Sánchez Ruiz, José Manuel},
  title        = {Efficient Searches in Protein Sequence Space Through {AI}-Driven Iterative Learning},
  journal      = {International Journal of Molecular Sciences},
  volume       = {26},
  number       = {10},
  pages        = {4741},
  year         = {2025},
  month        = may,
  publisher    = {MDPI},
  doi          = {10.3390/ijms26104741},
  url          = {https://hdl.handle.net/10481/105012},
  keywords     = {Enzyme engineering, Viral protein evolution, Focused library screening},
  organization = {Instituto de Salud Carlos III (IHRC22/00004) and Next-Generation EU and MICIU/AEI/10.13039/501100011033 (PID2021-124534OB-100, PID2021-0125017OB-I00) and Enia Programs},
  abstract     = {The protein sequence space is vast. This fact, together with the prevalence of epistasis, hampers the engineering of novel enzymes through library screening and is a major obstacle to any attempt to predict natural protein evolution. Recently, specialized methodologies have been used to determine fitness data on $\sim$260,000 sequences for the gene of the enzyme dihydrofolate reductase and antibody affinity data for all combinations of the mutations present in the receptor-binding domain (RBD) of the Omicron strain of SARS-CoV-2 ($\sim$30,000 variants). We show that upon iterative training on a total of just a few hundred variants, various state-of-the-art AI tools (multi-layer perceptron, random forest, and XGBoost algorithms) find very high fitness variants of the enzyme and predict the antibody evasion patterns of the RBD. This work provides a basis for efficient, widely applicable, low-throughput experimental approaches to assess viral protein evolution and to engineer enzymes for biotechnological applications.},
}