"""ModelCIF files generated by AlphaFold 2 deviate from the official ModelCIF
definition dictionary in specific cases. Here are functions to fix this.
"""
# Copyright (c) 2026, SIB - Swiss Institute of Bioinformatics and
# Biozentrum - University of Basel
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# Won't shorten the module atm, allow in Pylint
# pylint: disable=too-many-lines
from pathlib import Path
from timeit import default_timer as timer
import datetime
import gzip
import logging
import os
import shutil
import zipfile
import ihm.citations
import modelcif.dumper
import modelcif.model
import modelcif.protocol
import modelcif.qa_metric
import modelcif.associated
import numpy as np
logger = logging.getLogger(__name__)
# These classes follow python-modelcif, allow too few methods in Pylint.
# pylint: disable=too-few-public-methods
class _LPeptideAlphabetWithXO(ihm.LPeptideAlphabet):
    """L-peptide alphabet extended by the one-letter codes 'X' and 'O'.

    'X' is registered as an alias of the unknown residue ('UNK') and 'O' as
    the non-default amino acid pyrrolysine ('PYL'). 'U' needs no extra entry
    as it is already part of the default alphabet.
    """

    # The extra entry mimics the LPeptideAlphabet definition in
    # https://python-ihm.readthedocs.io/en/latest/_modules/ihm.html, the
    # values for 'O' were taken from https://files.rcsb.org/view/1NTH.cif.
    def __init__(self):
        """Set up the default alphabet and register the extra codes."""
        super().__init__()
        pyrrolysine = ihm.LPeptideChemComp(
            "PYL", "O", "O", "PYRROLYSINE", "C12 H21 N3 O3"
        )
        self._comps.update({"X": self._comps["UNK"], "O": pyrrolysine})
class _LocalPLDDT(modelcif.qa_metric.Local, modelcif.qa_metric.PLDDT):
    """Predicted accuracy according to the CA-only lDDT in [0,100]"""

    # Per-residue pLDDT metric.
    # NOTE(review): python-modelcif appears to use the docstring above as the
    # metric description written to the file - confirm before rewording it.
    software = None  # no software attached by default
    name = "pLDDT"
class _LocalPairwisePAE(
    modelcif.qa_metric.LocalPairwise, modelcif.qa_metric.PAE
):
    """Predicted aligned error (in Angstroms)"""

    # Residue-pair PAE metric.
    # NOTE(review): python-modelcif appears to use the docstring above as the
    # metric description written to the file - confirm before rewording it.
    software = None  # no software attached by default
    name = "PAE"
class _GlobalPLDDT(modelcif.qa_metric.Global, modelcif.qa_metric.PLDDT):
    """Predicted accuracy according to the CA-only lDDT in [0,100]"""

    # Whole-model pLDDT metric.
    # NOTE(review): python-modelcif appears to use the docstring above as the
    # metric description written to the file - confirm before rewording it.
    software = None  # no software attached by default
    name = "pLDDT"
class _GlobalPTM(modelcif.qa_metric.Global, modelcif.qa_metric.PTM):
    """Predicted accuracy according to the TM-score score in [0,1]"""

    # Whole-model pTM metric.
    # NOTE(review): python-modelcif appears to use the docstring above as the
    # metric description written to the file - confirm before rewording it.
    software = None  # no software attached by default
    name = "pTM"
[docs]
class GlobalIpTM(modelcif.qa_metric.Global, modelcif.qa_metric.IpTM):
    """Predicted protein-protein interface score based on TM-score in [0,1]"""

    # Whole-model ipTM metric.
    # NOTE(review): python-modelcif appears to use the docstring above as the
    # metric description written to the file - confirm before rewording it.
    software = None  # no software attached by default
    name = "ipTM"
[docs]
class GlobalConfRankMultimer(
    modelcif.qa_metric.Global, modelcif.qa_metric.NormalizedScore
):
    """Default ranking score used by AlphaFold-Multimer"""

    # Whole-model ranking confidence; the formula is part of the metric name.
    # NOTE(review): python-modelcif appears to use the docstring above as the
    # metric description written to the file - confirm before rewording it.
    software = None  # no software attached by default
    name = "ranking-confidence (ipTM*0.8+pTM*0.2)"
# pylint: enable=too-few-public-methods
[docs]
def get_cf_sequence_dbs(config_data):
    """Resolve the ColabFold databases requested in ``config_data``.

    The identifiers in ``config_data["seq_db_keys"]`` (plus the optional
    template database in ``config_data["tpl_db"]``) are looked up in a
    hardcoded table of known ColabFold databases. The matching
    :class:`modelcif.ReferenceDatabase` objects are stored, in request order
    with the template database last, in ``config_data["seq_dbs"]``.

    Args:
        config_data (dict): Configuration data dictionary. Keys read:
            ``seq_db_keys`` (list of :class:`str`) - database identifiers;
            the generic ``"UniRef"`` identifier is combined with
            ``ur30_db_version`` (:class:`str` or ``None``) to select a
            concrete UniRef30 release. ``tpl_db`` (:class:`str` or ``None``)
            - optional template database identifier, combined with
            ``tpl_db_version`` (:class:`str` or ``None``). Key written:
            ``seq_dbs`` (list of :class:`modelcif.ReferenceDatabase`).

    Returns:
        None

    Raises:
        ValueError: If ``"UniRef"`` is in ``seq_db_keys`` but
            ``ur30_db_version`` is ``None``.
        ValueError: If ``tpl_db`` is set but ``tpl_db_version`` is ``None``.
        ValueError: If a resolved database key is not found in the hardcoded
            set of known ColabFold databases.
    """
    # Uses HC list of known DBs used in ColabFold
    # -> see also notes in get_cf_config
    cf_url = "https://wwwuser.gwdg.de/~compbiol/colabfold/"
    hhs_url = (
        "https://wwwuser.gwdg.de/~compbiol/data/hhsuite/databases/"
        "hhsuite_dbs/"
    )
    known_dbs = {}
    # UniRef30 releases (identified by version)
    for version, tarball in (
        ("2021_03", "uniref30_2103.tar.gz"),
        ("2022_02", "uniref30_2202.tar.gz"),
        ("2023_02", "uniref30_2302.tar.gz"),
    ):
        known_dbs[f"UniRef_{version}"] = modelcif.ReferenceDatabase(
            "UniRef30", cf_url + tarball, version=version
        )
    # environmental sequences
    known_dbs["Environmental"] = modelcif.ReferenceDatabase(
        "ColabFold DB",
        cf_url + "colabfold_envdb_202108.tar.gz",
        version="2021_08",
    )
    # template DBs (identified by release date)
    for db_key, db_name, tarball, released in (
        (
            "PDB100_230517",
            "PDB100",
            "pdb100_foldseek_230517.tar.gz",
            datetime.date(2023, 5, 17),
        ),
        (
            "PDB70_211027",
            "PDB70",
            "pdb70_from_mmcif_211027.tar.gz",
            datetime.date(2021, 10, 27),
        ),
        (
            "PDB70_211117",
            "PDB70",
            "pdb70_from_mmcif_211117.tar.gz",
            datetime.date(2021, 11, 17),
        ),
        (
            "PDB70_220313",
            "PDB70",
            "pdb70_from_mmcif_220313.tar.gz",
            datetime.date(2022, 3, 13),
        ),
    ):
        known_dbs[db_key] = modelcif.ReferenceDatabase(
            db_name, hhs_url + tarball, release_date=released
        )

    # translate requested identifiers into keys of the table above
    wanted_keys = []
    for requested in config_data["seq_db_keys"]:
        if requested != "UniRef":
            wanted_keys.append(requested)
            continue
        if config_data["ur30_db_version"] is None:
            raise ValueError("Cannot use UniRef without version")
        wanted_keys.append(f"UniRef_{config_data['ur30_db_version']}")
    if config_data["tpl_db"] is not None:
        if config_data["tpl_db_version"] is None:
            raise ValueError("Cannot have tpl DB without version")
        wanted_keys.append(
            f"{config_data['tpl_db']}_{config_data['tpl_db_version']}"
        )

    # fetch the DB objects, refusing anything we do not know
    resolved_dbs = []
    for wanted_key in wanted_keys:
        if wanted_key not in known_dbs:
            raise ValueError(f"Unknown seq. DB {wanted_key}")
        resolved_dbs.append(known_dbs[wanted_key])
    config_data["seq_dbs"] = resolved_dbs
[docs]
def get_localcolabfold_software(version=None):
    """Describe LocalColabFold as a :class:`dict` for a software object.

    Args:
        version (str): Version of LocalColabFold. Should only be ``None`` if
            the version is genuinely unavailable.

    Returns:
        dict: Software metadata with the keys ``name``, ``classification``,
        ``description``, ``citation``, ``location``, ``type`` and
        ``version``. No citation is attached (``citation`` is ``None``).
    """
    software = {
        "name": "LocalColabFold",
        "classification": "model building",
        "description": "Structure prediction",
        "citation": None,
        "location": "https://github.com/YoshitakaMo/localcolabfold",
        "type": "package",
        "version": version,
    }
    return software
[docs]
def get_colabfold_software(version=None):
    """Describe ColabFold as a :class:`dict` for a software object.

    Args:
        version (str): Version of ColabFold. Should only be ``None`` if the
            version is genuinely unavailable.

    Returns:
        dict: Software metadata with the keys ``name``, ``classification``,
        ``description``, ``citation``, ``location``, ``type`` and
        ``version``. The citation is ``ihm.citations.colabfold``.
    """
    software = {
        "name": "ColabFold",
        "classification": "model building",
        "description": "Structure prediction",
        "citation": ihm.citations.colabfold,
        "location": "https://github.com/sokrypton/ColabFold",
        "type": "package",
        "version": version,
    }
    return software
[docs]
def get_mmseqs2_software(version=None):
    """Describe MMseqs2 as a :class:`dict` for a software object.

    Args:
        version (str): Version of MMseqs2. Should only be ``None`` if the
            version is genuinely unavailable.

    Returns:
        dict: Software metadata with the keys ``name``, ``classification``,
        ``description``, ``citation``, ``location``, ``type`` and
        ``version``. The citation is ``ihm.citations.mmseqs2``.
    """
    software = {
        "name": "MMseqs2",
        "classification": "data collection",
        "description": "Many-against-Many sequence searching",
        "citation": ihm.citations.mmseqs2,
        "location": "https://github.com/soedinglab/mmseqs2",
        "type": "package",
        "version": version,
    }
    return software
[docs]
def get_af2_software(version=None, is_multimer=False):
    """Describe AlphaFold 2 as a :class:`dict` for a software object.

    Args:
        version (str): Version of AlphaFold 2. Should only be ``None`` if the
            version is genuinely unavailable.
        is_multimer (bool): If ``True``, return metadata for
            AlphaFold-Multimer instead of AlphaFold 2.

    Returns:
        dict: Software metadata with the keys ``name``, ``classification``,
        ``description``, ``citation``, ``location``, ``type`` and
        ``version``. Only ``name`` and ``citation`` depend on
        ``is_multimer``.
    """
    if not is_multimer:
        sw_name = "AlphaFold"
        sw_citation = ihm.citations.alphafold2
    else:
        sw_name = "AlphaFold-Multimer"
        # bioRxiv preprint, hence neither PubMed ID, volume nor pages
        sw_citation = ihm.Citation(
            pmid=None,
            title="Protein complex prediction with AlphaFold-Multimer.",
            journal="bioRxiv",
            volume=None,
            page_range=None,
            year=2021,
            authors=[
                "Evans, R.",
                "O'Neill, M.",
                "Pritzel, A.",
                "Antropova, N.",
                "Senior, A.",
                "Green, T.",
                "Zidek, A.",
                "Bates, R.",
                "Blackwell, S.",
                "Yim, J.",
                "Ronneberger, O.",
                "Bodenstein, S.",
                "Zielinski, M.",
                "Bridgland, A.",
                "Potapenko, A.",
                "Cowie, A.",
                "Tunyasuvunakool, K.",
                "Jain, R.",
                "Clancy, E.",
                "Kohli, P.",
                "Jumper, J.",
                "Hassabis, D.",
            ],
            doi="10.1101/2021.10.04.463034",
        )
    # everything but name and citation is shared by both flavours
    return {
        "name": sw_name,
        "classification": "model building",
        "description": "Structure prediction",
        "citation": sw_citation,
        "location": "https://github.com/deepmind/alphafold",
        "type": "package",
        "version": version,
    }
[docs]
def get_cf_sw_plus_params(config_data, use_localcolabfold=False):
    """Collect software and parameters for a ColabFold protocol step.

    The list is ordered as: LocalColabFold (optional), ColabFold, MMseqs2
    (only if ``config_data["use_mmseqs"]`` is set) and AlphaFold 2 or
    AlphaFold-Multimer (chosen by ``config_data["use_multimer"]``). Only
    ColabFold gets a version (``config_data["cf_version"]``) and parameters
    (``config_data["cf_params"]``), all others get an empty parameter
    dictionary.

    Args:
        config_data (dict): ColabFold configuration data as returned by
            :func:`get_cf_config`.
        use_localcolabfold (bool): If ``True``, prepend LocalColabFold to the
            list of software entries.

    Returns:
        list[tuple[dict, dict]]: A list of ``(software, parameters)`` tuples
        suitable for use in a protocol.
    """
    entries = []
    if use_localcolabfold:
        entries.append((get_localcolabfold_software(), {}))
    colabfold = get_colabfold_software(config_data["cf_version"])
    entries.append((colabfold, config_data["cf_params"]))
    if config_data["use_mmseqs"]:
        entries.append((get_mmseqs2_software(), {}))
    alphafold = get_af2_software(is_multimer=config_data["use_multimer"])
    entries.append((alphafold, {}))
    return entries
[docs]
def get_cf_config(
    cf_config, ur30_db_version=None, tpl_db=None, tpl_db_version=None
):
    """Process a ColabFold configuration into a standardised data dictionary.

    The passed configuration is not modified, a copy is processed instead.

    Args:
        cf_config (dict): Raw ColabFold configuration data, typically read
            from a ColabFold configuration file. Must contain the keys
            ``version``, ``msa_mode``, ``model_type``, ``num_recycles``,
            ``num_models``, ``use_templates``, and ``rank_by``. Optional keys
            include ``commit``, ``pair_mode`` (required for multimer models
            using an MSA), ``recycle_early_stop_tolerance``,
            ``stop_at_score``, ``num_seeds``, ``use_amber``, and
            ``num_relax``. A ``None`` value for ``num_recycles`` or
            ``recycle_early_stop_tolerance`` is replaced by the ColabFold
            v1.5 default, any other ``None`` entry is dropped.
        ur30_db_version (str, optional): Version of the UniRef30 database
            used. Should only be ``None`` if the database was not used.
        tpl_db (str, optional): Template database used. Accepted values are
            ``"PDB70"``, ``"PDB100"``, or ``None`` if no template database
            was used. Ignored if ``use_templates`` is not set.
        tpl_db_version (str, optional): Version of the template database
            used. Should only be ``None`` if the database was not used.

    Returns:
        dict: Processed ColabFold configuration with the keys ``cf_params``,
        ``cf_version``, ``seq_db_keys``, ``use_mmseqs``, ``use_msa``,
        ``ur30_db_version``, ``tpl_db``, ``tpl_db_version``,
        ``use_multimer``, ``multimer_version``, ``description`` plus whatever
        :func:`get_cf_sequence_dbs` adds.

    Raises:
        ValueError: If ``msa_mode`` is not one of the known values.
        ValueError: If ``model_type`` is not one of the known values.
        ValueError: If ``rank_by`` is not one of the known values.
        ValueError: If ``pair_mode`` is missing or unknown for a multimer
            model which uses an MSA.
    """
    # Not going to reduce no. of variables, branches or statements at this
    # point, allow in Pylint
    # pylint: disable=too-many-locals,too-many-branches,too-many-statements
    # keep version indep. of params (and add commit since versions are meh)
    cf_version = cf_config["version"]
    if cf_config.get("commit") is not None:
        cf_version += f" ({cf_config['commit'][:7]})"
    # drop fields which are not relevant for model building
    cf_config = cf_config.copy()
    for key in ["num_queries", "commit", "version", "user_agent"]:
        cf_config.pop(key, None)

    # NOTE: following code from
    # https://github.com/sokrypton/ColabFold/blob/main/colabfold/batch.py to
    # understand config
    # -> see also https://github.com/sokrypton/ColabFold/wiki/v1.5.0
    # deal with old names (some settings changed name in v1.5)
    # -> code taken almost verbatim from https://github.com/sokrypton/ColabFold
    old_names = {
        "MMseqs2 (UniRef+Environmental)": "mmseqs2_uniref_env",
        "MMseqs2 (UniRef only)": "mmseqs2_uniref",
        "unpaired+paired": "unpaired_paired",
        "AlphaFold2-multimer-v1": "alphafold2_multimer_v1",
        "AlphaFold2-multimer-v2": "alphafold2_multimer_v2",
        "AlphaFold2-multimer-v3": "alphafold2_multimer_v3",
        "AlphaFold2-ptm": "alphafold2_ptm",
        "AlphaFold2": "alphafold2",
    }
    # keep the raw values for error messages: null entries get removed from
    # cf_config further down and could not be looked up anymore
    raw_msa_mode = cf_config["msa_mode"]
    msa_mode = old_names.get(raw_msa_mode, raw_msa_mode)
    if "pair_mode" in cf_config:
        pair_mode = old_names.get(
            cf_config["pair_mode"], cf_config["pair_mode"]
        )
    else:
        pair_mode = None
    raw_model_type = cf_config["model_type"]
    model_type = old_names.get(raw_model_type, raw_model_type)
    # a non-string model type is reported as unknown below instead of
    # breaking the substring test
    named_multimer = isinstance(model_type, str) and "multimer" in model_type

    # fix v1.5 defaults for num_recycles and recycle_early_stop_tolerance
    # -> def. (set as "null" in config):
    #    - num_recycles == 20 if alphafold2_multimer_v3 else 3
    #    - recycle_early_stop_tolerance == 0.5 if multimer else 0.0
    # -> valid from 1.5.0 until 1.5.5 (and probably later)
    # -> defined in alphafold/model/config.py of steineggerlab/alphafold repo
    if "num_recycles" in cf_config and cf_config["num_recycles"] is None:
        if named_multimer and model_type not in [
            "alphafold2_multimer_v1",
            "alphafold2_multimer_v2",
        ]:
            cf_config["num_recycles"] = 20
        else:
            cf_config["num_recycles"] = 3
    if (
        "recycle_early_stop_tolerance" in cf_config
        and cf_config["recycle_early_stop_tolerance"] is None
    ):
        cf_config["recycle_early_stop_tolerance"] = (
            0.5 if named_multimer else 0.0
        )
    # remove null config entries (ASSUME: None = use default)
    unfiltered_config = cf_config
    cf_config = {k: v for k, v in cf_config.items() if v is not None}

    # fetch relevant data
    # -> MSA mode: (sequence DBs, MMseqs2 used?, MSA used?)
    msa_modes = {
        "mmseqs2_uniref_env": (["UniRef", "Environmental"], True, True),
        "mmseqs2_uniref": (["UniRef"], True, True),
        "single_sequence": ([], False, False),
        "custom": ([], False, True),
    }
    if msa_mode not in msa_modes:
        raise ValueError(f"Unknown msa_mode {raw_msa_mode}")
    seq_dbs, use_mmseqs, use_msa = msa_modes[msa_mode]
    # -> model type: AF-Multimer version (None for the monomer model)
    multimer_versions = {
        # AF-Multimer as introduced in AlphaFold v2.1.0
        "alphafold2_multimer_v1": 1,
        # AF-Multimer as introduced in AlphaFold v2.2.0
        "alphafold2_multimer_v2": 2,
        # AF-Multimer as introduced in AlphaFold v2.3.0
        "alphafold2_multimer_v3": 3,
        "alphafold2_ptm": None,
    }
    if model_type not in multimer_versions:
        raise ValueError(f"Unknown model_type {raw_model_type}")
    multimer_version = multimer_versions[model_type]
    use_multimer = multimer_version is not None

    # write modeling description
    mdl_description = f"Model generated using ColabFold v{cf_version}"
    if use_multimer:
        mdl_description += f" with AlphaFold-Multimer (v{multimer_version})"
    else:
        mdl_description += " with AlphaFold"
    # early stopping feature of ColabFold
    upto_mdl = ""
    upto_rec = ""
    if cf_config.get("stop_at_score", 100) < 100:
        upto_mdl = "up to "
        upto_rec = "up to "
    if cf_config.get("recycle_early_stop_tolerance", 0) > 0:
        upto_rec = "up to "
    if cf_config.get("num_seeds", 1) > 1:
        mdl_str = (
            f"{cf_config['num_models'] * cf_config['num_seeds']} "
            f"models ({cf_config['num_seeds']} random seeds per "
            f"parameter set)"
        )
    else:
        mdl_str = f"{cf_config['num_models']} models"
    mdl_description += (
        f" producing {upto_mdl}{mdl_str} with {upto_rec}"
        f"{cf_config['num_recycles']} recycles each"
    )
    if cf_config.get("use_amber", False) or cf_config.get("num_relax", 0) > 0:
        mdl_description += ", with AMBER relaxation"
    else:
        mdl_description += ", without model relaxation"
    if cf_config["use_templates"]:
        # tpl_db == None meant to mean that custom templates were used
        # -> no need to stress it but just visible in search DBs
        mdl_description += ", using templates"
    else:
        mdl_description += ", without templates"
        tpl_db = None
        tpl_db_version = None
    # read from the unfiltered config: a missing key still is a KeyError but
    # a null entry is reported as unknown value
    rank_by = unfiltered_config["rank_by"]
    if rank_by == "plddt":
        mdl_description += ", ranked by pLDDT"
    elif rank_by == "ptmscore":
        mdl_description += ", ranked by pTM"
    elif rank_by == "multimer":
        mdl_description += ", ranked by 80*ipTM+20*pTM"
    else:
        raise ValueError(f"Unknown rank_by {rank_by}")
    if use_msa:
        mdl_description += ", starting from"
        msa_type = "MSA" if use_mmseqs else "custom MSA"
        if use_multimer:
            if pair_mode == "unpaired_paired":
                mdl_description += f" paired and unpaired {msa_type}s"
            elif pair_mode == "paired":
                mdl_description += f" paired {msa_type}s"
            elif pair_mode == "unpaired":
                mdl_description += f" unpaired {msa_type}s"
            elif pair_mode is None:
                raise ValueError(
                    "Key 'pair_mode' required with 'use_multimer=True'"
                )
            else:
                raise ValueError(f"Unknown pair_mode {cf_config['pair_mode']}")
        elif msa_type.startswith("M"):
            # "an MSA" vs. "a custom MSA"
            mdl_description += f" an {msa_type}"
        else:
            mdl_description += f" a {msa_type}"
        if use_mmseqs:
            mdl_description += f" from MMseqs2 ({'+'.join(seq_dbs)})"
    else:
        mdl_description += " without an MSA"
    mdl_description += "."

    config_data = {
        "cf_params": cf_config,
        "cf_version": cf_version,
        "seq_db_keys": seq_dbs,
        "use_mmseqs": use_mmseqs,
        "use_msa": use_msa,
        "ur30_db_version": ur30_db_version,
        "tpl_db": tpl_db,
        "tpl_db_version": tpl_db_version,
        "use_multimer": use_multimer,
        "multimer_version": multimer_version,
        "description": mdl_description,
    }
    # adds the reference DB objects to config_data
    get_cf_sequence_dbs(config_data)
    return config_data
def _get_res_num(r, use_auth=False):
"""Get residue number, optionally from PDB auth IDs."""
if use_auth:
return int(r.GetStringProp("pdb_auth_resnum"))
return r.number.num
[docs]
def get_sequence(chn, use_auth=False):
    """Get the sequence of an `OpenStructure`_ chain, inserting ``'-'`` for
    gaps.

    Residue numbers are expected to start at 1, so a chain starting at a
    higher number gets leading ``'-'`` characters. A first residue number
    below 1 does not add any leading gap.

    Args:
        chn (ost.mol.ChainHandle or ost.mol.ChainView): `OST`_ chain to extract
            the sequence from. Any object providing the following interface
            can be used as a drop-in replacement for the `OST`_ chain object:

            - ``chn.residues``: sequence of residue objects, each providing
            - ``chn.residues[i].number.num`` (:class:`int`): internal residue
              number
            - ``chn.residues[i].one_letter_code`` (:class:`str`): single-letter
              code
            - ``chn.residues[i].GetStringProp("pdb_auth_resnum")``
              (:class:`str`): author residue number as an integer string, only
              required if ``use_auth=True``
        use_auth (bool): If ``True``, use PDB author residue numbers instead
            of internal residue numbers.

    Returns:
        str: One-letter code sequence with ``'-'`` characters inserted for
        gaps. An empty string is returned for a chain without residues.

    Raises:
        ValueError: If the residue numbers are not strictly increasing along
            the chain (gaps could not be filled in a meaningful way).
    """
    pieces = []
    last_num = None
    for res in chn.residues:
        res_num = _get_res_num(res, use_auth)
        if last_num is None:
            # first residue: add gaps if it is not at num. 1 (a negative
            # count simply yields no gap characters)
            n_gaps = res_num - 1
        elif res_num <= last_num:
            # gap filling counts upwards from the last number, it would
            # never reach an equal or smaller residue number
            raise ValueError(
                "Residue numbers must be strictly increasing, got "
                + f"{res_num} after {last_num}"
            )
        else:
            n_gaps = res_num - last_num - 1
        pieces.append("-" * n_gaps + res.one_letter_code)
        last_num = res_num
    return "".join(pieces)
[docs]
def assemble_modelcif_software(soft_dict, params_dict):
    """Turn software metadata and parameters into a
    :class:`modelcif.SoftwareWithParameters` instance.

    Args:
        soft_dict (dict): Software metadata as returned by functions such as
            :func:`get_colabfold_software`. Must contain the keys ``name``,
            ``classification``, ``description``, ``location``, ``type``,
            ``version``, and ``citation``.
        params_dict (dict): Software parameters; every item becomes one
            :class:`modelcif.SoftwareParameter` with the key as parameter
            name and the value as parameter value.

    Returns:
        modelcif.SoftwareWithParameters: A ModelCIF software object with
        associated parameters.
    """
    # positional arguments of modelcif.Software, in call order
    positional = [
        soft_dict[key]
        for key in (
            "name",
            "classification",
            "description",
            "location",
            "type",
            "version",
        )
    ]
    software = modelcif.Software(*positional, citation=soft_dict["citation"])
    parameters = [
        modelcif.SoftwareParameter(param_name, param_value)
        for param_name, param_value in params_dict.items()
    ]
    return modelcif.SoftwareWithParameters(software, parameters)
def _get_ch_name(ch, use_auth=False):
"""Get chain name from auth. IDs if reading from mmCIF files."""
if use_auth:
return ch.GetStringProp("pdb_auth_chain_name")
return ch.name
class _OST2ModelCIF(modelcif.model.AbInitioModel):
    """Map an OST entity to a :class:`modelcif.model.AbInitioModel`.

    All arguments listed below are passed as keyword arguments. Those not
    listed are handed on to :class:`modelcif.model.AbInitioModel`.

    Args:
        assembly (modelcif.Assembly): Collection of asymmetric units for
            this model.
        asym (dict): Mapping of chain names to :class:`modelcif.AsymUnit`
            objects.
        ost_entity (ost.mol.Entity or ost.mol.EntityHandle): Loaded OST entity
            object.
        name (str, optional): Short name for this model.
        plddt_from_b_factors (bool): If ``True``, read per-residue pLDDT
            from B-factors instead of from ``scores_json``. The values (and
            their mean as ``plddt_global`` if that key is missing) are
            written into the passed ``scores_json`` dictionary.
        scores_json (dict): Score data to store. Supported keys are
            ``plddt``, ``plddt_global``, ``ptm``, ``iptm``, ``confrank``,
            and ``pae``.
        extra_global_scores (list): Additional global QA score objects to
            append to ``qa_metrics``.
        incl_pae (bool): If ``True``, include PAE scores. Defaults to
            ``True`` if ``"pae"`` is present in ``scores_json``.
        use_auth (bool): If ``True``, use PDB author IDs instead of
            internal IDs. Relevant when reading from mmCIF files.
        pae_digits (int): Number of decimal digits for PAE values.

    Raises:
        AssertionError: If atoms of a residue carry different B-factors
            while ``plddt_from_b_factors`` is set, or if the size of the
            local scores does not match the number of residues.
    """

    # not going to fight for reducing a single attribute, allow in Pylint
    # pylint: disable=too-many-instance-attributes
    # NOTE: consistency checks raise AssertionError explicitly instead of
    # using assert statements, so they are also executed with "python -O"
    def __init__(self, *args, **kwargs):
        """Initialise a model"""
        self.ost_entity = kwargs.pop("ost_entity")
        self.asym = kwargs.pop("asym")
        # optional arguments
        self.plddt_from_b_factors = kwargs.pop("plddt_from_b_factors", False)
        self.scores_json = kwargs.pop("scores_json", {})
        self.extra_global_scores = kwargs.pop("extra_global_scores", [])
        self.incl_pae = kwargs.pop("incl_pae", "pae" in self.scores_json)
        self.use_auth = kwargs.pop("use_auth", False)
        self.pae_digits = kwargs.pop("pae_digits", 3)

        # get pLDDT if needed
        if self.plddt_from_b_factors:
            self.scores_json["plddt"] = []
            for res in self.ost_entity.residues:
                b_factors = {atm.b_factor for atm in res.atoms}
                # must all be equal!
                if len(b_factors) != 1:
                    raise AssertionError(
                        "Expected exactly one B-factor value per residue, "
                        + f"found {len(b_factors)}"
                    )
                self.scores_json["plddt"].append(b_factors.pop())
            if "plddt_global" not in self.scores_json:
                self.scores_json["plddt_global"] = np.mean(
                    self.scores_json["plddt"]
                )

        # check lengths for local scores (cannot deal with unmodelled gaps here)
        exp_len = self.ost_entity.residue_count
        if "plddt" in self.scores_json:
            if len(self.scores_json["plddt"]) != exp_len:
                raise AssertionError(
                    f"Expected {exp_len} pLDDT values, found "
                    + f"{len(self.scores_json['plddt'])}"
                )
        if self.incl_pae:
            if "pae" not in self.scores_json:
                raise AssertionError("PAE requested but no 'pae' in scores")
            pae = self.scores_json["pae"]
            if len(pae) != exp_len:
                raise AssertionError(
                    f"Expected {exp_len} PAE rows, found {len(pae)}"
                )
            # every row is indexed per residue in add_scores
            if not all(len(pae_row) == exp_len for pae_row in pae):
                raise AssertionError(
                    f"Expected {exp_len} values in every PAE row"
                )

        super().__init__(*args, **kwargs)

    def get_atoms(self):
        """Yield atom records for writing to a ModelCIF file via
        :mod:`modelcif`."""
        # ToDo [internal]: Take B-factor out since its not a B-factor?
        # NOTE: this assumes that _get_res_num maps residue to pos. in seqres
        #       within asym
        for atm in self.ost_entity.atoms:
            yield modelcif.model.Atom(
                asym_unit=self.asym[_get_ch_name(atm.chain, self.use_auth)],
                seq_id=_get_res_num(atm.residue, self.use_auth),
                atom_id=atm.name,
                type_symbol=atm.element,
                x=atm.pos[0],
                y=atm.pos[1],
                z=atm.pos[2],
                het=atm.is_hetatom,
                biso=atm.b_factor,
                occupancy=atm.occupancy,
            )

    def add_scores(self):
        """Add QA metrics from AF2 scores.

        Appends to ``qa_metrics`` in this order: global scores found in
        ``scores_json``, ``extra_global_scores``, per-residue pLDDT (if
        ``plddt`` is in ``scores_json``) and pairwise PAE (if ``incl_pae``
        is set).
        """
        # global scores
        for key, score_class in [
            ("plddt_global", _GlobalPLDDT),
            ("ptm", _GlobalPTM),
            ("iptm", GlobalIpTM),
            ("confrank", GlobalConfRankMultimer),
        ]:
            if key in self.scores_json:
                self.qa_metrics.append(score_class(self.scores_json[key]))
        # extra ones
        self.qa_metrics.extend(self.extra_global_scores)

        # NOTE: none of the below expected to work if we have unmodelled gaps!
        # chain name & residue number per residue, in the order local scores
        # are indexed; fetched once as the pairwise loop would otherwise
        # repeat the lookups for every pair of residues
        res_ids = [
            (
                _get_ch_name(chn, self.use_auth),
                _get_res_num(res, self.use_auth),
            )
            for chn in self.ost_entity.chains
            for res in chn.residues
        ]
        # local scores
        has_plddt = "plddt" in self.scores_json
        lpae = []
        for i, (ch_name_i, res_num_i) in enumerate(res_ids):
            # local pLDDT
            if has_plddt:
                self.qa_metrics.append(
                    _LocalPLDDT(
                        self.asym[ch_name_i].residue(res_num_i),
                        self.scores_json["plddt"][i],
                    )
                )
            # pairwise alignment error
            if self.incl_pae:
                pae_row = self.scores_json["pae"][i]
                for j, (ch_name_j, res_num_j) in enumerate(res_ids):
                    lpae.append(
                        _LocalPairwisePAE(
                            self.asym[ch_name_i].residue(res_num_i),
                            self.asym[ch_name_j].residue(res_num_j),
                            round(pae_row[j], self.pae_digits),
                        )
                    )
        # pairwise scores go last, after all per-residue scores
        if self.incl_pae:
            self.qa_metrics.extend(lpae)
def _get_modelcif_entities(target_ents, asym_units, system):
    """Create ModelCIF entities and asymmetric units.

    For every item of ``target_ents`` one :class:`modelcif.Entity` is
    created and appended to ``system.entities``. For each of its
    ``pdb_chain_ids`` a :class:`modelcif.AsymUnit` is stored in
    ``asym_units`` under the chain ID. Nothing is returned, ``asym_units``
    and ``system`` are filled in place.
    """
    extended_alphabet = _LPeptideAlphabetWithXO()
    for target in target_ents:
        # combine into ModelCIF entity
        entity = modelcif.Entity(
            target["seqres"],
            description=target["description"],
            alphabet=extended_alphabet,
            source=target["source"],
            references=target["references"],
        )
        # NOTE: this assigns (potentially new) alphabetic chain names
        for chain_id in target["pdb_chain_ids"]:
            asym_unit = modelcif.AsymUnit(
                entity,
                strand_id=chain_id,
                auth_seq_id_map=target["auth_seq_id_map"],
            )
            asym_units[chain_id] = asym_unit
        system.entities.append(entity)
def _get_modelcif_protocol_software(js_step):
    """Assemble software entries for a ModelCIF protocol step.

    Returns a :class:`modelcif.SoftwareGroup` built from the
    ``(software, parameters)`` pairs in ``js_step["software_plus_params"]``
    or ``None`` if there are no such pairs.
    """
    # new setup in python-modelcif (as of late 2023): params with each SW
    software = [
        assemble_modelcif_software(soft_dict, params_dict)
        for soft_dict, params_dict in js_step["software_plus_params"]
    ]
    # group and done...
    return modelcif.SoftwareGroup(software) if software else None
def _get_modelcif_protocol_data(
    data_labels, target_entities, models, ref_dbs, acc_data, dg_cache=None
):
    """Assemble data for a ModelCIF protocol step.

    The labels ``target_sequences``, ``ref_dbs`` and ``models`` add all
    items of the corresponding argument, any other label must be a key of
    ``acc_data`` whose value is added as a single item.

    Groups are cached in ``dg_cache`` by their sorted labels, so requesting
    the same set of labels again returns the very same object. This is
    needed to remove duplicates in ModelCIF.

    Raises:
        RuntimeError: If a label is neither one of the fixed labels nor a key
            of ``acc_data``.
    """
    # No idea how to reduce arguments here, allow in Pylint
    # pylint: disable=too-many-positional-arguments,too-many-arguments
    if dg_cache is None:
        dg_cache = {}
    cache_key = tuple(sorted(data_labels))
    try:
        return dg_cache[cache_key]
    except KeyError:
        pass  # not cached yet, build it below

    # labels which add a whole collection of objects
    collections = {
        "target_sequences": target_entities,
        "ref_dbs": ref_dbs,
        "models": models,
    }
    data_group = modelcif.data.DataGroup()
    for label in data_labels:
        if label in collections:
            data_group.extend(collections[label])
        elif label in acc_data:
            data_group.append(acc_data[label])
        else:
            raise RuntimeError(f"Unknown protocol data: '{label}'")
    dg_cache[cache_key] = data_group
    return data_group
def _get_modelcif_protocol(
    protocol_steps, target_entities, models, ref_dbs, acc_data
):
    """Create the protocol for the ModelCIF file.

    Every item of ``protocol_steps`` is turned into one
    :class:`modelcif.protocol.Step`, using its keys ``input``, ``output``,
    ``name``, ``details``, ``method_type`` and ``software_plus_params``.
    """
    # cached DataGroup objects per protocol
    dg_cache = {}
    # arguments shared by all data lookups
    data_args = (target_entities, models, ref_dbs, acc_data, dg_cache)
    protocol = modelcif.protocol.Protocol()
    for js_step in protocol_steps:
        software = _get_modelcif_protocol_software(js_step)
        step_input = _get_modelcif_protocol_data(js_step["input"], *data_args)
        step_output = _get_modelcif_protocol_data(
            js_step["output"], *data_args
        )
        step = modelcif.protocol.Step(
            input_data=step_input,
            output_data=step_output,
            name=js_step["name"],
            details=js_step["details"],
            software=software,
        )
        protocol.steps.append(step)
        # method type is set as attribute after creating the step
        step.method_type = js_step["method_type"]
    return protocol
def _get_assoc_pae_file(entry_id, mdl_name):
    """Generate an associated file object to move PAE to an extra file.

    The file is named ``<mdl_name>_local_pairwise_qa.cif`` and declared to
    take the ``_ma_qa_metric_local_pairwise`` category, with
    ``_ma_qa_metric`` listed in ``copy_categories``.
    """
    file_name = f"{mdl_name}_local_pairwise_qa.cif"
    entry_details = (
        "This file is an associated file consisting "
        "of local pairwise QA metrics. This is a partial mmCIF "
        "file and can be validated by merging with the main "
        "mmCIF file containing the model coordinates and other "
        "associated data."
    )
    return modelcif.associated.QAMetricsFile(
        file_name,
        categories=["_ma_qa_metric_local_pairwise"],
        copy_categories=["_ma_qa_metric"],
        entry_id=entry_id,
        entry_details=entry_details,
        details="Predicted aligned error",
    )
def _get_associated_file(
    fle_path, data, file_format="other", file_content="other"
):
    """Generate a modelcif.associated.File object for given data.

    ``data.name`` is used as details of the file, format and content type
    are set as attributes on the created object.
    """
    assoc_file = modelcif.associated.File(
        fle_path, details=data.name, data=data
    )
    assoc_file.file_format = file_format
    assoc_file.file_content = file_content
    return assoc_file
def _get_associated_files(mdl_name, arc_files):
    """Create entry for associated files.

    All of ``arc_files`` are packaged into a single zip file named
    ``<mdl_name>.zip`` inside a repository with an empty URL.
    """
    # package all into zip file
    # NOTE: by convention MA expects zip file with same name as model-cif
    archive = modelcif.associated.ZipFile(f"{mdl_name}.zip", files=arc_files)
    return modelcif.associated.Repository("", [archive])
def _get_sw_for_qe(steps, step_name):
"""Fetch suitable SW objects from protocol steps to use in QE."""
# to maximally reduce duplicates we reuse single groups
# otherwise new group created using same SoftwareWithParameters objects
sw_groups = [step.software for step in steps if step.name == step_name]
if len(sw_groups) == 0:
return None
if len(sw_groups) == 1:
return sw_groups[0]
# each sw_group is a list (SoftwareGroup) of SoftwareWithParameters
# ...and we remove duplicates...just in case
sw_dict = {}
for sw_group in sw_groups:
sw_dict.update({hash(swp): swp for swp in sw_group})
return modelcif.SoftwareGroup(sw_dict.values())
def _package_associated_files(repo):
    """Compress associated files into single zip file and delete original.

    For every archive in ``repo.files`` a zip file is written to
    ``archive.path``. Each file of the archive is stored under its own path
    and removed from disk right after it has been added.
    """
    # zip settings tested for good speed vs compression
    for archive in repo.files:
        with zipfile.ZipFile(
            archive.path, mode="w", compression=zipfile.ZIP_BZIP2
        ) as zip_out:
            for member in archive.files:
                member_path = member.path
                zip_out.write(member_path, arcname=member_path)
                os.remove(member_path)
def _compress_cif_file(cif_file):
"""Compress cif file and delete original."""
with open(cif_file, "rb") as f_in:
with gzip.open(cif_file + ".gz", "wb") as f_out:
shutil.copyfileobj(f_in, f_out)
os.remove(cif_file)
[docs]
def store_as_modelcif(mdl_data, out_dir, mdl_fle_stem, compress):
    """Assemble model data into a ModelCIF file and write it to disk.

    Creates a :class:`modelcif.System` from the provided data, attaches
    entities, models, QA scores, associated files, and a modelling protocol,
    then writes the result as a ModelCIF file. If there are associated files
    (PAE data and/or ``acc_files``), they are packaged into a ZIP archive,
    independent of ``compress``. Optionally gzip-compresses the mmCIF file.

    Side effects: files listed in ``acc_files`` are copied into ``out_dir``
    before packaging, the working directory is switched to ``out_dir`` while
    writing (and restored afterwards, also on errors), and, if
    ``af2_protocol_name`` is given, the ``software`` attribute of the
    AlphaFold 2 QA metric classes is assigned on the classes themselves.

    Args:
        mdl_data (dict): Dictionary with model data. Expected keys:

            - ``title`` (str): Title of the modelling system.
            - ``mdl_id`` (str): Model identifier; converted to upper case.
            - ``model_details`` (str): Free-text description of the model.
            - ``audit_authors`` (list[str]): Author names for the audit
              record.
            - ``ranked_mdls`` (list[dict]): Per-model atom data; each dict is
              passed as keyword arguments to the model class.
            - ``target_entities`` (list[dict]): Target sequence data used to
              build asymmetric units and entities.
            - ``config_data`` (dict): Configuration data; must contain the
              key ``seq_dbs`` with reference database entries for the
              protocol.
            - ``protocol`` (dict): Modelling protocol description passed to
              :func:`_get_modelcif_protocol`.
            - ``acc_files`` (dict, optional): Mapping of labels to associated
              file descriptors, each containing ``details``,
              ``destination_file_name``, ``source_file_path``,
              ``file_format``, and ``file_content``.
            - ``af2_protocol_name`` (str, optional): If present, used to
              assign software metadata to AlphaFold 2 QA metric classes.
        out_dir (:class:`str` | :class:`~pathlib.Path`): Directory to write the
            output file(s) to.
        mdl_fle_stem (str): Base name for the output file, without extension.
        compress (bool): If ``True``, the mmCIF file is gzip-compressed after
            writing.

    Returns:
        str: File name of the written mmCIF file, relative to ``out_dir``.
        Ends with ``.cif`` or ``.cif.gz`` depending on ``compress``.
    """
    # allow more variables in Pylint, not gonna fix that atm
    # pylint: disable=too-many-locals
    logger.info("generating ModelCIF objects...")
    pstart = timer()
    # create system to gather all the data
    system = modelcif.System(
        title=mdl_data["title"],
        id=mdl_data["mdl_id"].upper(),
        model_details=mdl_data["model_details"],
    )
    # create an asymmetric unit and an entity per target sequence
    # (asym_units is passed in empty and used afterwards, so the call is
    # expected to fill it in place)
    asym_units = {}
    _get_modelcif_entities(mdl_data["target_entities"], asym_units, system)
    # audit_authors
    system.authors.extend(mdl_data["audit_authors"])
    # set up the models to produce coordinates
    has_pae = False
    # accumulated time spent in add_scores(), reported separately in the log
    qa_time = 0
    model_group = modelcif.model.ModelGroup()
    for ranked_mdl in mdl_data["ranked_mdls"]:
        model = _OST2ModelCIF(
            assembly=modelcif.Assembly(asym_units.values()),
            asym=asym_units,
            **ranked_mdl,
        )
        # a single model with PAE is enough to require the PAE file entry
        if model.incl_pae:
            has_pae = True
        qa_start = timer()
        model.add_scores()
        qa_time += timer() - qa_start
        model_group.append(model)
    system.model_groups.append(model_group)
    # handle additional files
    arc_files = []  # file objects going into the zip archive
    acc_data = {}  # label -> data object, handed to the protocol below
    if has_pae:
        arc_files.append(_get_assoc_pae_file(system.id, mdl_fle_stem))
    for af_label, af_dict in mdl_data.get("acc_files", {}).items():
        # needs data (for protocol) and file (for associated files) object
        acc_data[af_label] = modelcif.data.Data(af_dict["details"])
        arc_files.append(
            _get_associated_file(
                af_dict["destination_file_name"],
                acc_data[af_label],
                af_dict["file_format"],
                af_dict["file_content"],
            )
        )
        # need to copy file for zip to work later
        shutil.copyfile(
            af_dict["source_file_path"],
            os.path.join(out_dir, af_dict["destination_file_name"]),
        )
    if arc_files:
        system.repositories.append(
            _get_associated_files(mdl_fle_stem, arc_files)
        )
    # get data and steps
    ref_dbs = mdl_data["config_data"]["seq_dbs"]
    protocol = _get_modelcif_protocol(
        mdl_data["protocol"],
        system.entities,
        model_group,
        ref_dbs,
        acc_data,
    )
    system.protocols.append(protocol)
    # set SW for QE
    # NOTE: assigned on the classes (not on instances); without
    # "af2_protocol_name" whatever was set before is left untouched
    if "af2_protocol_name" in mdl_data:
        sw4qe = _get_sw_for_qe(protocol.steps, mdl_data["af2_protocol_name"])
        for af2_class in [
            _GlobalPTM,
            GlobalIpTM,
            GlobalConfRankMultimer,
            _GlobalPLDDT,
            _LocalPLDDT,
            _LocalPairwisePAE,
        ]:
            af2_class.software = sw4qe
    logger.info(" (%.2fs; QA: %.2fs)", timer() - pstart, qa_time)
    # write modelcif System to file
    logger.info("write to disk...")
    pstart = timer()
    # NOTE: this will dump PAE on path provided in add_scores
    # -> hence we cheat by changing path and back while being exception-safe...
    oldpwd = Path.cwd()
    os.chdir(out_dir)
    # from here on, file names are relative to out_dir
    mdl_fle = f"{mdl_fle_stem}.cif"
    try:
        with open(mdl_fle, "w", encoding="ascii") as mmcif_fh:
            modelcif.dumper.write(mmcif_fh, [system])
        if arc_files:
            # system was created above, so the repository appended for
            # arc_files is the first (and only) one added in this function
            _package_associated_files(system.repositories[0])
        if compress:
            _compress_cif_file(mdl_fle)
            mdl_fle += ".gz"
    finally:
        os.chdir(oldpwd)
    logger.info(" (%.2fs)", timer() - pstart)
    return mdl_fle
global_ref_dbs = {}


def _get_ref_db_object(name, url, version=None, release_date=None):
    """Return the modelcif.ReferenceDatabase for the given settings.

    Objects are cached in ``global_ref_dbs`` so identical settings always
    yield the very same object; this is needed to remove duplicates in
    ModelCIF.
    """
    cache_key = (name, url, version, release_date)
    ref_db = global_ref_dbs.get(cache_key)
    if ref_db is None:
        # first request for this combination: create and remember it
        ref_db = modelcif.ReferenceDatabase(name, url, version, release_date)
        global_ref_dbs[cache_key] = ref_db
    return ref_db
[docs]
def get_af2_sequence_dbs(config_data):
    """Get AF2 sequence databases and store them in ``config_data``.

    Builds a list of :class:`modelcif.ReferenceDatabase` objects based on
    the AlphaFold 2 configuration and writes it to ``config_data["seq_dbs"]``.
    The selection depends on the database preset, AF2 version, and whether
    multimer mode or templates are used.

    Args:
        config_data (dict): AF2 configuration data, as returned by
            :func:`get_af2_config`. Relevant keys:

            - ``af_version`` (str): AlphaFold 2 version string; determines
              which MGnify and UniRef variants are added. Compared
              numerically component by component (``"2.10.0"`` is newer
              than ``"2.3.0"``); a non-numeric suffix is ignored.
            - ``use_small_bfd`` (bool): If ``True``, uses Reduced BFD instead
              of full BFD.
            - ``use_multimer`` (bool): If ``True``, adds TrEMBL, Swiss-Prot,
              and PDB seqres databases.
            - ``use_templates`` (bool): If ``True``, adds a PDB sequence
              database (PDB seqres for multimer, PDB70 for monomer).
            - ``up_version`` (str or None): UniProt release version, passed
              to the ``version`` attribute of UniRef90, TrEMBL, and
              Swiss-Prot database objects.
            - ``up_rel_date`` (datetime.date or None): UniProt release date,
              passed to the ``release_date`` attribute of UniRef90, TrEMBL,
              and Swiss-Prot database objects.
            - ``pdb_rel_date`` (datetime.date or None): PDB release date,
              passed to the ``release_date`` attribute of the PDB seqres
              database object.

    Returns:
        None: Results are written to ``config_data["seq_dbs"]`` as a list of
        :class:`modelcif.ReferenceDatabase` objects.
    """

    def _version_key(version):
        """Leading dotted-integer part of ``version`` as a tuple of ints."""
        key = []
        for field in version.split("."):
            num_len = len(field) - len(field.lstrip("0123456789"))
            if num_len == 0:
                break
            key.append(int(field[:num_len]))
            if num_len < len(field):
                # non-numeric suffix (e.g. "+galaxy1") ends the version
                break
        return tuple(key)

    up_version = config_data["up_version"]
    up_rel_date = config_data["up_rel_date"]
    af2_storage = "https://storage.googleapis.com/alphafold-databases/"
    uniprot_ftp = (
        "ftp://ftp.ebi.ac.uk/pub/databases/uniprot/current_release/"
        "knowledgebase/complete/"
    )
    # fill list of DBs
    seq_dbs = []
    if config_data["use_small_bfd"]:
        seq_dbs.append(
            _get_ref_db_object(
                "Reduced BFD",
                af2_storage
                + "reduced_dbs/bfd-first_non_consensus_sequences.fasta.gz",
            )
        )
    else:
        seq_dbs.append(
            _get_ref_db_object(
                "BFD",
                af2_storage
                + "casp14_versions/"
                + "bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt.tar"
                + ".gz",
                version="6a634dc6eb105c2e9b4cba7bbae93412",
            )
        )
    # Compare versions numerically; plain string comparison would sort
    # e.g. "2.10.0" before "2.3.0".
    if _version_key(config_data["af_version"]) < (2, 3, 0):
        seq_dbs.append(
            _get_ref_db_object(
                "MGnify",
                af2_storage + "casp14_versions/mgy_clusters_2018_12.fa.gz",
                version="2018_12",
                release_date=datetime.date(2018, 12, 6),
            )
        )
        seq_dbs.append(
            _get_ref_db_object(
                "Uniclust30",
                af2_storage
                + "casp14_versions/uniclust30_2018_08_hhsuite.tar.gz",
                version="2018_08",
                release_date=None,
            )
        )
    else:
        # Don't add a linebreak to URLs, allow long line in Pylint
        # pylint: disable=line-too-long
        # NOTE: release date according to https://ftp.ebi.ac.uk/pub/databases/metagenomics/peptide_database/2022_05/
        # pylint: enable=line-too-long
        seq_dbs.append(
            _get_ref_db_object(
                "MGnify",
                af2_storage + "v2.3/mgy_clusters_2022_05.fa.gz",
                version="2022_05",
                release_date=datetime.date(2022, 5, 6),
            )
        )
        seq_dbs.append(
            _get_ref_db_object(
                "UniRef30",
                af2_storage + "v2.3/UniRef30_2021_03.tar.gz",
                version="2021_03",
                release_date=None,
            )
        )
    if config_data["use_multimer"]:
        seq_dbs.append(
            _get_ref_db_object(
                "TrEMBL",
                uniprot_ftp + "uniprot_trembl.fasta.gz",
                version=up_version,
                release_date=up_rel_date,
            )
        )
        seq_dbs.append(
            _get_ref_db_object(
                "Swiss-Prot",
                uniprot_ftp + "uniprot_sprot.fasta.gz",
                version=up_version,
                release_date=up_rel_date,
            )
        )
    # UniRef90 is added for monomer and multimer runs alike
    seq_dbs.append(
        _get_ref_db_object(
            "UniRef90",
            "ftp://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref90/"
            + "uniref90.fasta.gz",
            version=up_version,
            release_date=up_rel_date,
        )
    )
    if config_data["use_templates"]:
        if config_data["use_multimer"]:
            # uses whatever is latest set of PDB sequences
            # see AF2 scripts/download_pdb_seqres.sh
            seq_dbs.append(
                _get_ref_db_object(
                    "PDB seqres",
                    "https://files.wwpdb.org/pub/pdb/derived_data/"
                    + "pdb_seqres.txt",
                    release_date=config_data["pdb_rel_date"],
                )
            )
        else:
            # fixed version used in AF2 scripts/download_pdb70.sh
            seq_dbs.append(
                _get_ref_db_object(
                    "PDB70",
                    "http://wwwuser.gwdg.de/~compbiol/data/hhsuite/databases/"
                    + "hhsuite_dbs/old-releases/pdb70_from_mmcif_200401.tar.gz",
                    release_date=datetime.date(2020, 4, 1),
                )
            )
    config_data["seq_dbs"] = seq_dbs
[docs]
def get_af2_config(
    af_version,
    af_params=None,
    custom_ranking=None,
    up_version=None,
    up_rel_date=None,
    pdb_rel_date=None,
):
    """Get configuration data for an AlphaFold 2 modelling run.

    Derives modelling settings from the provided AlphaFold 2 version and
    parameters, builds a human-readable description of the run, and returns
    a configuration dictionary for use by downstream functions.

    Version-dependent defaults are chosen by comparing ``af_version``
    numerically component by component (``"2.10.0"`` is newer than
    ``"2.3.0"``); a non-numeric suffix is ignored.

    Args:
        af_version (str): AlphaFold 2 version string (e.g. ``"2.3.2"``).
        af_params (dict, optional): Non-default AlphaFold 2 parameters.
            Recognised keys include ``model_preset``, ``db_preset``,
            ``num_multimer_predictions_per_model``, ``models_to_relax``,
            ``run_relax``, ``max_template_date``, and ``num_ensemble``.
            Defaults to an empty dict if not provided. The dict is stored
            in the result as is (not copied).
        custom_ranking (str, optional): Custom model ranking expression. If
            not provided, defaults to ``"pLDDT"`` for monomer runs and
            ``"ipTM*0.8+pTM*0.2"`` for multimer runs.
        up_version (str, optional): UniProt release in ``"YYYY_MM"`` format
            current at the time of AF2 installation. (see
            https://www.uniprot.org/release-notes)
        up_rel_date (datetime.date, optional): Release date corresponding to
            ``up_version``.
        pdb_rel_date (datetime.date, optional): PDB release date current at
            the time of AF2 installation. Relevant for multimer runs using
            templates.

    Returns:
        dict: Configuration data for downstream functions. Keys:

        - ``af_params`` (dict): Parameters as passed (or empty dict).
        - ``af_version`` (str): AlphaFold 2 version string as passed.
        - ``description`` (str): Human-readable run description.
        - ``use_templates`` (bool): Whether templates were used.
        - ``use_small_bfd`` (bool): Whether the reduced BFD database
          setting was used.
        - ``use_multimer`` (bool): Whether multimer mode was used.
        - ``up_version`` (str or None): UniProt release as passed.
        - ``up_rel_date`` (datetime.date or None): UniProt release date
          as passed.
        - ``pdb_rel_date`` (datetime.date or None): PDB release date
          as passed.
        - ``seq_dbs`` (list[modelcif.ReferenceDatabase]): Sequence DB
          objects.

    Raises:
        AssertionError: If ``af_params`` contains both ``models_to_relax``
            and ``run_relax``.
    """
    # Disable some Pylint warnings, not going to be fixed atm
    # pylint: disable=too-many-arguments,too-many-positional-arguments
    # pylint: disable=too-many-locals,too-many-branches,too-many-statements

    def _version_key(version):
        """Leading dotted-integer part of ``version`` as a tuple of ints."""
        key = []
        for field in version.split("."):
            num_len = len(field) - len(field.lstrip("0123456789"))
            if num_len == 0:
                break
            key.append(int(field[:num_len]))
            if num_len < len(field):
                # non-numeric suffix (e.g. "+galaxy1") ends the version
                break
        return tuple(key)

    if af_params is None:
        af_params = {}
    # Compare versions numerically; plain string comparison would sort
    # e.g. "2.10.0" before "2.3.0".
    version_key = _version_key(af_version)
    # get defaults
    model_preset = af_params.get("model_preset", "monomer")
    db_preset = af_params.get("db_preset", "full_dbs")
    use_multimer = model_preset == "multimer"
    # 5 models unless multimer with extra flag
    if use_multimer:
        if "num_multimer_predictions_per_model" in af_params:
            num_seeds = af_params["num_multimer_predictions_per_model"]
        else:
            num_seeds = 5 if version_key >= (2, 2, 0) else 1
    else:
        num_seeds = 1
    num_models = num_seeds * 5
    # default relax setting changed over time; translate to latest
    if "models_to_relax" in af_params:
        models_to_relax = af_params["models_to_relax"]
        assert (
            "run_relax" not in af_params
        ), "af_params must not contain both 'models_to_relax' and 'run_relax'"
    elif "run_relax" in af_params:
        models_to_relax = "all" if af_params["run_relax"] else "none"
    else:
        models_to_relax = "all" if version_key < (2, 3, 2) else "best"
    # templates turned off if max_template_date older than oldest PDB
    # (string comparison; relies on the date being given as YYYY-MM-DD)
    use_templates = (
        "max_template_date" not in af_params
        or af_params["max_template_date"] >= "1976-05-19"
    )
    # build description text
    description = (
        f"Model generated using AlphaFold (v{af_version}) "
        f"producing {num_models} {model_preset} models "
    )
    if num_seeds > 1:
        description += f"({num_seeds} random seeds per parameter set) "
    if use_multimer and version_key >= (2, 3, 0):
        description += "with up to 20 recycles "
    else:
        description += "with 3 recycles "
    if "num_ensemble" in af_params:
        description += f"and {af_params['num_ensemble']} ensemble "
    description += "each, "
    if models_to_relax == "all":
        description += "with AMBER relaxation, "
    elif models_to_relax == "best":
        description += "with AMBER relaxation on best model, "
    else:
        description += "without model relaxation, "
    if use_templates:
        if "max_template_date" in af_params:
            mtd_str = f" up to date {af_params['max_template_date']}"
        else:
            mtd_str = ""
        description += f"using templates{mtd_str}, "
    else:
        description += "without templates, "
    if custom_ranking:
        rank_str = custom_ranking
    elif use_multimer:
        rank_str = "ipTM*0.8+pTM*0.2"
    else:
        rank_str = "pLDDT"
    msa_str = "MSAs" if use_multimer else "an MSA"
    description += (
        f"ranked by {rank_str}, starting from {msa_str} with "
        f"{db_preset} setting."
    )
    config_data = {
        "af_params": af_params,
        "af_version": af_version,
        "description": description,
        "use_templates": use_templates,
        "use_small_bfd": db_preset == "reduced_dbs",
        "use_multimer": use_multimer,
        "up_version": up_version,
        "up_rel_date": up_rel_date,
        "pdb_rel_date": pdb_rel_date,
    }
    # adds the "seq_dbs" entry to config_data
    get_af2_sequence_dbs(config_data)
    return config_data
[docs]
def get_galaxy_software(version):
    """Get Galaxy as a software dictionary for a ModelCIF file.

    Builds a dictionary suitable for creating a :class:`modelcif.Software`
    object, with citation and download URL derived from the provided version
    string.

    Args:
        version (str): Galaxy AlphaFold 2 version string in the format
            ``[AF2v]+galaxy[X]``, e.g. ``"2.3.2+galaxy1"``. It is used as is
            both for ``version`` and as the last element of the tool ID in
            ``location``.

    Returns:
        dict: Software descriptor with keys ``name``, ``classification``,
        ``description``, ``citation``, ``location``, ``type``, and
        ``version``.
    """
    return {
        "name": "Galaxy AlphaFold 2",
        "classification": "model building",
        "description": "Structure prediction",
        "citation": ihm.Citation(
            pmid="38769056",
            title="The Galaxy platform for accessible, reproducible, and "
            + "collaborative data analyses: 2024 update.",
            journal="Nucleic Acids Res",
            volume=52,
            page_range=["W83", "W94"],
            year=2024,
            authors=["Galaxy Community"],
            doi="10.1093/nar/gkae410",
        ),
        # The tool ID ends with the complete tool version, so the AF2 part
        # must follow the given version instead of being fixed to one release.
        "location": (
            "https://usegalaxy.eu/root?tool_id=toolshed.g2.bx.psu.edu/repos/"
            f"galaxy-australia/alphafold2/alphafold/{version}"
        ),
        "type": "package",
        "version": version,
    }
[docs]
def get_cf_db_versions(dt, num_days_unk=1):
    """Look up the ColabFold MSA server database versions for a date.

    Reports which UniRef30 version, template database, and template database
    version the ColabFold MSA server used on ``dt``. Based on
    https://github.com/sokrypton/ColabFold/wiki/MSA-Server-Database-History.

    Args:
        dt (datetime.date): Date for which to look up the database versions.
        num_days_unk (int): Number of days around a database switch date
            within which the result is considered unknown. Defaults to 1.

    Returns:
        tuple: A 3-tuple of ``(ur30_db_version, tpl_db, tpl_db_version)``,
        each a :class:`str`. Values are set to ``"UNK"`` if ``dt`` falls
        within ``num_days_unk`` days of a switch date, if the template
        database version is unknown, or if no matching date range is
        found.
    """
    unknown = ("UNK", "UNK", "UNK")
    # Server history, newest switch first: the date of the switch and the
    # (ur30_db_version, tpl_db, tpl_db_version) in use from then on.
    db_history = (
        (datetime.date(2023, 7, 31), ("2023_02", "PDB100", "230517")),
        (datetime.date(2023, 7, 27), ("2022_02", "PDB70", "220313")),
        (datetime.date(2023, 6, 12), ("2023_02", "PDB100", "230517")),
        (datetime.date(2022, 7, 13), ("2022_02", "PDB70", "220313")),
        (datetime.date(2021, 1, 1), ("2021_03", "PDB70", "UNK")),
    )
    for switched_on, db_versions in db_history:
        days_since = (dt - switched_on).days
        # too close to a switch (before or after) to tell which set was live
        if -num_days_unk <= days_since <= num_days_unk:
            return unknown
        # first (i.e. newest) switch clearly before dt wins
        if days_since > num_days_unk:
            return db_versions
    # dt predates the whole known history
    return unknown
# LocalWords: LocalColabFold Args str bool ColabFold config func msa num ur
# LocalWords: localcolabfold UniRef tpl ValueError