Source code for modelarchive.modelcif.fix_af2

"""ModelCIF files generated by AlphaFold 2 deviate from the official ModelCIF
definition dictionary in specific cases. Here are functions to fix this.
"""

# Copyright (c) 2026, SIB - Swiss Institute of Bioinformatics and
#                     Biozentrum - University of Basel
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# Won't shorten the module atm, allow in Pylint
# pylint: disable=too-many-lines

from pathlib import Path
from timeit import default_timer as timer
import datetime
import gzip
import logging
import os
import shutil
import zipfile

import ihm.citations
import modelcif.dumper
import modelcif.model
import modelcif.protocol
import modelcif.qa_metric
import modelcif.associated
import numpy as np

logger = logging.getLogger(__name__)


# These classes follow python-modelcif, allow too few methods in Pylint.
# pylint: disable=too-few-public-methods


class _LPeptideAlphabetWithXO(ihm.LPeptideAlphabet):
    """L-peptide alphabet extended by 'X' (unknown residue) and 'O'
    (pyrrolysine, a non-default amino acid; 'U' is already available)."""

    # The additional entry is modelled after the LPeptideAlphabet definition
    # in https://python-ihm.readthedocs.io/en/latest/_modules/ihm.html
    # with the values for 'O' taken from https://files.rcsb.org/view/1NTH.cif.

    def __init__(self):
        """Set up the default alphabet and register the two extra codes."""
        super().__init__()
        unknown_residue = self._comps["UNK"]
        pyrrolysine = ihm.LPeptideChemComp(
            "PYL", "O", "O", "PYRROLYSINE", "C12 H21 N3 O3"
        )
        # 'X' is an alias for the existing unknown-residue component
        self._comps["X"] = unknown_residue
        self._comps["O"] = pyrrolysine


class _LocalPLDDT(modelcif.qa_metric.Local, modelcif.qa_metric.PLDDT):
    """Predicted accuracy according to the CA-only lDDT in [0,100]"""

    # NOTE(review): python-modelcif appears to use the class docstring above
    # as the metric description written to the file - confirm before
    # rewording it.

    # name assigned to this metric
    name = "pLDDT"
    # no software object is attached at class level
    software = None


class _LocalPairwisePAE(
    modelcif.qa_metric.LocalPairwise, modelcif.qa_metric.PAE
):
    """Predicted aligned error (in Angstroms)"""

    # NOTE(review): python-modelcif appears to use the class docstring above
    # as the metric description written to the file - confirm before
    # rewording it.

    # name assigned to this metric
    name = "PAE"
    # no software object is attached at class level
    software = None


class _GlobalPLDDT(modelcif.qa_metric.Global, modelcif.qa_metric.PLDDT):
    """Predicted accuracy according to the CA-only lDDT in [0,100]"""

    # NOTE(review): python-modelcif appears to use the class docstring above
    # as the metric description written to the file - confirm before
    # rewording it.

    # name assigned to this metric (same label as the per-residue variant)
    name = "pLDDT"
    # no software object is attached at class level
    software = None


class _GlobalPTM(modelcif.qa_metric.Global, modelcif.qa_metric.PTM):
    """Predicted accuracy according to the TM-score score in [0,1]"""

    # NOTE(review): python-modelcif appears to use the class docstring above
    # as the metric description written to the file - confirm before
    # rewording it.

    # name assigned to this metric
    name = "pTM"
    # no software object is attached at class level
    software = None


class _GlobalIpTM(modelcif.qa_metric.Global, modelcif.qa_metric.IpTM):
    """Predicted protein-protein interface score based on TM-score in [0,1]"""

    # NOTE(review): python-modelcif appears to use the class docstring above
    # as the metric description written to the file - confirm before
    # rewording it.

    # name assigned to this metric
    name = "ipTM"
    # no software object is attached at class level
    software = None


class _GlobalConfRankMultimer(
    modelcif.qa_metric.Global, modelcif.qa_metric.NormalizedScore
):
    """Default ranking score used by AlphaFold-Multimer"""

    # NOTE(review): python-modelcif appears to use the class docstring above
    # as the metric description written to the file - confirm before
    # rewording it.

    # name assigned to this metric; it spells out the weighting of ipTM and
    # pTM in the combined score
    name = "ranking-confidence (ipTM*0.8+pTM*0.2)"
    # no software object is attached at class level
    software = None


# pylint: enable=too-few-public-methods


def get_cf_sequence_dbs(config_data):
    """Resolve the ColabFold databases of a run and store them in place.

    The identifiers in ``config_data["seq_db_keys"]`` (plus the optional
    template database in ``config_data["tpl_db"]``) are translated into keys
    of a hardcoded table of known ColabFold databases. The matching
    :class:`modelcif.ReferenceDatabase` objects are stored, in request order
    with the template database last, as ``config_data["seq_dbs"]``.

    Args:
        config_data (dict): Configuration data dictionary. Keys read:
            ``seq_db_keys`` (list of :class:`str`) - database identifiers,
            where ``"UniRef"`` is combined with ``ur30_db_version`` into the
            table key; ``ur30_db_version`` (:class:`str` or ``None``) - only
            read if ``"UniRef"`` is requested; ``tpl_db`` (:class:`str` or
            ``None``) - optional template database identifier;
            ``tpl_db_version`` (:class:`str` or ``None``) - only read if
            ``tpl_db`` is set. Key written: ``seq_dbs``.

    Returns:
        None

    Raises:
        ValueError: If ``"UniRef"`` is requested but ``ur30_db_version`` is
            ``None``.
        ValueError: If ``tpl_db`` is set but ``tpl_db_version`` is ``None``.
        ValueError: If a resolved key is missing from the hardcoded table.
    """
    # Hardcoded table of the DBs known to be used by ColabFold
    # -> see also notes in get_cf_config
    known_dbs = {
        "UniRef_2021_03": modelcif.ReferenceDatabase(
            "UniRef30",
            "https://wwwuser.gwdg.de/~compbiol/colabfold/uniref30_2103.tar.gz",
            version="2021_03",
        ),
        "UniRef_2022_02": modelcif.ReferenceDatabase(
            "UniRef30",
            "https://wwwuser.gwdg.de/~compbiol/colabfold/uniref30_2202.tar.gz",
            version="2022_02",
        ),
        "UniRef_2023_02": modelcif.ReferenceDatabase(
            "UniRef30",
            "https://wwwuser.gwdg.de/~compbiol/colabfold/uniref30_2302.tar.gz",
            version="2023_02",
        ),
        "Environmental": modelcif.ReferenceDatabase(
            "ColabFold DB",
            "https://wwwuser.gwdg.de/~compbiol/colabfold/"
            + "colabfold_envdb_202108.tar.gz",
            version="2021_08",
        ),
        "PDB100_230517": modelcif.ReferenceDatabase(
            "PDB100",
            "https://wwwuser.gwdg.de/~compbiol/data/hhsuite/databases/"
            + "hhsuite_dbs/pdb100_foldseek_230517.tar.gz",
            release_date=datetime.date(2023, 5, 17),
        ),
        "PDB70_211027": modelcif.ReferenceDatabase(
            "PDB70",
            "https://wwwuser.gwdg.de/~compbiol/data/hhsuite/databases/"
            + "hhsuite_dbs/pdb70_from_mmcif_211027.tar.gz",
            release_date=datetime.date(2021, 10, 27),
        ),
        "PDB70_211117": modelcif.ReferenceDatabase(
            "PDB70",
            "https://wwwuser.gwdg.de/~compbiol/data/hhsuite/databases/"
            + "hhsuite_dbs/pdb70_from_mmcif_211117.tar.gz",
            release_date=datetime.date(2021, 11, 17),
        ),
        "PDB70_220313": modelcif.ReferenceDatabase(
            "PDB70",
            "https://wwwuser.gwdg.de/~compbiol/data/hhsuite/databases/"
            + "hhsuite_dbs/pdb70_from_mmcif_220313.tar.gz",
            release_date=datetime.date(2022, 3, 13),
        ),
    }

    # translate the requested identifiers into table keys
    wanted_keys = []
    for db_key in config_data["seq_db_keys"]:
        if db_key != "UniRef":
            wanted_keys.append(db_key)
            continue
        # UniRef entries are versioned, the version is part of the key
        ur30_version = config_data["ur30_db_version"]
        if ur30_version is None:
            raise ValueError("Cannot use UniRef without version")
        wanted_keys.append(f"UniRef_{ur30_version}")

    # the template DB (if any) goes last and is versioned as well
    tpl_db = config_data["tpl_db"]
    if tpl_db is not None:
        tpl_version = config_data["tpl_db_version"]
        if tpl_version is None:
            raise ValueError("Cannot have tpl DB without version")
        wanted_keys.append(f"{tpl_db}_{tpl_version}")

    # look everything up, unknown keys are an error
    resolved_dbs = []
    for wanted_key in wanted_keys:
        db_entry = known_dbs.get(wanted_key)
        if db_entry is None:
            raise ValueError(f"Unknown seq. DB {wanted_key}")
        resolved_dbs.append(db_entry)

    config_data["seq_dbs"] = resolved_dbs
def get_localcolabfold_software(version=None):
    """Describe LocalColabFold as a :class:`dict` for a software object.

    Args:
        version (str): Version of LocalColabFold. Should only be ``None`` if
            the version is genuinely unavailable.

    Returns:
        dict: Software metadata with the keys ``name``, ``classification``,
        ``description``, ``citation`` (always ``None`` here), ``location``,
        ``type`` and ``version``.
    """
    software = dict(
        name="LocalColabFold",
        classification="model building",
        description="Structure prediction",
        citation=None,
        location="https://github.com/YoshitakaMo/localcolabfold",
        type="package",
    )
    # the version is the only caller-supplied field
    software["version"] = version
    return software
def get_colabfold_software(version=None):
    """Describe ColabFold as a :class:`dict` for a software object.

    Args:
        version (str): Version of ColabFold. Should only be ``None`` if the
            version is genuinely unavailable.

    Returns:
        dict: Software metadata with the keys ``name``, ``classification``,
        ``description``, ``citation`` (``ihm.citations.colabfold``),
        ``location``, ``type`` and ``version``.
    """
    software = dict(
        name="ColabFold",
        classification="model building",
        description="Structure prediction",
        citation=ihm.citations.colabfold,
        location="https://github.com/sokrypton/ColabFold",
        type="package",
    )
    # the version is the only caller-supplied field
    software["version"] = version
    return software
def get_mmseqs2_software(version=None):
    """Describe MMseqs2 as a :class:`dict` for a software object.

    Args:
        version (str): Version of MMseqs2. Should only be ``None`` if the
            version is genuinely unavailable.

    Returns:
        dict: Software metadata with the keys ``name``, ``classification``,
        ``description``, ``citation`` (``ihm.citations.mmseqs2``),
        ``location``, ``type`` and ``version``.
    """
    software = dict(
        name="MMseqs2",
        classification="data collection",
        description="Many-against-Many sequence searching",
        citation=ihm.citations.mmseqs2,
        location="https://github.com/soedinglab/mmseqs2",
        type="package",
    )
    # the version is the only caller-supplied field
    software["version"] = version
    return software
def get_af2_software(version=None, is_multimer=False):
    """Describe AlphaFold 2 as a :class:`dict` for a software object.

    Args:
        version (str): Version of AlphaFold 2. Should only be ``None`` if the
            version is genuinely unavailable.
        is_multimer (bool): If ``True``, describe AlphaFold-Multimer instead
            of AlphaFold 2.

    Returns:
        dict: Software metadata with the keys ``name``, ``classification``,
        ``description``, ``citation``, ``location``, ``type`` and
        ``version``. Only ``name`` and ``citation`` depend on
        ``is_multimer``.
    """
    if is_multimer:
        sw_name = "AlphaFold-Multimer"
        # the citation for the multimer preprint is assembled by hand here
        sw_citation = ihm.Citation(
            pmid=None,
            title="Protein complex prediction with AlphaFold-Multimer.",
            journal="bioRxiv",
            volume=None,
            page_range=None,
            year=2021,
            authors=[
                "Evans, R.",
                "O'Neill, M.",
                "Pritzel, A.",
                "Antropova, N.",
                "Senior, A.",
                "Green, T.",
                "Zidek, A.",
                "Bates, R.",
                "Blackwell, S.",
                "Yim, J.",
                "Ronneberger, O.",
                "Bodenstein, S.",
                "Zielinski, M.",
                "Bridgland, A.",
                "Potapenko, A.",
                "Cowie, A.",
                "Tunyasuvunakool, K.",
                "Jain, R.",
                "Clancy, E.",
                "Kohli, P.",
                "Jumper, J.",
                "Hassabis, D.",
            ],
            doi="10.1101/2021.10.04.463034",
        )
    else:
        sw_name = "AlphaFold"
        sw_citation = ihm.citations.alphafold2

    # everything but name and citation is shared between the two flavours
    return {
        "name": sw_name,
        "classification": "model building",
        "description": "Structure prediction",
        "citation": sw_citation,
        "location": "https://github.com/deepmind/alphafold",
        "type": "package",
        "version": version,
    }
def get_cf_sw_plus_params(config_data, use_localcolabfold=False):
    """Collect software and parameters for a ColabFold protocol step.

    The order of the result is: LocalColabFold (optional), ColabFold,
    MMseqs2 (only if ``config_data["use_mmseqs"]`` is set) and AlphaFold 2 /
    AlphaFold-Multimer. Only ColabFold gets a version and parameters; all
    other entries come with an empty parameter dictionary.

    Args:
        config_data (dict): ColabFold configuration data as returned by
            :func:`get_cf_config`. Keys read: ``cf_version``, ``cf_params``,
            ``use_mmseqs`` and ``use_multimer``.
        use_localcolabfold (bool): If ``True``, prepend LocalColabFold to the
            list of software entries.

    Returns:
        list[tuple[dict, dict]]: A list of ``(software, parameters)`` tuples
        suitable for use in a protocol.
    """
    entries = [(get_localcolabfold_software(), {})] if use_localcolabfold else []

    colabfold = get_colabfold_software(config_data["cf_version"])
    entries.append((colabfold, config_data["cf_params"]))

    if config_data["use_mmseqs"]:
        entries.append((get_mmseqs2_software(), {}))

    alphafold = get_af2_software(is_multimer=config_data["use_multimer"])
    entries.append((alphafold, {}))
    return entries
def get_cf_config(
    cf_config, ur30_db_version=None, tpl_db=None, tpl_db_version=None
):
    """Process a ColabFold configuration into a standardised data dictionary.

    The input dictionary is left untouched; a cleaned-up copy (irrelevant
    fields dropped, v1.5 defaults filled in, null entries removed) is
    returned as ``cf_params``.

    Args:
        cf_config (dict): Raw ColabFold configuration data, typically read
            from a ColabFold configuration file. Must contain the keys
            ``version``, ``msa_mode``, ``model_type``, ``num_recycles``,
            ``num_models``, ``use_templates``, and ``rank_by``. Optional keys
            include ``commit``, ``pair_mode`` (required for multimer models
            using an MSA), ``recycle_early_stop_tolerance``,
            ``stop_at_score``, ``num_seeds``, ``use_amber``, and
            ``num_relax``.
        ur30_db_version (str, optional): Version of the UniRef30 database
            used. Should only be ``None`` if the database was not used.
        tpl_db (str, optional): Template database used. Accepted values are
            ``"PDB70"``, ``"PDB100"``, or ``None`` if no template database
            was used. Ignored (reset to ``None``) if ``use_templates`` is
            false.
        tpl_db_version (str, optional): Version of the template database
            used. Should only be ``None`` if the database was not used.

    Returns:
        dict: A dictionary with the keys ``cf_params``, ``cf_version``,
        ``seq_db_keys``, ``use_mmseqs``, ``use_msa``, ``ur30_db_version``,
        ``tpl_db``, ``tpl_db_version``, ``use_multimer``,
        ``multimer_version``, ``description`` and ``seq_dbs`` (the latter
        added by :func:`get_cf_sequence_dbs`).

    Raises:
        ValueError: If ``msa_mode`` is not one of the known values
            (including a null value).
        ValueError: If ``model_type`` is not one of the known values
            (including a null value).
        ValueError: If ``rank_by`` is not one of the known values
            (including a null value).
        ValueError: If ``pair_mode`` is missing or unknown for a multimer
            model that uses an MSA.
        ValueError: Anything raised by :func:`get_cf_sequence_dbs`.
    """
    # Not going to reduce no. of variables, branches or statements at this
    # point, allow in Pylint
    # pylint: disable=too-many-locals,too-many-branches,too-many-statements

    # keep version indep. of params (and add commit since versions are meh)
    cf_version = cf_config["version"]
    if "commit" in cf_config and cf_config["commit"] is not None:
        cf_version += f" ({cf_config['commit'][:7]})"

    # drop fields which are not relevant for model building
    # (work on a copy so the caller's dictionary stays as it is)
    cf_config = cf_config.copy()
    for key in ["num_queries", "commit", "version", "user_agent"]:
        cf_config.pop(key, None)

    # NOTE: following code from
    # https://github.com/sokrypton/ColabFold/blob/main/colabfold/batch.py to
    # understand config
    # -> see also https://github.com/sokrypton/ColabFold/wiki/v1.5.0
    # deal with old names (some settings changed name in v1.5)
    # -> code taken almost verbatim from https://github.com/sokrypton/ColabFold
    old_names = {
        "MMseqs2 (UniRef+Environmental)": "mmseqs2_uniref_env",
        "MMseqs2 (UniRef only)": "mmseqs2_uniref",
        "unpaired+paired": "unpaired_paired",
        "AlphaFold2-multimer-v1": "alphafold2_multimer_v1",
        "AlphaFold2-multimer-v2": "alphafold2_multimer_v2",
        "AlphaFold2-multimer-v3": "alphafold2_multimer_v3",
        "AlphaFold2-ptm": "alphafold2_ptm",
        "AlphaFold2": "alphafold2",
    }
    msa_mode = old_names.get(cf_config["msa_mode"], cf_config["msa_mode"])
    if "pair_mode" in cf_config:
        pair_mode = old_names.get(
            cf_config["pair_mode"], cf_config["pair_mode"]
        )
    else:
        pair_mode = None
    model_type = old_names.get(cf_config["model_type"], cf_config["model_type"])

    # fix v1.5 defaults for num_recycles and recycle_early_stop_tolerance
    # -> def. (set as "null" in config):
    #    - num_recycles == 20 if alphafold2_multimer_v3 else 3
    #    - recycle_early_stop_tolerance == 0.5 if multimer else 0.0
    # -> valid from 1.5.0 until 1.5.5 (and probably later)
    # -> defined in alphafold/model/config.py of steineggerlab/alphafold repo
    # A null model_type must not blow up the substring test; it is rejected
    # with a proper ValueError further down.
    is_multimer_type = isinstance(model_type, str) and "multimer" in model_type
    if "num_recycles" in cf_config and cf_config["num_recycles"] is None:
        if is_multimer_type and model_type not in [
            "alphafold2_multimer_v1",
            "alphafold2_multimer_v2",
        ]:
            cf_config["num_recycles"] = 20
        else:
            cf_config["num_recycles"] = 3
    if (
        "recycle_early_stop_tolerance" in cf_config
        and cf_config["recycle_early_stop_tolerance"] is None
    ):
        cf_config["recycle_early_stop_tolerance"] = (
            0.5 if is_multimer_type else 0.0
        )

    # remove null config entries (ASSUME: None = use default)
    # -> the unfiltered settings are kept for validation and error messages,
    #    otherwise a null value turns into a KeyError instead of the
    #    documented ValueError
    raw_config = cf_config
    cf_config = {k: v for k, v in raw_config.items() if v is not None}

    # fetch relevant data
    # -> MSA mode
    if msa_mode == "mmseqs2_uniref_env":
        seq_dbs = ["UniRef", "Environmental"]
        use_mmseqs = True
        use_msa = True
    elif msa_mode == "mmseqs2_uniref":
        seq_dbs = ["UniRef"]
        use_mmseqs = True
        use_msa = True
    elif msa_mode == "single_sequence":
        seq_dbs = []
        use_mmseqs = False
        use_msa = False
    elif msa_mode == "custom":
        seq_dbs = []
        use_mmseqs = False
        use_msa = True
    else:
        raise ValueError(f"Unknown msa_mode {raw_config['msa_mode']}")

    # -> model type
    if model_type == "alphafold2_multimer_v1":
        # AF-Multimer as introduced in AlphaFold v2.1.0
        use_multimer = True
        multimer_version = 1
    elif model_type == "alphafold2_multimer_v2":
        # AF-Multimer as introduced in AlphaFold v2.2.0
        use_multimer = True
        multimer_version = 2
    elif model_type == "alphafold2_multimer_v3":
        # AF-Multimer as introduced in AlphaFold v2.3.0
        use_multimer = True
        multimer_version = 3
    elif model_type == "alphafold2_ptm":
        use_multimer = False
        multimer_version = None
    else:
        raise ValueError(f"Unknown model_type {raw_config['model_type']}")

    # write modeling description
    mdl_description = f"Model generated using ColabFold v{cf_version}"
    if use_multimer:
        mdl_description += f" with AlphaFold-Multimer (v{multimer_version})"
    else:
        mdl_description += " with AlphaFold"

    # early stopping feature of ColabFold
    upto_mdl = ""
    upto_rec = ""
    if cf_config.get("stop_at_score", 100) < 100:
        upto_mdl = "up to "
        upto_rec = "up to "
    if cf_config.get("recycle_early_stop_tolerance", 0) > 0:
        upto_rec = "up to "
    if cf_config.get("num_seeds", 1) > 1:
        mdl_str = (
            f"{cf_config['num_models'] * cf_config['num_seeds']} "
            f"models ({cf_config['num_seeds']} random seeds per "
            f"parameter set)"
        )
    else:
        mdl_str = f"{cf_config['num_models']} models"
    mdl_description += (
        f" producing {upto_mdl}{mdl_str} with {upto_rec}"
        f"{cf_config['num_recycles']} recycles each"
    )

    if cf_config.get("use_amber", False) or cf_config.get("num_relax", 0) > 0:
        mdl_description += ", with AMBER relaxation"
    else:
        mdl_description += ", without model relaxation"

    if cf_config["use_templates"]:
        # tpl_db == None meant to mean that custom templates were used
        # -> no need to stress it but just visible in search DBs
        mdl_description += ", using templates"
    else:
        mdl_description += ", without templates"
        # no templates, no template DB (whatever the caller passed in)
        tpl_db = None
        tpl_db_version = None

    # read from the unfiltered settings so a null value is reported as an
    # unknown value (a missing key still raises KeyError as before)
    rank_by = raw_config["rank_by"]
    if rank_by == "plddt":
        mdl_description += ", ranked by pLDDT"
    elif rank_by == "ptmscore":
        mdl_description += ", ranked by pTM"
    elif rank_by == "multimer":
        mdl_description += ", ranked by 80*ipTM+20*pTM"
    else:
        raise ValueError(f"Unknown rank_by {rank_by}")

    if use_msa:
        mdl_description += ", starting from"
        if use_mmseqs:
            msa_type = "MSA"
        else:
            msa_type = "custom MSA"
        if use_multimer:
            if pair_mode == "unpaired_paired":
                mdl_description += f" paired and unpaired {msa_type}s"
            elif pair_mode == "paired":
                mdl_description += f" paired {msa_type}s"
            elif pair_mode == "unpaired":
                mdl_description += f" unpaired {msa_type}s"
            elif pair_mode is None:
                raise ValueError(
                    "Key 'pair_mode' required with " + "'use_multimer=True'"
                )
            else:
                raise ValueError(
                    f"Unknown pair_mode {raw_config['pair_mode']}"
                )
        elif msa_type.startswith("M"):
            # "an MSA" vs. "a custom MSA"
            mdl_description += f" an {msa_type}"
        else:
            mdl_description += f" a {msa_type}"
        if use_mmseqs:
            mdl_description += f" from MMseqs2 ({'+'.join(seq_dbs)})"
    else:
        mdl_description += " without an MSA"
    mdl_description += "."

    config_data = {
        "cf_params": cf_config,
        "cf_version": cf_version,
        "seq_db_keys": seq_dbs,
        "use_mmseqs": use_mmseqs,
        "use_msa": use_msa,
        "ur30_db_version": ur30_db_version,
        "tpl_db": tpl_db,
        "tpl_db_version": tpl_db_version,
        "use_multimer": use_multimer,
        "multimer_version": multimer_version,
        "description": mdl_description,
    }
    # adds "seq_dbs" to config_data in place
    get_cf_sequence_dbs(config_data)
    return config_data
def _get_res_num(r, use_auth=False):
    """Get residue number, optionally from PDB auth IDs.

    Args:
        r: Residue object providing ``r.number.num`` and, if ``use_auth`` is
            set, ``r.GetStringProp("pdb_auth_resnum")`` returning an integer
            string.
        use_auth (bool): If ``True``, read the author residue number from
            the ``pdb_auth_resnum`` string property.

    Returns:
        int: The residue number.
    """
    if use_auth:
        return int(r.GetStringProp("pdb_auth_resnum"))
    return r.number.num


def get_sequence(chn, use_auth=False):
    """Get the sequence of an `OpenStructure`_ chain, inserting ``'-'`` for
    gaps.

    Gaps are derived from the residue numbers: numbering is expected to
    start at 1 and every number missing before or between residues becomes
    one ``'-'``. A first residue numbered below 1 adds no leading gaps.

    Args:
        chn (ost.mol.ChainHandle or ost.mol.ChainView): `OST`_ chain to
            extract the sequence from. Any object providing the following
            interface can be used as a drop-in replacement for the `OST`_
            chain object:

            - ``chn.residues``: iterable of residue objects, each providing

              - ``number.num`` (:class:`int`): internal residue number
              - ``one_letter_code`` (:class:`str`): single-letter code
              - ``GetStringProp("pdb_auth_resnum")`` (:class:`str`): author
                residue number as an integer string, only required if
                ``use_auth=True``
        use_auth (bool): If ``True``, use PDB author residue numbers instead
            of internal residue numbers.

    Returns:
        str: One-letter code sequence with ``'-'`` characters inserted for
        gaps. Empty for a chain without residues.

    Raises:
        ValueError: If residue numbers are not strictly increasing along the
            chain (gaps cannot be derived from such a numbering).
    """
    pieces = []
    # residue number which would continue the sequence without a gap
    # (starts at 1 so gaps are added if the first residue is not at num. 1)
    expected_num = 1
    for pos, res in enumerate(chn.residues):
        res_num = _get_res_num(res, use_auth)
        if pos > 0 and res_num < expected_num:
            # counting up to res_num would never terminate
            raise ValueError(
                f"Residue numbers must be strictly increasing, got "
                f"{res_num} after {expected_num - 1}"
            )
        # a negative repeat count yields "", i.e. no gap characters
        pieces.append("-" * (res_num - expected_num))
        pieces.append(res.one_letter_code)
        expected_num = res_num + 1
    return "".join(pieces)
def assemble_modelcif_software(soft_dict, params_dict):
    """Build a :class:`modelcif.SoftwareWithParameters` from dictionaries.

    Args:
        soft_dict (dict): Software metadata as returned by functions such as
            :func:`get_colabfold_software`. Must contain the keys ``name``,
            ``classification``, ``description``, ``location``, ``type``,
            ``version``, and ``citation``.
        params_dict (dict): Software parameters; every key/value pair is
            turned into one :class:`modelcif.SoftwareParameter` (key as
            parameter name, value as parameter value).

    Returns:
        modelcif.SoftwareWithParameters: A ModelCIF software object with
        associated parameters.
    """
    # positional arguments of modelcif.Software, in call order
    positional_keys = (
        "name",
        "classification",
        "description",
        "location",
        "type",
        "version",
    )
    software = modelcif.Software(
        *[soft_dict[key] for key in positional_keys],
        citation=soft_dict["citation"],
    )

    # one SoftwareParameter per entry, keeping the dictionary's order
    parameters = [
        modelcif.SoftwareParameter(param_name, param_value)
        for param_name, param_value in params_dict.items()
    ]

    return modelcif.SoftwareWithParameters(software, parameters)
def _get_ch_name(ch, use_auth=False):
    """Get chain name from auth. IDs if reading from mmCIF files.

    Args:
        ch: Chain object providing ``name`` and, if ``use_auth`` is set,
            ``GetStringProp("pdb_auth_chain_name")``.
        use_auth (bool): If ``True``, return the ``pdb_auth_chain_name``
            string property instead of ``ch.name``.

    Returns:
        The chain name as read from the property or attribute above.
    """
    if use_auth:
        return ch.GetStringProp("pdb_auth_chain_name")
    return ch.name


class _OST2ModelCIF(modelcif.model.AbInitioModel):
    """Map an OST entity to a :class:`modelcif.model.AbInitioModel`.

    All arguments listed here are passed as keyword arguments and removed
    from ``kwargs`` before the remaining arguments are forwarded to the
    parent class.

    Args:
        assembly (modelcif.Assembly): Collection of asymmetric units for this
            model.
        asym (dict): Mapping of chain names to :class:`modelcif.AsymUnit`
            objects.
        ost_entity (ost.mol.Entity or ost.mol.EntityHandle): Loaded OST entity
            object.
        name (str, optional): Short name for this model.
        plddt_from_b_factors (bool): If ``True``, read per-residue pLDDT from
            B-factors instead of from ``scores_json``.
        scores_json (dict): Score data to store. Supported keys are ``plddt``,
            ``plddt_global``, ``ptm``, ``iptm``, ``confrank``, and ``pae``.
        extra_global_scores (list): Additional global QA score objects to
            append to ``qa_metrics``.
        incl_pae (bool): If ``True``, include PAE scores. Defaults to ``True``
            if ``"pae"`` is present in ``scores_json``.
        use_auth (bool): If ``True``, use PDB author IDs instead of internal
            IDs. Relevant when reading from mmCIF files.
        pae_digits (int): Number of decimal digits for PAE values.
    """

    # not going to fight for reducing a single attribute, allow in Pylint
    # pylint: disable=too-many-instance-attributes

    def __init__(self, *args, **kwargs):
        """Initialise a model.

        Pops the arguments specific to this class from ``kwargs``, fills in
        pLDDT from B-factors if requested, checks the lengths of the local
        scores against the residue count and then calls the parent
        initialiser with whatever is left in ``args``/``kwargs``.

        Raises:
            KeyError: If ``ost_entity`` or ``asym`` is missing in ``kwargs``.
            AssertionError: If a length/consistency check on the scores fails.
        """
        self.ost_entity = kwargs.pop("ost_entity")
        self.asym = kwargs.pop("asym")
        # optional arguments
        self.plddt_from_b_factors = kwargs.pop("plddt_from_b_factors", False)
        self.scores_json = kwargs.pop("scores_json", {})
        self.extra_global_scores = kwargs.pop("extra_global_scores", [])
        self.incl_pae = kwargs.pop("incl_pae", "pae" in self.scores_json)
        self.use_auth = kwargs.pop("use_auth", False)
        self.pae_digits = kwargs.pop("pae_digits", 3)

        # get pLDDT if needed
        # NOTE(review): self.scores_json is the very dict passed in by the
        # caller, so the entries added here are visible to the caller too.
        if self.plddt_from_b_factors:
            self.scores_json["plddt"] = []
            for res in self.ost_entity.residues:
                b_factors = [a.b_factor for a in res.atoms]
                assert len(set(b_factors)) == 1  # must all be equal!
                self.scores_json["plddt"].append(b_factors[0])
            # only derive the global value if none was provided
            if "plddt_global" not in self.scores_json:
                self.scores_json["plddt_global"] = np.mean(
                    self.scores_json["plddt"]
                )

        # check lengths for local scores (cannot deal with unmodelled gaps here)
        exp_len = self.ost_entity.residue_count
        if "plddt" in self.scores_json:
            assert len(self.scores_json["plddt"]) == exp_len
        if self.incl_pae:
            assert "pae" in self.scores_json
            # only the first row is checked for the second dimension
            assert len(self.scores_json["pae"]) == exp_len
            assert len(self.scores_json["pae"][0]) == exp_len

        super().__init__(*args, **kwargs)

    def get_atoms(self):
        """Yield atom records for writing to a ModelCIF file via :mod:`modelcif`.

        Yields:
            modelcif.model.Atom: One record per atom of ``self.ost_entity``;
            the asymmetric unit is looked up in ``self.asym`` by chain name.
        """
        # ToDo [internal]: Take B-factor out since its not a B-factor?
        # NOTE: this assumes that _get_res_num maps residue to pos. in seqres
        #       within asym
        for atm in self.ost_entity.atoms:
            yield modelcif.model.Atom(
                asym_unit=self.asym[_get_ch_name(atm.chain, self.use_auth)],
                seq_id=_get_res_num(atm.residue, self.use_auth),
                atom_id=atm.name,
                type_symbol=atm.element,
                x=atm.pos[0],
                y=atm.pos[1],
                z=atm.pos[2],
                het=atm.is_hetatom,
                biso=atm.b_factor,
                occupancy=atm.occupancy,
            )

    def add_scores(self):
        """Add QA metrics from AF2 scores.

        Appends to ``self.qa_metrics`` in this order: global scores found in
        ``self.scores_json``, ``self.extra_global_scores``, per-residue pLDDT
        (if available) and finally, if ``self.incl_pae`` is set, all pairwise
        PAE values.
        """
        # global scores
        for key, score_class in [
            ("plddt_global", _GlobalPLDDT),
            ("ptm", _GlobalPTM),
            ("iptm", _GlobalIpTM),
            ("confrank", _GlobalConfRankMultimer),
        ]:
            if key in self.scores_json:
                self.qa_metrics.append(score_class(self.scores_json[key]))
        # extra ones
        self.qa_metrics.extend(self.extra_global_scores)

        # NOTE: none of the below expected to work if we have unmodelled gaps!
        # local scores
        # PAE entries are collected separately so that they end up after all
        # local pLDDT entries in qa_metrics
        lpae = []
        # i and j are running residue indices over all chains; they index the
        # flat "plddt" list and the rows/columns of "pae"
        i = 0
        for chn_i in self.ost_entity.chains:
            ch_name_i = _get_ch_name(chn_i, self.use_auth)
            for res_i in chn_i.residues:
                # local pLDDT
                res_num_i = _get_res_num(res_i, self.use_auth)
                if "plddt" in self.scores_json:
                    self.qa_metrics.append(
                        _LocalPLDDT(
                            self.asym[ch_name_i].residue(res_num_i),
                            self.scores_json["plddt"][i],
                        )
                    )

                # pairwise alignment error
                if self.incl_pae:
                    j = 0
                    for chn_j in self.ost_entity.chains:
                        ch_name_j = _get_ch_name(chn_j, self.use_auth)
                        for res_j in chn_j.residues:
                            res_num_j = _get_res_num(res_j, self.use_auth)
                            pae_ij = self.scores_json["pae"][i][j]
                            lpae.append(
                                _LocalPairwisePAE(
                                    self.asym[ch_name_i].residue(res_num_i),
                                    self.asym[ch_name_j].residue(res_num_j),
                                    round(pae_ij, self.pae_digits),
                                )
                            )
                            j += 1

                i += 1

        if self.incl_pae:
            self.qa_metrics.extend(lpae)


def _get_modelcif_entities(target_ents, asym_units, system):
    """Create ModelCIF entities and asymmetric units.

    Args:
        target_ents: Iterable of dicts, each read for the keys ``seqres``,
            ``description``, ``source``, ``references``, ``pdb_chain_ids`` and
            ``auth_seq_id_map``.
        asym_units (dict): Filled in place; maps each chain ID to the
            :class:`modelcif.AsymUnit` created for it.
        system: Object whose ``entities`` list receives one
            :class:`modelcif.Entity` per item of ``target_ents``.
    """
    # a single alphabet instance is shared by all entities
    alphabet = _LPeptideAlphabetWithXO()
    for cif_ent in target_ents:
        # combine into ModelCIF entity
        mdlcif_ent = modelcif.Entity(
            cif_ent["seqres"],
            description=cif_ent["description"],
            alphabet=alphabet,
            source=cif_ent["source"],
            references=cif_ent["references"],
        )
        # NOTE: this assigns (potentially new) alphabetic chain names
        for pdb_chain_id in cif_ent["pdb_chain_ids"]:
            asym_units[pdb_chain_id] = modelcif.AsymUnit(
                mdlcif_ent,
                strand_id=pdb_chain_id,
                auth_seq_id_map=cif_ent["auth_seq_id_map"],
            )
        system.entities.append(mdlcif_ent)


def _get_modelcif_protocol_software(js_step):
    """Assemble software entries for a ModelCIF protocol step.

    Args:
        js_step (dict): Protocol step; ``js_step["software_plus_params"]`` is
            iterated as pairs of (software dict, parameter dict), each passed
            to :func:`assemble_modelcif_software`.

    Returns:
        modelcif.SoftwareGroup or None: Group of all software of the step,
        ``None`` if the step lists no software.
    """
    # new setup in python-modelcif (as of late 2023): params with each SW
    sw_list = []
    for sw, sw_params in js_step["software_plus_params"]:
        sw_list.append(assemble_modelcif_software(sw, sw_params))
    # group and done...
    if sw_list:
        return modelcif.SoftwareGroup(sw_list)
    return None


def _get_modelcif_protocol_data(
    data_labels, target_entities, models, ref_dbs, acc_data, dg_cache=None
):
    """Assemble data for a ModelCIF protocol step.

    Cached access to objects needed to remove duplicates in ModelCIF.

    Args:
        data_labels: Labels of the data to collect. ``"target_sequences"``,
            ``"ref_dbs"`` and ``"models"`` select the corresponding argument,
            any other label must be a key of ``acc_data``.
        target_entities: Items added for label ``"target_sequences"``.
        models: Items added for label ``"models"``.
        ref_dbs: Items added for label ``"ref_dbs"``.
        acc_data (dict): Extra data objects by label.
        dg_cache (dict, optional): Cache of already built groups, updated in
            place. Without it, a fresh group is built on every call.

    Returns:
        modelcif.data.DataGroup: Group for ``data_labels`` (possibly cached).

    Raises:
        RuntimeError: If a label is neither a fixed label nor in ``acc_data``.
    """
    # No idea how to reduce arguments here, allow in Pylint
    # pylint: disable=too-many-positional-arguments,too-many-arguments
    if dg_cache is None:
        dg_cache = {}
    # sorted: the same set of labels hits the same cache entry in any order
    cache_key = tuple(sorted(data_labels))
    if cache_key in dg_cache:
        return dg_cache[cache_key]
    data = modelcif.data.DataGroup()
    for data_label in data_labels:
        if data_label == "target_sequences":
            data.extend(target_entities)
        elif data_label == "ref_dbs":
            data.extend(ref_dbs)
        elif data_label == "models":
            data.extend(models)
        elif data_label in acc_data:
            data.append(acc_data[data_label])
        else:
            raise RuntimeError(f"Unknown protocol data: '{data_label}'")
    dg_cache[cache_key] = data
    return data


def _get_modelcif_protocol(
    protocol_steps, target_entities, models, ref_dbs, acc_data
):
    """Create the protocol for the ModelCIF file.

    Args:
        protocol_steps: Iterable of step dicts, each read for the keys
            ``input``, ``output``, ``name``, ``details``, ``method_type`` and
            (via :func:`_get_modelcif_protocol_software`)
            ``software_plus_params``.
        target_entities: See :func:`_get_modelcif_protocol_data`.
        models: See :func:`_get_modelcif_protocol_data`.
        ref_dbs: See :func:`_get_modelcif_protocol_data`.
        acc_data (dict): See :func:`_get_modelcif_protocol_data`.

    Returns:
        modelcif.protocol.Protocol: Protocol with one step per input step.
    """
    dg_cache = {}  # cached DataGroup objects per protocol
    protocol = modelcif.protocol.Protocol()
    for js_step in protocol_steps:
        sftwre = _get_modelcif_protocol_software(js_step)
        input_data = _get_modelcif_protocol_data(
            js_step["input"],
            target_entities,
            models,
            ref_dbs,
            acc_data,
            dg_cache,
        )
        output_data = _get_modelcif_protocol_data(
            js_step["output"],
            target_entities,
            models,
            ref_dbs,
            acc_data,
            dg_cache,
        )

        protocol.steps.append(
            modelcif.protocol.Step(
                input_data=input_data,
                output_data=output_data,
                name=js_step["name"],
                details=js_step["details"],
                software=sftwre,
            )
        )
        # method type is set as attribute on the step just added
        protocol.steps[-1].method_type = js_step["method_type"]
    return protocol


def _get_assoc_pae_file(entry_id, mdl_name):
    """Generate an associated file object to extract PAE to extra file.

    Args:
        entry_id: Entry ID handed to the associated file.
        mdl_name (str): Prefix of the file name, which becomes
            ``{mdl_name}_local_pairwise_qa.cif``.

    Returns:
        modelcif.associated.QAMetricsFile: File object for the
        ``_ma_qa_metric_local_pairwise`` category.
    """
    return modelcif.associated.QAMetricsFile(
        f"{mdl_name}_local_pairwise_qa.cif",
        categories=["_ma_qa_metric_local_pairwise"],
        copy_categories=["_ma_qa_metric"],
        entry_id=entry_id,
        entry_details="This file is an associated file consisting "
        + "of local pairwise QA metrics. This is a partial mmCIF "
        + "file and can be validated by merging with the main "
        + "mmCIF file containing the model coordinates and other "
        + "associated data.",
        details="Predicted aligned error",
    )


def _get_associated_file(
    fle_path, data, file_format="other", file_content="other"
):
    """Generate a modelcif.associated.File object for given data.

    Args:
        fle_path: Path of the associated file.
        data: Data object; its ``name`` is used as ``details`` of the file.
        file_format (str): Stored as ``file_format`` attribute of the file.
        file_content (str): Stored as ``file_content`` attribute of the file.

    Returns:
        modelcif.associated.File: The file object.
    """
    afile = modelcif.associated.File(
        fle_path,
        details=data.name,
        data=data,
    )
    # format and content are set as attributes after construction
    afile.file_format = file_format
    afile.file_content = file_content
    return afile


def _get_associated_files(mdl_name, arc_files):
    """Create entry for associated files.

    Args:
        mdl_name (str): Name stem of the zip file (``{mdl_name}.zip``).
        arc_files (list): Associated file objects to put into the zip file.

    Returns:
        modelcif.associated.Repository: Repository (created with an empty
        string as first argument) holding a single zip file.
    """
    # package all into zip file
    # NOTE: by convention MA expects zip file with same name as model-cif
    return modelcif.associated.Repository(
        "",
        [modelcif.associated.ZipFile(f"{mdl_name}.zip", files=arc_files)],
    )


def _get_sw_for_qe(steps, step_name):
    """Fetch suitable SW objects from protocol steps to use in QE.

    Args:
        steps: Protocol steps providing ``name`` and ``software``.
        step_name: Name of the step(s) to take the software from.

    Returns:
        ``None`` if no step has that name, the software of the step if exactly
        one step matches, otherwise a new :class:`modelcif.SoftwareGroup` with
        the software of all matching steps.
    """
    # to maximally reduce duplicates we reuse single groups
    # otherwise new group created using same SoftwareWithParameters objects
    sw_groups = [step.software for step in steps if step.name == step_name]
    if len(sw_groups) == 0:
        return None
    if len(sw_groups) == 1:
        return sw_groups[0]
    # each sw_group is a list (SoftwareGroup) of SoftwareWithParameters
    # ...and we remove duplicates...just in case
    # (objects with the same hash are treated as duplicates, last one wins)
    sw_dict = {}
    for sw_group in sw_groups:
        sw_dict.update({hash(swp): swp for swp in sw_group})
    return modelcif.SoftwareGroup(sw_dict.values())


def _package_associated_files(repo):
    """Compress associated files into single zip file and delete original.

    Args:
        repo: Repository whose ``files`` are archives; each archive provides
            ``path`` (zip file to write) and ``files`` (members with ``path``).
    """
    # zip settings tested for good speed vs compression
    for archive in repo.files:
        with zipfile.ZipFile(archive.path, "w", zipfile.ZIP_BZIP2) as cif_zip:
            for zfile in archive.files:
                cif_zip.write(zfile.path, arcname=zfile.path)
                # member is removed from disk right after it was added
                os.remove(zfile.path)


def _compress_cif_file(cif_file):
    """Compress cif file and delete original.

    Args:
        cif_file (str): Path of the file to compress; the result is written
            to ``cif_file + ".gz"``.
    """
    with open(cif_file, "rb") as f_in:
        with gzip.open(cif_file + ".gz", "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)
    os.remove(cif_file)
def store_as_modelcif(mdl_data, out_dir, mdl_fle_stem, compress):
    """Assemble model data into a ModelCIF file and write it to disk.

    A :class:`modelcif.System` is created and populated with entities,
    models, QA scores, associated files and the modelling protocol. The
    system is then dumped as ModelCIF file into ``out_dir``. Associated
    files are packaged into a ZIP archive and the mmCIF file is optionally
    gzip-compressed.

    Args:
        mdl_data (dict): Dictionary with model data. Expected keys:

            - ``title`` (str): Title of the modelling system.
            - ``mdl_id`` (str): Model identifier; converted to upper case.
            - ``model_details`` (str): Free-text description of the model.
            - ``audit_authors`` (list[str]): Author names for the audit
              record.
            - ``ranked_mdls`` (list[dict]): Per-model data; each dict is
              passed as keyword arguments to the model class.
            - ``target_entities`` (list[dict]): Target sequence data used to
              build asymmetric units and entities.
            - ``config_data`` (dict): Configuration data; must contain the
              key ``seq_dbs`` with reference database entries for the
              protocol.
            - ``protocol``: Modelling protocol description passed to
              :func:`_get_modelcif_protocol`.
            - ``acc_files`` (dict, optional): Mapping of labels to associated
              file descriptors, each containing ``details``,
              ``destination_file_name``, ``source_file_path``,
              ``file_format``, and ``file_content``.
            - ``af2_protocol_name`` (str, optional): If present, used to
              assign software metadata to AlphaFold 2 QA metric classes.

        out_dir (:class:`str` | :class:`~pathlib.Path`): Directory to write
            the output file(s) to.
        mdl_fle_stem (str): Base name for the output file, without extension.
        compress (bool): If ``True``, the mmCIF file is gzip-compressed after
            writing.

    Returns:
        str: File name of the written mmCIF file, relative to ``out_dir``.
        Ends with ``.cif`` or ``.cif.gz`` depending on ``compress``.
    """
    # allow more variables in Pylint, not gonna fix that atm
    # pylint: disable=too-many-locals
    logger.info("generating ModelCIF objects...")
    t_start = timer()

    # the system collects everything that goes into the file
    system = modelcif.System(
        title=mdl_data["title"],
        id=mdl_data["mdl_id"].upper(),
        model_details=mdl_data["model_details"],
    )

    # entities and asymmetric units (one entity per target sequence)
    asym_units = {}
    _get_modelcif_entities(mdl_data["target_entities"], asym_units, system)

    # audit authors
    system.authors.extend(mdl_data["audit_authors"])

    # models providing coordinates and scores
    any_pae = False
    qa_seconds = 0
    model_group = modelcif.model.ModelGroup()
    for mdl_kwargs in mdl_data["ranked_mdls"]:
        model = _OST2ModelCIF(
            assembly=modelcif.Assembly(asym_units.values()),
            asym=asym_units,
            **mdl_kwargs,
        )
        any_pae = any_pae or bool(model.incl_pae)
        # time spent on scores is reported separately
        t_qa = timer()
        model.add_scores()
        qa_seconds += timer() - t_qa
        model_group.append(model)
    system.model_groups.append(model_group)

    # associated files: PAE file (if any model has PAE) plus extra files
    acc_data = {}
    arc_files = []
    if any_pae:
        arc_files.append(_get_assoc_pae_file(system.id, mdl_fle_stem))
    for label, file_spec in mdl_data.get("acc_files", {}).items():
        # the data object is used by the protocol, the file object by the
        # associated files
        data_obj = modelcif.data.Data(file_spec["details"])
        acc_data[label] = data_obj
        dest_name = file_spec["destination_file_name"]
        arc_files.append(
            _get_associated_file(
                dest_name,
                data_obj,
                file_spec["file_format"],
                file_spec["file_content"],
            )
        )
        # file must be in out_dir for zipping later
        shutil.copyfile(
            file_spec["source_file_path"],
            os.path.join(out_dir, dest_name),
        )
    if arc_files:
        system.repositories.append(
            _get_associated_files(mdl_fle_stem, arc_files)
        )

    # protocol (data and steps)
    protocol = _get_modelcif_protocol(
        mdl_data["protocol"],
        system.entities,
        model_group,
        mdl_data["config_data"]["seq_dbs"],
        acc_data,
    )
    system.protocols.append(protocol)

    # software for the AF2 QA metric classes (set on the classes themselves)
    if "af2_protocol_name" in mdl_data:
        qe_software = _get_sw_for_qe(
            protocol.steps, mdl_data["af2_protocol_name"]
        )
        af2_metric_classes = (
            _GlobalPTM,
            _GlobalIpTM,
            _GlobalConfRankMultimer,
            _GlobalPLDDT,
            _LocalPLDDT,
            _LocalPairwisePAE,
        )
        for metric_class in af2_metric_classes:
            metric_class.software = qe_software
    logger.info(" (%.2fs; QA: %.2fs)", timer() - t_start, qa_seconds)

    # dump the system
    logger.info("write to disk...")
    t_start = timer()
    mdl_fle = f"{mdl_fle_stem}.cif"
    # NOTE: associated files are written relative to the working directory,
    # so we switch to out_dir for writing and always switch back afterwards.
    previous_dir = Path.cwd()
    os.chdir(out_dir)
    try:
        with open(mdl_fle, "w", encoding="ascii") as cif_handle:
            modelcif.dumper.write(cif_handle, [system])
        if arc_files:
            _package_associated_files(system.repositories[0])
        if compress:
            _compress_cif_file(mdl_fle)
            mdl_fle += ".gz"
    finally:
        os.chdir(previous_dir)
    logger.info(" (%.2fs)", timer() - t_start)

    return mdl_fle
# cache of ReferenceDatabase objects, keyed by the arguments used to build them
global_ref_dbs = {}


def _get_ref_db_object(name, url, version=None, release_date=None):
    """Return the :class:`modelcif.ReferenceDatabase` for the given settings.

    Objects are created once per combination of arguments and stored in
    ``global_ref_dbs``; repeated calls return the very same object. This is
    needed to remove duplicates in ModelCIF.

    Args:
        name: Name of the database.
        url: URL of the database.
        version: Version of the database.
        release_date: Release date of the database.

    Returns:
        modelcif.ReferenceDatabase: The (possibly cached) database object.
    """
    cache_key = (name, url, version, release_date)
    try:
        return global_ref_dbs[cache_key]
    except KeyError:
        pass
    # first request for this combination: create and remember it
    ref_db = modelcif.ReferenceDatabase(name, url, version, release_date)
    global_ref_dbs[cache_key] = ref_db
    return ref_db
def get_af2_sequence_dbs(config_data):
    """Get AF2 sequence databases and store them in ``config_data``.

    Builds a list of :class:`modelcif.ReferenceDatabase` objects based on the
    AlphaFold 2 configuration and writes it to ``config_data["seq_dbs"]``.
    The selection depends on the database preset, AF2 version, and whether
    multimer mode or templates are used.

    The AF2 version is compared numerically (``"2.10.0"`` is newer than
    ``"2.3.0"``); anything following the numeric part, e.g. ``"+galaxy1"``,
    is ignored. Version strings which do not start with a number are
    compared as plain strings.

    Args:
        config_data (dict): AF2 configuration data, as returned by
            :func:`get_af2_config`. Relevant keys:

            - ``af_version`` (str): AlphaFold 2 version string; versions
              before 2.3.0 get MGnify 2018_12 and Uniclust30 2018_08, later
              ones MGnify 2022_05 and UniRef30 2021_03.
            - ``use_small_bfd`` (bool): If ``True``, uses Reduced BFD instead
              of full BFD.
            - ``use_multimer`` (bool): If ``True``, adds TrEMBL and
              Swiss-Prot.
            - ``use_templates`` (bool): If ``True``, adds a PDB sequence
              database (PDB seqres for multimer, PDB70 for monomer).
            - ``up_version`` (str or None): UniProt release version, passed
              to the ``version`` attribute of UniRef90, TrEMBL, and
              Swiss-Prot database objects.
            - ``up_rel_date`` (datetime.date or None): UniProt release date,
              passed to the ``release_date`` attribute of UniRef90, TrEMBL,
              and Swiss-Prot database objects.
            - ``pdb_rel_date`` (datetime.date or None): PDB release date,
              passed to the ``release_date`` attribute of the PDB seqres
              database object.

    Returns:
        None: Results are written to ``config_data["seq_dbs"]`` as a list of
        :class:`modelcif.ReferenceDatabase` objects.
    """

    def _version_key(version):
        """Return leading numeric fields of ``version`` as 3-tuple or None."""
        numbers = []
        for field in version.split(".")[:3]:
            digits = field[: len(field) - len(field.lstrip("0123456789"))]
            if not digits:
                break
            numbers.append(int(digits))
            if digits != field:
                # a suffix such as "+galaxy1" ends the numeric part
                break
        if not numbers:
            return None
        return tuple(numbers + [0] * (3 - len(numbers)))

    up_version = config_data["up_version"]
    up_rel_date = config_data["up_rel_date"]
    # fill list of DBs
    seq_dbs = []
    if config_data["use_small_bfd"]:
        seq_dbs.append(
            _get_ref_db_object(
                "Reduced BFD",
                "https://storage.googleapis.com/alphafold-databases/"
                + "reduced_dbs/bfd-first_non_consensus_sequences.fasta.gz",
            )
        )
    else:
        seq_dbs.append(
            _get_ref_db_object(
                "BFD",
                "https://storage.googleapis.com/alphafold-databases/"
                + "casp14_versions/"
                + "bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt.tar"
                + ".gz",
                version="6a634dc6eb105c2e9b4cba7bbae93412",
            )
        )

    # plain string comparison would sort e.g. "2.10.0" before "2.3.0"
    af_version = config_data["af_version"]
    af_key = _version_key(af_version)
    if af_key is None:
        # not a numeric version: keep comparing as string
        before_v2_3 = af_version < "2.3.0"
    else:
        before_v2_3 = af_key < (2, 3, 0)

    if before_v2_3:
        seq_dbs.append(
            _get_ref_db_object(
                "MGnify",
                "https://storage.googleapis.com/alphafold-databases/"
                + "casp14_versions/mgy_clusters_2018_12.fa.gz",
                version="2018_12",
                release_date=datetime.date(2018, 12, 6),
            )
        )
        seq_dbs.append(
            _get_ref_db_object(
                "Uniclust30",
                "https://storage.googleapis.com/alphafold-databases/"
                + "casp14_versions/uniclust30_2018_08_hhsuite.tar.gz",
                version="2018_08",
                release_date=None,
            )
        )
    else:
        # Don't add a linebreak to URLs, allow long line in Pylint
        # pylint: disable=line-too-long
        # NOTE: release date according to https://ftp.ebi.ac.uk/pub/databases/metagenomics/peptide_database/2022_05/
        # pylint: enable=line-too-long
        seq_dbs.append(
            _get_ref_db_object(
                "MGnify",
                "https://storage.googleapis.com/alphafold-databases/"
                + "v2.3/mgy_clusters_2022_05.fa.gz",
                version="2022_05",
                release_date=datetime.date(2022, 5, 6),
            )
        )
        seq_dbs.append(
            _get_ref_db_object(
                "UniRef30",
                "https://storage.googleapis.com/alphafold-databases/"
                + "v2.3/UniRef30_2021_03.tar.gz",
                version="2021_03",
                release_date=None,
            )
        )

    if config_data["use_multimer"]:
        seq_dbs.append(
            _get_ref_db_object(
                "TrEMBL",
                "ftp://ftp.ebi.ac.uk/pub/databases/uniprot/current_release/"
                + "knowledgebase/complete/uniprot_trembl.fasta.gz",
                version=up_version,
                release_date=up_rel_date,
            )
        )
        seq_dbs.append(
            _get_ref_db_object(
                "Swiss-Prot",
                "ftp://ftp.ebi.ac.uk/pub/databases/uniprot/current_release/"
                + "knowledgebase/complete/uniprot_sprot.fasta.gz",
                version=up_version,
                release_date=up_rel_date,
            )
        )

    # UniRef90 is used in all setups
    seq_dbs.append(
        _get_ref_db_object(
            "UniRef90",
            "ftp://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref90/"
            + "uniref90.fasta.gz",
            version=up_version,
            release_date=up_rel_date,
        )
    )

    if config_data["use_templates"]:
        if config_data["use_multimer"]:
            # uses whatever is latest set of PDB sequences
            # see AF2 scripts/download_pdb_seqres.sh
            seq_dbs.append(
                _get_ref_db_object(
                    "PDB seqres",
                    "https://files.wwpdb.org/pub/pdb/derived_data/"
                    + "pdb_seqres.txt",
                    release_date=config_data["pdb_rel_date"],
                )
            )
        else:
            # fixed version used in AF2 scripts/download_pdb70.sh
            seq_dbs.append(
                _get_ref_db_object(
                    "PDB70",
                    "http://wwwuser.gwdg.de/~compbiol/data/hhsuite/databases/"
                    + "hhsuite_dbs/old-releases/pdb70_from_mmcif_200401.tar.gz",
                    release_date=datetime.date(2020, 4, 1),
                )
            )

    config_data["seq_dbs"] = seq_dbs
def get_af2_config(
    af_version,
    af_params=None,
    custom_ranking=None,
    up_version=None,
    up_rel_date=None,
    pdb_rel_date=None,
):
    """Get configuration data for an AlphaFold 2 modelling run.

    Derives modelling settings from the provided AlphaFold 2 version and
    parameters, builds a human-readable description of the run, and returns a
    configuration dictionary for use by downstream functions.

    Version dependent defaults are chosen by comparing ``af_version``
    numerically (``"2.10.0"`` is newer than ``"2.3.0"``); anything following
    the numeric part is ignored. Version strings which do not start with a
    number are compared as plain strings.

    Args:
        af_version (str): AlphaFold 2 version string (e.g. ``"2.3.2"``).
        af_params (dict, optional): Non-default AlphaFold 2 parameters.
            Recognised keys include ``model_preset``, ``db_preset``,
            ``num_multimer_predictions_per_model``, ``models_to_relax``,
            ``run_relax``, ``max_template_date``, and ``num_ensemble``.
            Defaults to an empty dict if not provided.
        custom_ranking (str, optional): Custom model ranking expression. If
            not provided, defaults to ``"pLDDT"`` for monomer runs and
            ``"ipTM*0.8+pTM*0.2"`` for multimer runs.
        up_version (str, optional): UniProt release in ``"YYYY_MM"`` format
            current at the time of AF2 installation.
            (see https://www.uniprot.org/release-notes)
        up_rel_date (datetime.date, optional): Release date corresponding to
            ``up_version``.
        pdb_rel_date (datetime.date, optional): PDB release date current at
            the time of AF2 installation. Relevant for multimer runs using
            templates.

    Returns:
        dict: Configuration data for downstream functions. Keys:

        - ``af_params`` (dict): Parameters as passed (or empty dict).
        - ``af_version`` (str): AlphaFold 2 version string as passed.
        - ``description`` (str): Human-readable run description.
        - ``use_templates`` (bool): Whether templates were used.
        - ``use_small_bfd`` (bool): Whether the reduced BFD database setting
          was used.
        - ``use_multimer`` (bool): Whether multimer mode was used.
        - ``up_version`` (str or None): UniProt release as passed.
        - ``up_rel_date`` (datetime.date or None): UniProt release date as
          passed.
        - ``pdb_rel_date`` (datetime.date or None): PDB release date as
          passed.
        - ``seq_dbs`` (list[modelcif.ReferenceDatabase]): Sequence DB objects.

    Raises:
        AssertionError: If both ``models_to_relax`` and ``run_relax`` are
            given in ``af_params``.
    """
    # Disable some Pylint warnings, not going to be fixed atm
    # pylint: disable=too-many-arguments,too-many-positional-arguments
    # pylint: disable=too-many-locals,too-many-branches,too-many-statements

    def _version_key(version):
        """Return leading numeric fields of ``version`` as 3-tuple or None."""
        numbers = []
        for field in version.split(".")[:3]:
            digits = field[: len(field) - len(field.lstrip("0123456789"))]
            if not digits:
                break
            numbers.append(int(digits))
            if digits != field:
                # a suffix such as "+galaxy1" ends the numeric part
                break
        if not numbers:
            return None
        return tuple(numbers + [0] * (3 - len(numbers)))

    # plain string comparison would sort e.g. "2.10.0" before "2.3.0"
    af_key = _version_key(af_version)

    def _af_before(reference):
        """Return True if ``af_version`` is older than ``reference``."""
        if af_key is None:
            # not a numeric version: keep comparing as string
            return af_version < reference
        return af_key < _version_key(reference)

    if af_params is None:
        af_params = {}

    # get defaults
    model_preset = af_params.get("model_preset", "monomer")
    db_preset = af_params.get("db_preset", "full_dbs")
    use_multimer = model_preset == "multimer"

    # 5 models unless multimer with extra flag
    if use_multimer:
        if "num_multimer_predictions_per_model" in af_params:
            num_seeds = af_params["num_multimer_predictions_per_model"]
        else:
            num_seeds = 1 if _af_before("2.2.0") else 5
    else:
        num_seeds = 1
    num_models = num_seeds * 5

    # default relax setting changed over time; translate to latest
    if "models_to_relax" in af_params:
        models_to_relax = af_params["models_to_relax"]
        assert "run_relax" not in af_params
    elif "run_relax" in af_params:
        models_to_relax = "all" if af_params["run_relax"] else "none"
    else:
        models_to_relax = "all" if _af_before("2.3.2") else "best"

    # templates turned off if max_template_date older than oldest PDB
    # (ISO formatted dates sort correctly as strings)
    use_templates = (
        "max_template_date" not in af_params
        or af_params["max_template_date"] >= "1976-05-19"
    )

    # build description text
    description = (
        f"Model generated using AlphaFold (v{af_version}) "
        f"producing {num_models} {model_preset} models "
    )
    if num_seeds > 1:
        description += f"({num_seeds} random seeds per parameter set) "
    if use_multimer and not _af_before("2.3.0"):
        description += "with up to 20 recycles "
    else:
        description += "with 3 recycles "
    if "num_ensemble" in af_params:
        description += f"and {af_params['num_ensemble']} ensemble "
    description += "each, "

    if models_to_relax == "all":
        description += "with AMBER relaxation, "
    elif models_to_relax == "best":
        description += "with AMBER relaxation on best model, "
    else:
        description += "without model relaxation, "

    if use_templates:
        if "max_template_date" in af_params:
            mtd_str = f" up to date {af_params['max_template_date']}"
        else:
            mtd_str = ""
        description += f"using templates{mtd_str}, "
    else:
        description += "without templates, "

    if custom_ranking:
        rank_str = custom_ranking
    elif use_multimer:
        rank_str = "ipTM*0.8+pTM*0.2"
    else:
        rank_str = "pLDDT"

    msa_str = "MSAs" if use_multimer else "an MSA"
    description += (
        f"ranked by {rank_str}, starting from {msa_str} with "
        f"{db_preset} setting."
    )

    config_data = {
        "af_params": af_params,
        "af_version": af_version,
        "description": description,
        "use_templates": use_templates,
        "use_small_bfd": db_preset == "reduced_dbs",
        "use_multimer": use_multimer,
        "up_version": up_version,
        "up_rel_date": up_rel_date,
        "pdb_rel_date": pdb_rel_date,
    }
    # adds "seq_dbs" to config_data
    get_af2_sequence_dbs(config_data)
    return config_data
def get_galaxy_software(version):
    """Describe Galaxy AlphaFold 2 as software dictionary for a ModelCIF file.

    The dictionary has the layout needed to create a
    :class:`modelcif.Software` object. The tool URL is built from the part of
    ``version`` following ``"galaxy"``.

    Args:
        version (str): Galaxy AlphaFold 2 version string in the format
            ``[AF2v]+galaxy[X]``, e.g. ``"2.3.2+galaxy1"``.

    Returns:
        dict: Software descriptor with keys ``name``, ``classification``,
        ``description``, ``citation``, ``location``, ``type``, and
        ``version``.
    """
    # text after the last "galaxy" (the whole string if there is none)
    galaxy_suffix = version.rpartition("galaxy")[2]
    tool_url = (
        "https://usegalaxy.eu/root?tool_id=toolshed.g2.bx.psu.edu/repos/"
        "galaxy-australia/alphafold2/alphafold/2.3.2+galaxy" + galaxy_suffix
    )
    citation = ihm.Citation(
        pmid="38769056",
        title=(
            "The Galaxy platform for accessible, reproducible, and "
            "collaborative data analyses: 2024 update."
        ),
        journal="Nucleic Acids Res",
        volume=52,
        page_range=["W83", "W94"],
        year=2024,
        authors=["Galaxy Community"],
        doi="10.1093/nar/gkae410",
    )
    software = {
        "name": "Galaxy AlphaFold 2",
        "classification": "model building",
        "description": "Structure prediction",
        "citation": citation,
        "location": tool_url,
        "type": "package",
        "version": version,
    }
    return software
def get_cf_db_versions(dt, num_days_unk=1):
    """Look up the ColabFold MSA server database versions used on a date.

    Provides the UniRef30 version, the name of the template database and the
    version of the template database. Based on
    https://github.com/sokrypton/ColabFold/wiki/MSA-Server-Database-History.

    Args:
        dt (datetime.date): Date for which to look up the database versions.
        num_days_unk (int): Number of days around a database switch date
            within which the result is considered unknown. Defaults to 1.

    Returns:
        tuple: A 3-tuple of ``(ur30_db_version, tpl_db, tpl_db_version)``,
        each a :class:`str`. Values are set to ``"UNK"`` if ``dt`` falls
        within ``num_days_unk`` days of a switch date, if the template
        database version is unknown, or if no matching date range is found.
    """
    unknown = ("UNK", "UNK", "UNK")
    # newest switch first: (date of switch, databases in use from then on)
    history = (
        (datetime.date(2023, 7, 31), ("2023_02", "PDB100", "230517")),
        (datetime.date(2023, 7, 27), ("2022_02", "PDB70", "220313")),
        (datetime.date(2023, 6, 12), ("2023_02", "PDB100", "230517")),
        (datetime.date(2022, 7, 13), ("2022_02", "PDB70", "220313")),
        (datetime.date(2021, 1, 1), ("2021_03", "PDB70", "UNK")),
    )
    for switched_on, versions in history:
        days_since = (dt - switched_on).days
        if days_since > num_days_unk:
            # safely after this switch (and before any newer one)
            return versions
        if days_since >= -num_days_unk:
            # too close to the switch to tell which databases were used
            return unknown
    # older than the oldest known switch
    return unknown
# LocalWords: LocalColabFold Args str bool ColabFold config func msa num ur # LocalWords: localcolabfold UniRef tpl ValueError