# Source code for modelarchive.modelcif.fix_af2

"""ModelCIF files generated by AlphaFold 2 deviate from the official ModelCIF
definition dictionary in specific cases. Here are functions to fix this.
"""

# Copyright (c) 2026, SIB - Swiss Institute of Bioinformatics and
#                     Biozentrum - University of Basel
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# Won't shorten the module atm, allow in Pylint
# pylint: disable=too-many-lines

from pathlib import Path
from timeit import default_timer as timer
import datetime
import gzip
import logging
import os
import shutil
import zipfile

import ihm.citations
import modelcif.dumper
import modelcif.model
import modelcif.protocol
import modelcif.qa_metric
import modelcif.associated
import numpy as np

logger = logging.getLogger(__name__)


# These classes follow python-modelcif conventions; allow too few public
# methods in Pylint.
# pylint: disable=too-few-public-methods


class _LPeptideAlphabetWithXO(ihm.LPeptideAlphabet):
    """L-peptide alphabet extended by two one-letter codes: 'X' for unknown
    residues and 'O' for pyrrolysine as an accepted non-default amino acid
    ('U' is already part of the default alphabet)."""

    # The extra entry mirrors the LPeptideAlphabet definition in
    # https://python-ihm.readthedocs.io/en/latest/_modules/ihm.html
    # with the values for 'O' taken from https://files.rcsb.org/view/1NTH.cif.

    def __init__(self):
        """Set up the default alphabet, then register the extra codes."""
        super().__init__()
        code_table = self._comps
        # 'X' is an alias for the existing unknown-residue component
        code_table["X"] = code_table["UNK"]
        pyl_fields = ("PYL", "O", "O", "PYRROLYSINE", "C12 H21 N3 O3")
        code_table["O"] = ihm.LPeptideChemComp(*pyl_fields)


class _LocalPLDDT(modelcif.qa_metric.Local, modelcif.qa_metric.PLDDT):
    """Predicted accuracy according to the CA-only lDDT in [0,100]"""

    # Per-residue pLDDT: instantiated in this module with a residue object
    # and the score value for that residue.
    # NOTE(review): python-modelcif is understood to use the class docstring
    # as the metric description written to the output file -- confirm before
    # rewording the docstring above.
    name = "pLDDT"
    # no software object is linked to this metric
    software = None


class _LocalPairwisePAE(
    modelcif.qa_metric.LocalPairwise, modelcif.qa_metric.PAE
):
    """Predicted aligned error (in Angstroms)"""

    # Residue-pair PAE: instantiated in this module with two residue objects
    # and the (rounded) PAE value for that pair.
    # NOTE(review): python-modelcif is understood to use the class docstring
    # as the metric description written to the output file -- confirm before
    # rewording the docstring above.
    name = "PAE"
    # no software object is linked to this metric
    software = None


class _GlobalPLDDT(modelcif.qa_metric.Global, modelcif.qa_metric.PLDDT):
    """Predicted accuracy according to the CA-only lDDT in [0,100]"""

    # Model-wide pLDDT: instantiated in this module with a single value
    # (the ``plddt_global`` score).
    # NOTE(review): python-modelcif is understood to use the class docstring
    # as the metric description written to the output file -- confirm before
    # rewording the docstring above.
    name = "pLDDT"
    # no software object is linked to this metric
    software = None


class _GlobalPTM(modelcif.qa_metric.Global, modelcif.qa_metric.PTM):
    """Predicted accuracy according to the TM-score score in [0,1]"""

    # Model-wide pTM: instantiated in this module with a single value
    # (the ``ptm`` score).
    # NOTE(review): python-modelcif is understood to use the class docstring
    # as the metric description written to the output file -- confirm before
    # rewording the docstring above.
    name = "pTM"
    # no software object is linked to this metric
    software = None



[docs]
class GlobalIpTM(modelcif.qa_metric.Global, modelcif.qa_metric.IpTM):
    """Predicted protein-protein interface score based on TM-score in [0,1]"""

    # Model-wide ipTM: instantiated in this module with a single value
    # (the ``iptm`` score).
    # NOTE(review): python-modelcif is understood to use the class docstring
    # as the metric description written to the output file -- confirm before
    # rewording the docstring above.
    name = "ipTM"
    # no software object is linked to this metric
    software = None




[docs]
class GlobalConfRankMultimer(
    modelcif.qa_metric.Global, modelcif.qa_metric.NormalizedScore
):
    """Default ranking score used by AlphaFold-Multimer"""

    # Model-wide ranking confidence: instantiated in this module with a
    # single value (the ``confrank`` score).
    # NOTE(review): python-modelcif is understood to use the class docstring
    # as the metric description written to the output file -- confirm before
    # rewording the docstring above.
    name = "ranking-confidence (ipTM*0.8+pTM*0.2)"
    # no software object is linked to this metric
    software = None



# pylint: enable=too-few-public-methods



[docs]
def get_cf_sequence_dbs(config_data):
    """Resolve the ColabFold reference databases named in ``config_data``.

    The database identifiers in ``config_data["seq_db_keys"]`` (and, if set,
    the template database in ``config_data["tpl_db"]``) are translated into
    :class:`modelcif.ReferenceDatabase` objects using a hardcoded table of
    databases known to be used by ColabFold. The result is stored in
    ``config_data["seq_dbs"]``, sequence databases first, template database
    last.

    Args:
        config_data (dict): Configuration data dictionary, updated in place.
            Keys read: ``seq_db_keys`` (list of :class:`str`) with the
            sequence database identifiers; ``ur30_db_version`` (:class:`str`
            or ``None``), only read if ``"UniRef"`` is among the identifiers;
            ``tpl_db`` (:class:`str` or ``None``) with the optional template
            database identifier; ``tpl_db_version`` (:class:`str` or
            ``None``), only read if ``tpl_db`` is set. Key written:
            ``seq_dbs``, a list of :class:`modelcif.ReferenceDatabase`
            instances.

    Returns:
        None

    Raises:
        ValueError: If ``"UniRef"`` is requested but ``ur30_db_version`` is
            ``None``.
        ValueError: If ``tpl_db`` is set but ``tpl_db_version`` is ``None``.
        ValueError: If a resolved identifier is not part of the hardcoded
            table of known ColabFold databases.
    """
    # Hardcoded table of known DBs used in ColabFold
    # -> see also notes in get_cf_config
    colabfold_url = "https://wwwuser.gwdg.de/~compbiol/colabfold/"
    hhsuite_url = (
        "https://wwwuser.gwdg.de/~compbiol/data/hhsuite/databases/"
        "hhsuite_dbs/"
    )
    known_dbs = {
        "UniRef_2021_03": modelcif.ReferenceDatabase(
            "UniRef30",
            f"{colabfold_url}uniref30_2103.tar.gz",
            version="2021_03",
        ),
        "UniRef_2022_02": modelcif.ReferenceDatabase(
            "UniRef30",
            f"{colabfold_url}uniref30_2202.tar.gz",
            version="2022_02",
        ),
        "UniRef_2023_02": modelcif.ReferenceDatabase(
            "UniRef30",
            f"{colabfold_url}uniref30_2302.tar.gz",
            version="2023_02",
        ),
        "Environmental": modelcif.ReferenceDatabase(
            "ColabFold DB",
            f"{colabfold_url}colabfold_envdb_202108.tar.gz",
            version="2021_08",
        ),
        "PDB100_230517": modelcif.ReferenceDatabase(
            "PDB100",
            f"{hhsuite_url}pdb100_foldseek_230517.tar.gz",
            release_date=datetime.date(2023, 5, 17),
        ),
        "PDB70_211027": modelcif.ReferenceDatabase(
            "PDB70",
            f"{hhsuite_url}pdb70_from_mmcif_211027.tar.gz",
            release_date=datetime.date(2021, 10, 27),
        ),
        "PDB70_211117": modelcif.ReferenceDatabase(
            "PDB70",
            f"{hhsuite_url}pdb70_from_mmcif_211117.tar.gz",
            release_date=datetime.date(2021, 11, 17),
        ),
        "PDB70_220313": modelcif.ReferenceDatabase(
            "PDB70",
            f"{hhsuite_url}pdb70_from_mmcif_220313.tar.gz",
            release_date=datetime.date(2022, 3, 13),
        ),
    }

    # translate the requested identifiers into keys of the table above
    wanted_keys = []
    for db_key in config_data["seq_db_keys"]:
        if db_key != "UniRef":
            wanted_keys.append(db_key)
            continue
        # UniRef entries are versioned, the version is part of the key
        if config_data["ur30_db_version"] is None:
            raise ValueError("Cannot use UniRef without version")
        wanted_keys.append(f"UniRef_{config_data['ur30_db_version']}")

    # the template DB (if any) always comes last
    template_db = config_data["tpl_db"]
    if template_db is not None:
        template_version = config_data["tpl_db_version"]
        if template_version is None:
            raise ValueError("Cannot have tpl DB without version")
        wanted_keys.append(f"{template_db}_{template_version}")

    # look everything up, refusing anything we do not know
    resolved_dbs = []
    for wanted_key in wanted_keys:
        if wanted_key not in known_dbs:
            raise ValueError(f"Unknown seq. DB {wanted_key}")
        resolved_dbs.append(known_dbs[wanted_key])
    config_data["seq_dbs"] = resolved_dbs




[docs]
def get_localcolabfold_software(version=None):
    """Describe LocalColabFold as a :class:`dict` for creating a software
    object.

    Args:
        version (str): Version of LocalColabFold. Should only be ``None`` if the
            version is genuinely unavailable.

    Returns:
        dict: Software metadata (``name``, ``classification``,
        ``description``, ``citation``, ``location``, ``type``, ``version``)
        suitable for creating a ModelCIF software object. No citation is
        attached to LocalColabFold.
    """
    sw_info = dict(
        name="LocalColabFold",
        classification="model building",
        description="Structure prediction",
        citation=None,
        location="https://github.com/YoshitakaMo/localcolabfold",
        type="package",
    )
    # version goes last to keep the established key order
    sw_info["version"] = version
    return sw_info




[docs]
def get_colabfold_software(version=None):
    """Describe ColabFold as a :class:`dict` for creating a software object.

    Args:
        version (str): Version of ColabFold. Should only be ``None`` if the
            version is genuinely unavailable.

    Returns:
        dict: Software metadata (``name``, ``classification``,
        ``description``, ``citation``, ``location``, ``type``, ``version``)
        suitable for creating a ModelCIF software object.
    """
    # (key, value) pairs in the order the keys should appear in the result
    sw_fields = (
        ("name", "ColabFold"),
        ("classification", "model building"),
        ("description", "Structure prediction"),
        ("citation", ihm.citations.colabfold),
        ("location", "https://github.com/sokrypton/ColabFold"),
        ("type", "package"),
        ("version", version),
    )
    return dict(sw_fields)




[docs]
def get_mmseqs2_software(version=None):
    """Describe MMseqs2 as a :class:`dict` for creating a software object.

    Args:
        version (str): Version of MMseqs2. Should only be ``None`` if the
            version is genuinely unavailable.

    Returns:
        dict: Software metadata (``name``, ``classification``,
        ``description``, ``citation``, ``location``, ``type``, ``version``)
        suitable for creating a ModelCIF software object.
    """
    # everything but the version is fixed
    fixed_fields = {
        "name": "MMseqs2",
        "classification": "data collection",
        "description": "Many-against-Many sequence searching",
        "citation": ihm.citations.mmseqs2,
        "location": "https://github.com/soedinglab/mmseqs2",
        "type": "package",
    }
    return {**fixed_fields, "version": version}




[docs]
def get_af2_software(version=None, is_multimer=False):
    """Describe AlphaFold 2 as a :class:`dict` for creating a software object.

    Args:
        version (str): Version of AlphaFold 2. Should only be ``None`` if the
            version is genuinely unavailable.
        is_multimer (bool): If ``True``, describe AlphaFold-Multimer instead
            of AlphaFold 2.

    Returns:
        dict: Software metadata suitable for creating a ModelCIF software
        object. Only the ``name`` and ``citation`` entries depend on
        ``is_multimer``, all other entries are shared.
    """
    if is_multimer:
        sw_name = "AlphaFold-Multimer"
        # preprint citation is assembled here
        sw_citation = ihm.Citation(
            pmid=None,
            title="Protein complex prediction with AlphaFold-Multimer.",
            journal="bioRxiv",
            volume=None,
            page_range=None,
            year=2021,
            authors=[
                "Evans, R.",
                "O'Neill, M.",
                "Pritzel, A.",
                "Antropova, N.",
                "Senior, A.",
                "Green, T.",
                "Zidek, A.",
                "Bates, R.",
                "Blackwell, S.",
                "Yim, J.",
                "Ronneberger, O.",
                "Bodenstein, S.",
                "Zielinski, M.",
                "Bridgland, A.",
                "Potapenko, A.",
                "Cowie, A.",
                "Tunyasuvunakool, K.",
                "Jain, R.",
                "Clancy, E.",
                "Kohli, P.",
                "Jumper, J.",
                "Hassabis, D.",
            ],
            doi="10.1101/2021.10.04.463034",
        )
    else:
        sw_name = "AlphaFold"
        sw_citation = ihm.citations.alphafold2

    return {
        "name": sw_name,
        "classification": "model building",
        "description": "Structure prediction",
        "citation": sw_citation,
        "location": "https://github.com/deepmind/alphafold",
        "type": "package",
        "version": version,
    }




[docs]
def get_cf_sw_plus_params(config_data, use_localcolabfold=False):
    """Assemble software and parameters for a ColabFold protocol step.

    The list always contains ColabFold (with its parameters) and AlphaFold
    (or AlphaFold-Multimer); MMseqs2 is inserted between the two if it was
    used, and LocalColabFold is put first on request.

    Args:
        config_data (dict): ColabFold configuration data as returned by
            :func:`get_cf_config`. Keys read: ``cf_version``, ``cf_params``,
            ``use_mmseqs`` and ``use_multimer``.
        use_localcolabfold (bool): If ``True``, prepend LocalColabFold to the
            list of software entries.

    Returns:
        list[tuple[dict, dict]]: A list of ``(software, parameters)`` tuples
        suitable for use in a protocol. Only ColabFold carries parameters,
        all other entries come with an empty parameter dictionary.
    """
    if use_localcolabfold:
        entries = [(get_localcolabfold_software(), {})]
    else:
        entries = []

    colabfold = get_colabfold_software(config_data["cf_version"])
    entries.append((colabfold, config_data["cf_params"]))

    if config_data["use_mmseqs"]:
        entries.append((get_mmseqs2_software(), {}))

    alphafold = get_af2_software(is_multimer=config_data["use_multimer"])
    entries.append((alphafold, {}))
    return entries




[docs]
def get_cf_config(
    cf_config, ur30_db_version=None, tpl_db=None, tpl_db_version=None
):
    """Process a ColabFold configuration into a standardised data dictionary.

    The passed ``cf_config`` is not modified, all processing happens on a
    copy.

    Args:
        cf_config (dict): Raw ColabFold configuration data, typically read
            from a ColabFold configuration file. Must contain the keys
            ``version``, ``msa_mode``, ``model_type``, ``num_recycles``,
            ``num_models``, ``use_templates``, and ``rank_by``. Optional keys
            include ``commit``, ``pair_mode`` (required for multimer models
            which use an MSA), ``recycle_early_stop_tolerance``,
            ``stop_at_score``, ``num_seeds``, ``use_amber``, and
            ``num_relax``. Entries set to ``None`` are assumed to mean "use
            the default" and are dropped from the stored parameters.
        ur30_db_version (str, optional): Version of the UniRef30 database
            used. Should only be ``None`` if the database was not used.
        tpl_db (str, optional): Template database used. Accepted values are
            ``"PDB70"``, ``"PDB100"``, or ``None`` if no template database
            was used. Ignored (reset to ``None``) if ``use_templates`` is
            false in ``cf_config``.
        tpl_db_version (str, optional): Version of the template database
            used. Should only be ``None`` if the database was not used.
            Ignored (reset to ``None``) if ``use_templates`` is false in
            ``cf_config``.

    Returns:
        dict: A dictionary with processed ColabFold configuration data for
        further use in model preparation. Keys: ``cf_params``,
        ``cf_version``, ``seq_db_keys``, ``use_mmseqs``, ``use_msa``,
        ``ur30_db_version``, ``tpl_db``, ``tpl_db_version``,
        ``use_multimer``, ``multimer_version``, ``description`` and
        ``seq_dbs`` (added by :func:`get_cf_sequence_dbs`).

    Raises:
        KeyError: If a required key is missing from ``cf_config``;
            ``num_models`` and ``use_templates`` also count as missing when
            set to ``None``.
        ValueError: If ``msa_mode`` is not one of the known values
            (including ``None``).
        ValueError: If ``model_type`` is not one of the known values
            (including ``None``).
        ValueError: If ``rank_by`` is not one of the known values
            (including ``None``).
        ValueError: If a multimer model uses an MSA and ``pair_mode`` is
            missing or not one of the known values.
        ValueError: If raised by :func:`get_cf_sequence_dbs` for the
            resolved databases.
    """
    # Not going to reduce no. of variables, branches or statements at this
    # point, allow in Pylint
    # pylint: disable=too-many-locals,too-many-branches,too-many-statements

    # keep version indep. of params (and add commit since versions are meh)
    cf_version = cf_config["version"]
    if cf_config.get("commit") is not None:
        cf_version += f" ({cf_config['commit'][:7]})"
    # drop fields which are not relevant for model building (working on a
    # copy so the caller's dictionary stays untouched)
    cf_config = cf_config.copy()
    for key in ["num_queries", "commit", "version", "user_agent"]:
        cf_config.pop(key, None)

    # NOTE: following code from
    # https://github.com/sokrypton/ColabFold/blob/main/colabfold/batch.py to
    # understand config
    # -> see also https://github.com/sokrypton/ColabFold/wiki/v1.5.0

    # deal with old names (some settings changed name in v1.5)
    # -> code taken almost verbatim from https://github.com/sokrypton/ColabFold
    old_names = {
        "MMseqs2 (UniRef+Environmental)": "mmseqs2_uniref_env",
        "MMseqs2 (UniRef only)": "mmseqs2_uniref",
        "unpaired+paired": "unpaired_paired",
        "AlphaFold2-multimer-v1": "alphafold2_multimer_v1",
        "AlphaFold2-multimer-v2": "alphafold2_multimer_v2",
        "AlphaFold2-multimer-v3": "alphafold2_multimer_v3",
        "AlphaFold2-ptm": "alphafold2_ptm",
        "AlphaFold2": "alphafold2",
    }
    msa_mode = old_names.get(cf_config["msa_mode"], cf_config["msa_mode"])
    if "pair_mode" in cf_config:
        pair_mode = old_names.get(
            cf_config["pair_mode"], cf_config["pair_mode"]
        )
    else:
        pair_mode = None
    model_type = old_names.get(cf_config["model_type"], cf_config["model_type"])
    # A null (or otherwise non-string) model_type must make it to the
    # "Unknown model_type" check below instead of failing in a substring test
    multimer_in_name = isinstance(model_type, str) and "multimer" in model_type

    # fix v1.5 defaults for num_recycles and recycle_early_stop_tolerance
    # -> def. (set as "null" in config):
    #    - num_recycles == 20 if alphafold2_multimer_v3 else 3
    #    - recycle_early_stop_tolerance == 0.5 if multimer else 0.0
    # -> valid from 1.5.0 until 1.5.5 (and probably later)
    # -> defined in alphafold/model/config.py of steineggerlab/alphafold repo
    if "num_recycles" in cf_config and cf_config["num_recycles"] is None:
        if multimer_in_name and model_type not in [
            "alphafold2_multimer_v1",
            "alphafold2_multimer_v2",
        ]:
            cf_config["num_recycles"] = 20
        else:
            cf_config["num_recycles"] = 3
    if (
        "recycle_early_stop_tolerance" in cf_config
        and cf_config["recycle_early_stop_tolerance"] is None
    ):
        cf_config["recycle_early_stop_tolerance"] = (
            0.5 if multimer_in_name else 0.0
        )

    # remove null config entries (ASSUME: None = use default)
    # -> keep the unfiltered version around: values checked against known
    #    choices are read from there so that a null value is reported as
    #    unknown value (ValueError) and not as missing key (KeyError)
    raw_config = cf_config
    cf_config = {k: v for k, v in raw_config.items() if v is not None}

    # fetch relevant data
    # -> MSA mode
    if msa_mode == "mmseqs2_uniref_env":
        seq_dbs = ["UniRef", "Environmental"]
        use_mmseqs = True
        use_msa = True
    elif msa_mode == "mmseqs2_uniref":
        seq_dbs = ["UniRef"]
        use_mmseqs = True
        use_msa = True
    elif msa_mode == "single_sequence":
        seq_dbs = []
        use_mmseqs = False
        use_msa = False
    elif msa_mode == "custom":
        seq_dbs = []
        use_mmseqs = False
        use_msa = True
    else:
        raise ValueError(f"Unknown msa_mode {raw_config['msa_mode']}")

    # -> model type
    if model_type == "alphafold2_multimer_v1":
        # AF-Multimer as introduced in AlphaFold v2.1.0
        use_multimer = True
        multimer_version = 1
    elif model_type == "alphafold2_multimer_v2":
        # AF-Multimer as introduced in AlphaFold v2.2.0
        use_multimer = True
        multimer_version = 2
    elif model_type == "alphafold2_multimer_v3":
        # AF-Multimer as introduced in AlphaFold v2.3.0
        use_multimer = True
        multimer_version = 3
    elif model_type == "alphafold2_ptm":
        use_multimer = False
        multimer_version = None
    else:
        raise ValueError(f"Unknown model_type {raw_config['model_type']}")

    # write modeling description
    mdl_description = f"Model generated using ColabFold v{cf_version}"
    if use_multimer:
        mdl_description += f" with AlphaFold-Multimer (v{multimer_version})"
    else:
        mdl_description += " with AlphaFold"
    # early stopping feature of ColabFold
    upto_mdl = ""
    upto_rec = ""
    if cf_config.get("stop_at_score", 100) < 100:
        upto_mdl = "up to "
        upto_rec = "up to "
    if cf_config.get("recycle_early_stop_tolerance", 0) > 0:
        upto_rec = "up to "
    if cf_config.get("num_seeds", 1) > 1:
        mdl_str = (
            f"{cf_config['num_models'] * cf_config['num_seeds']} "
            f"models ({cf_config['num_seeds']} random seeds per "
            f"parameter set)"
        )
    else:
        mdl_str = f"{cf_config['num_models']} models"
    mdl_description += (
        f" producing {upto_mdl}{mdl_str} with {upto_rec}"
        f"{cf_config['num_recycles']} recycles each"
    )
    if cf_config.get("use_amber", False) or cf_config.get("num_relax", 0) > 0:
        mdl_description += ", with AMBER relaxation"
    else:
        mdl_description += ", without model relaxation"
    if cf_config["use_templates"]:
        # tpl_db == None meant to mean that custom templates were used
        # -> no need to stress it but just visible in search DBs
        mdl_description += ", using templates"
    else:
        mdl_description += ", without templates"
        tpl_db = None
        tpl_db_version = None
    # read from the unfiltered config: missing key -> KeyError, null value
    # -> ValueError like any other unknown value
    rank_by = raw_config["rank_by"]
    if rank_by == "plddt":
        mdl_description += ", ranked by pLDDT"
    elif rank_by == "ptmscore":
        mdl_description += ", ranked by pTM"
    elif rank_by == "multimer":
        mdl_description += ", ranked by 80*ipTM+20*pTM"
    else:
        raise ValueError(f"Unknown rank_by {rank_by}")
    if use_msa:
        mdl_description += ", starting from"
        if use_mmseqs:
            msa_type = "MSA"
        else:
            msa_type = "custom MSA"
        if use_multimer:
            if pair_mode == "unpaired_paired":
                mdl_description += f" paired and unpaired {msa_type}s"
            elif pair_mode == "paired":
                mdl_description += f" paired {msa_type}s"
            elif pair_mode == "unpaired":
                mdl_description += f" unpaired {msa_type}s"
            elif pair_mode is None:
                raise ValueError(
                    "Key 'pair_mode' required with 'use_multimer=True'"
                )
            else:
                raise ValueError(f"Unknown pair_mode {raw_config['pair_mode']}")
        elif msa_type.startswith("M"):
            # "an MSA" vs. "a custom MSA"
            mdl_description += f" an {msa_type}"
        else:
            mdl_description += f" a {msa_type}"
        if use_mmseqs:
            mdl_description += f" from MMseqs2 ({'+'.join(seq_dbs)})"
    else:
        mdl_description += " without an MSA"
    mdl_description += "."

    config_data = {
        "cf_params": cf_config,
        "cf_version": cf_version,
        "seq_db_keys": seq_dbs,
        "use_mmseqs": use_mmseqs,
        "use_msa": use_msa,
        "ur30_db_version": ur30_db_version,
        "tpl_db": tpl_db,
        "tpl_db_version": tpl_db_version,
        "use_multimer": use_multimer,
        "multimer_version": multimer_version,
        "description": mdl_description,
    }
    # adds "seq_dbs" to config_data
    get_cf_sequence_dbs(config_data)
    return config_data



def _get_res_num(r, use_auth=False):
    """Return the number of residue ``r``; with ``use_auth`` the number is
    parsed from the ``pdb_auth_resnum`` string property (PDB auth ID)."""
    if not use_auth:
        return r.number.num
    auth_res_num = r.GetStringProp("pdb_auth_resnum")
    return int(auth_res_num)



[docs]
def get_sequence(chn, use_auth=False):
    """Get the sequence of an `OpenStructure`_ chain, inserting ``'-'`` for
    gaps.

    Residue numbering is assumed to start at 1: if the first residue has a
    number ``n > 1``, the sequence starts with ``n - 1`` gap characters.
    Residues must come in strictly increasing residue number order.

    Args:
        chn (ost.mol.ChainHandle or ost.mol.ChainView): `OST`_ chain to extract
            the sequence from. Any object providing the following interface
            can be used as a drop-in replacement for the `OST`_ chain object:

            - ``chn.residues``: sequence of residue objects, each providing
            - ``chn.residues[i].number.num`` (:class:`int`): internal residue
              number
            - ``chn.residues[i].one_letter_code`` (:class:`str`): single-letter
              code
            - ``chn.residues[i].GetStringProp("pdb_auth_resnum")``
              (:class:`str`): author residue number as an integer string, only
              required if ``use_auth=True``
        use_auth (bool): If ``True``, use PDB author residue numbers instead
            of internal residue numbers.

    Returns:
        str: One-letter code sequence with ``'-'`` characters inserted for gaps.
        An empty string is returned for a chain without residues.

    Raises:
        ValueError: If residue numbers are not strictly increasing (e.g.
            duplicated numbers), since gaps cannot be placed in that case.
    """
    residues = chn.residues
    if len(residues) == 0:
        return ""

    # initialise (add gaps if first is not at num. 1)
    last_num = _get_res_num(residues[0], use_auth)
    parts = ["-" * (last_num - 1), residues[0].one_letter_code]

    for res in residues[1:]:
        res_num = _get_res_num(res, use_auth)
        # counting up from the previous number would never reach this one
        if res_num <= last_num:
            raise ValueError(
                f"Residue numbers must be strictly increasing, got "
                f"{res_num} after {last_num}"
            )
        # one gap character per number skipped since the previous residue
        parts.append("-" * (res_num - last_num - 1))
        parts.append(res.one_letter_code)
        last_num = res_num
    return "".join(parts)




[docs]
def assemble_modelcif_software(soft_dict, params_dict):
    """Build a :class:`modelcif.SoftwareWithParameters` instance from plain
    dictionaries.

    Args:
        soft_dict (dict): Software metadata as returned by functions such as
            :func:`get_colabfold_software`. Must contain the keys ``name``,
            ``classification``, ``description``, ``location``, ``type``,
            ``version``, and ``citation``.
        params_dict (dict): Software parameters; every item is turned into a
            :class:`modelcif.SoftwareParameter` with the key as parameter
            name and the value as parameter value.

    Returns:
        modelcif.SoftwareWithParameters: A ModelCIF software object with
        associated parameters.
    """
    # positional arguments of modelcif.Software, in calling order
    positional_keys = (
        "name",
        "classification",
        "description",
        "location",
        "type",
        "version",
    )
    positional_args = [soft_dict[key] for key in positional_keys]
    software = modelcif.Software(
        *positional_args, citation=soft_dict["citation"]
    )
    # one parameter object per dictionary item
    parameters = [
        modelcif.SoftwareParameter(param_name, param_value)
        for param_name, param_value in params_dict.items()
    ]
    return modelcif.SoftwareWithParameters(software, parameters)



def _get_ch_name(ch, use_auth=False):
    """Return the name of chain ``ch``; with ``use_auth`` the name is read
    from the ``pdb_auth_chain_name`` string property (auth. ID as available
    when reading from mmCIF files)."""
    return ch.GetStringProp("pdb_auth_chain_name") if use_auth else ch.name


class _OST2ModelCIF(modelcif.model.AbInitioModel):
    """Map an OST entity to a :class:`modelcif.model.AbInitioModel`.

    All arguments listed below are passed as keyword arguments; anything
    else is handed on to :class:`modelcif.model.AbInitioModel`.

    Args:
        assembly (modelcif.Assembly): Collection of asymmetric units for
            this model.
        asym (dict): Mapping of chain names to :class:`modelcif.AsymUnit`
            objects.
        ost_entity (ost.mol.Entity or ost.mol.EntityHandle): Loaded OST entity
            object.
        name (str, optional): Short name for this model.
        plddt_from_b_factors (bool): If ``True``, read per-residue pLDDT
            from B-factors instead of from ``scores_json``.
        scores_json (dict): Score data to store. Supported keys are
            ``plddt``, ``plddt_global``, ``ptm``, ``iptm``, ``confrank``,
            and ``pae``. All keys are optional. The dictionary is updated in
            place: ``plddt`` is set if read from B-factors and
            ``plddt_global`` is set to the mean of ``plddt`` if it is
            missing and ``plddt`` is available.
        extra_global_scores (list): Additional global QA score objects to
            append to ``qa_metrics``.
        incl_pae (bool): If ``True``, include PAE scores. Defaults to
            ``True`` if ``"pae"`` is present in ``scores_json``.
        use_auth (bool): If ``True``, use PDB author IDs instead of
            internal IDs. Relevant when reading from mmCIF files.
        pae_digits (int): Number of decimal digits for PAE values.
    """

    # not going to fight for reducing a single attribute, allow in Pylint
    # pylint: disable=too-many-instance-attributes

    def __init__(self, *args, **kwargs):
        """Initialise a model.

        Pops the keyword arguments documented on the class, prepares the
        pLDDT scores, checks that local scores match the number of residues
        and passes the remaining arguments to the parent class.

        Raises:
            KeyError: If ``ost_entity`` or ``asym`` is not given.
            AssertionError: If B-factors differ within a residue while
                reading pLDDT from B-factors, if PAE is to be included but
                missing, or if the size of local scores does not match the
                number of residues.
        """
        self.ost_entity = kwargs.pop("ost_entity")
        self.asym = kwargs.pop("asym")
        # optional arguments
        self.plddt_from_b_factors = kwargs.pop("plddt_from_b_factors", False)
        self.scores_json = kwargs.pop("scores_json", {})
        self.extra_global_scores = kwargs.pop("extra_global_scores", [])
        self.incl_pae = kwargs.pop("incl_pae", "pae" in self.scores_json)
        self.use_auth = kwargs.pop("use_auth", False)
        self.pae_digits = kwargs.pop("pae_digits", 3)

        # get pLDDT if needed
        if self.plddt_from_b_factors:
            self.scores_json["plddt"] = []
            for res in self.ost_entity.residues:
                b_factors = [a.b_factor for a in res.atoms]
                # must all be equal!
                assert (
                    len(set(b_factors)) == 1
                ), "B-factors must be identical within a residue"
                self.scores_json["plddt"].append(b_factors[0])
        # global pLDDT can only be derived if there are local values; local
        # pLDDT is optional everywhere else in this class
        if (
            "plddt_global" not in self.scores_json
            and "plddt" in self.scores_json
        ):
            self.scores_json["plddt_global"] = np.mean(
                self.scores_json["plddt"]
            )

        # check lengths for local scores (cannot deal with unmodelled gaps here)
        exp_len = self.ost_entity.residue_count
        if "plddt" in self.scores_json:
            assert (
                len(self.scores_json["plddt"]) == exp_len
            ), "pLDDT needs one value per residue"
        if self.incl_pae:
            assert "pae" in self.scores_json, "PAE requested but not given"
            assert (
                len(self.scores_json["pae"]) == exp_len
            ), "PAE needs one row per residue"
            assert (
                len(self.scores_json["pae"][0]) == exp_len
            ), "PAE needs one column per residue"

        super().__init__(*args, **kwargs)

    def get_atoms(self):
        """Yield atom records for writing to a ModelCIF file via
        :mod:`modelcif`.

        Yields:
            modelcif.model.Atom: One record per atom of the OST entity.
        """
        # ToDo [internal]: Take B-factor out since its not a B-factor?
        # NOTE: this assumes that _get_res_num maps residue to pos. in seqres
        #       within asym
        for atm in self.ost_entity.atoms:
            yield modelcif.model.Atom(
                asym_unit=self.asym[_get_ch_name(atm.chain, self.use_auth)],
                seq_id=_get_res_num(atm.residue, self.use_auth),
                atom_id=atm.name,
                type_symbol=atm.element,
                x=atm.pos[0],
                y=atm.pos[1],
                z=atm.pos[2],
                het=atm.is_hetatom,
                biso=atm.b_factor,
                occupancy=atm.occupancy,
            )

    def add_scores(self):
        """Add QA metrics from AF2 scores.

        Metrics are appended to ``qa_metrics`` in this order: global scores
        found in ``scores_json``, extra global scores, per-residue pLDDT
        (if available) and pairwise PAE (if ``incl_pae`` is set). Local
        scores are indexed by the position of a residue when going through
        all chains of the OST entity in order.
        """
        # global scores
        for key, score_class in [
            ("plddt_global", _GlobalPLDDT),
            ("ptm", _GlobalPTM),
            ("iptm", GlobalIpTM),
            ("confrank", GlobalConfRankMultimer),
        ]:
            if key in self.scores_json:
                self.qa_metrics.append(score_class(self.scores_json[key]))
        # extra ones
        self.qa_metrics.extend(self.extra_global_scores)

        has_plddt = "plddt" in self.scores_json
        if not (has_plddt or self.incl_pae):
            # no local scores to add
            return

        # NOTE: none of the below expected to work if we have unmodelled gaps!

        # Resolve the ModelCIF residue of every OST residue once; going back
        # to chain names and residue numbers for each residue pair is a
        # quadratic amount of repeated work for PAE.
        mdl_residues = []
        for chn in self.ost_entity.chains:
            ch_name = _get_ch_name(chn, self.use_auth)
            for res in chn.residues:
                res_num = _get_res_num(res, self.use_auth)
                mdl_residues.append(self.asym[ch_name].residue(res_num))

        # local pLDDT
        if has_plddt:
            plddt = self.scores_json["plddt"]
            for i, mdl_res in enumerate(mdl_residues):
                self.qa_metrics.append(_LocalPLDDT(mdl_res, plddt[i]))

        # pairwise alignment error
        if self.incl_pae:
            pae = self.scores_json["pae"]
            lpae = []
            for i, mdl_res_i in enumerate(mdl_residues):
                pae_row = pae[i]
                for j, mdl_res_j in enumerate(mdl_residues):
                    lpae.append(
                        _LocalPairwisePAE(
                            mdl_res_i,
                            mdl_res_j,
                            round(pae_row[j], self.pae_digits),
                        )
                    )
            self.qa_metrics.extend(lpae)


def _get_modelcif_entities(target_ents, asym_units, system):
    """Create ModelCIF entities and asymmetric units.

    For every item of ``target_ents`` one :class:`modelcif.Entity` is created
    and appended to ``system.entities``. For each of the item's
    ``pdb_chain_ids`` a :class:`modelcif.AsymUnit` of that entity is stored in
    ``asym_units`` (modified in place), keyed by the chain ID.
    """
    # all entities share a single alphabet object
    aa_alphabet = _LPeptideAlphabetWithXO()
    for ent_data in target_ents:
        # one ModelCIF entity per target sequence
        entity = modelcif.Entity(
            ent_data["seqres"],
            description=ent_data["description"],
            alphabet=aa_alphabet,
            source=ent_data["source"],
            references=ent_data["references"],
        )
        # NOTE: this assigns (potentially new) alphabetic chain names
        for chain_id in ent_data["pdb_chain_ids"]:
            asym_units[chain_id] = modelcif.AsymUnit(
                entity,
                strand_id=chain_id,
                auth_seq_id_map=ent_data["auth_seq_id_map"],
            )
        system.entities.append(entity)


def _get_modelcif_protocol_software(js_step):
    """Assemble software entries for a ModelCIF protocol step."""
    # new setup in python-modelcif (as of late 2023): params with each SW
    sw_list = []
    for sw, sw_params in js_step["software_plus_params"]:
        sw_list.append(assemble_modelcif_software(sw, sw_params))
    # group and done...
    if sw_list:
        return modelcif.SoftwareGroup(sw_list)
    return None


def _get_modelcif_protocol_data(
    data_labels, target_entities, models, ref_dbs, acc_data, dg_cache=None
):
    """Assemble data for a ModelCIF protocol step.
    Cached access to objects needed to remove duplicates in ModelCIF.
    """
    # No idea how to reduce arguments here, allow in Pylint
    # pylint: disable=too-many-positional-arguments,too-many-arguments
    if dg_cache is None:
        dg_cache = {}
    cache_key = tuple(sorted(data_labels))
    if cache_key in dg_cache:
        return dg_cache[cache_key]
    data = modelcif.data.DataGroup()
    for data_label in data_labels:
        if data_label == "target_sequences":
            data.extend(target_entities)
        elif data_label == "ref_dbs":
            data.extend(ref_dbs)
        elif data_label == "models":
            data.extend(models)
        elif data_label in acc_data:
            data.append(acc_data[data_label])
        else:
            raise RuntimeError(f"Unknown protocol data: '{data_label}'")
    dg_cache[cache_key] = data
    return data


def _get_modelcif_protocol(
    protocol_steps, target_entities, models, ref_dbs, acc_data
):
    """Create the protocol for the ModelCIF file.

    Every item of ``protocol_steps`` becomes one
    :class:`modelcif.protocol.Step` built from the item's ``input``,
    ``output``, ``name``, ``details``, ``method_type`` and
    ``software_plus_params`` values.
    """
    dg_cache = {}  # cached DataGroup objects per protocol

    def _data_group(labels):
        # input and output data are resolved the same way, sharing the cache
        return _get_modelcif_protocol_data(
            labels, target_entities, models, ref_dbs, acc_data, dg_cache
        )

    protocol = modelcif.protocol.Protocol()
    for js_step in protocol_steps:
        software = _get_modelcif_protocol_software(js_step)
        step_input = _data_group(js_step["input"])
        step_output = _data_group(js_step["output"])
        step = modelcif.protocol.Step(
            input_data=step_input,
            output_data=step_output,
            name=js_step["name"],
            details=js_step["details"],
            software=software,
        )
        protocol.steps.append(step)
        # method type is set as attribute on the step just added
        step.method_type = js_step["method_type"]
    return protocol


def _get_assoc_pae_file(entry_id, mdl_name):
    """Generate an associated file object to extract PAE to an extra file."""
    # text stored as entry details of the extra file
    entry_details = (
        "This file is an associated file consisting "
        "of local pairwise QA metrics. This is a partial mmCIF "
        "file and can be validated by merging with the main "
        "mmCIF file containing the model coordinates and other "
        "associated data."
    )
    return modelcif.associated.QAMetricsFile(
        f"{mdl_name}_local_pairwise_qa.cif",
        categories=["_ma_qa_metric_local_pairwise"],
        copy_categories=["_ma_qa_metric"],
        entry_id=entry_id,
        entry_details=entry_details,
        details="Predicted aligned error",
    )


def _get_associated_file(
    fle_path, data, file_format="other", file_content="other"
):
    """Generate a modelcif.associated.File object for given data.

    The name of ``data`` is used as details of the file; ``file_format`` and
    ``file_content`` are set as attributes on the created object.
    """
    assoc_file = modelcif.associated.File(
        fle_path, details=data.name, data=data
    )
    # format and content type are assigned after construction
    assoc_file.file_format = file_format
    assoc_file.file_content = file_content
    return assoc_file


def _get_associated_files(mdl_name, arc_files):
    """Create entry for associated files.

    All files in ``arc_files`` are packaged into a single zip file which is
    the only item of the returned :class:`modelcif.associated.Repository`.
    """
    # NOTE: by convention MA expects zip file with same name as model-cif
    zip_archive = modelcif.associated.ZipFile(
        f"{mdl_name}.zip", files=arc_files
    )
    return modelcif.associated.Repository("", [zip_archive])


def _get_sw_for_qe(steps, step_name):
    """Fetch suitable SW objects from protocol steps to use in QE."""
    # to maximally reduce duplicates we reuse single groups
    # otherwise new group created using same SoftwareWithParameters objects
    sw_groups = [step.software for step in steps if step.name == step_name]
    if len(sw_groups) == 0:
        return None
    if len(sw_groups) == 1:
        return sw_groups[0]
    # each sw_group is a list (SoftwareGroup) of SoftwareWithParameters
    # ...and we remove duplicates...just in case
    sw_dict = {}
    for sw_group in sw_groups:
        sw_dict.update({hash(swp): swp for swp in sw_group})
    return modelcif.SoftwareGroup(sw_dict.values())


def _package_associated_files(repo):
    """Compress associated files into single zip file and delete original."""
    # zip settings tested for good speed vs compression
    for archive in repo.files:
        with zipfile.ZipFile(archive.path, "w", zipfile.ZIP_BZIP2) as cif_zip:
            for zfile in archive.files:
                cif_zip.write(zfile.path, arcname=zfile.path)
                os.remove(zfile.path)


def _compress_cif_file(cif_file):
    """Compress cif file and delete original."""
    with open(cif_file, "rb") as f_in:
        with gzip.open(cif_file + ".gz", "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)
    os.remove(cif_file)



[docs]
def store_as_modelcif(mdl_data, out_dir, mdl_fle_stem, compress):
    """Assemble model data into a ModelCIF file and write it to disk.

    Creates a :class:`modelcif.System` from the provided data, attaches
    entities, models, QA scores, associated files, and a modelling protocol,
    then writes the result as a ModelCIF file. Optionally compresses the output
    and packages associated files into a ZIP archive.

    Side effects: the working directory of the process is temporarily changed
    to ``out_dir`` while writing (restored afterwards, also on errors), and if
    ``af2_protocol_name`` is given, the ``software`` class attribute of the
    AlphaFold 2 QA metric classes is set.

    Args:
        mdl_data (dict): Dictionary with model data. Expected keys:

            - ``title`` (str): Title of the modelling system.
            - ``mdl_id`` (str): Model identifier; converted to upper case.
            - ``model_details`` (str): Free-text description of the model.
            - ``audit_authors`` (list[str]): Author names for the audit record.
            - ``ranked_mdls`` (list[dict]): Per-model data; each dictionary is
              passed as keyword arguments to the model class.
            - ``target_entities`` (list[dict]): Target sequence data used to
              build asymmetric units and entities.
            - ``config_data`` (dict): Configuration data; must contain the
              key ``seq_dbs`` with reference database entries for the protocol.
            - ``protocol`` (list[dict]): Modelling protocol steps passed to
              :func:`_get_modelcif_protocol`. Each step is read for the keys
              ``input``, ``output``, ``name``, ``details``, ``method_type``,
              and ``software_plus_params``.
            - ``acc_files`` (dict, optional): Mapping of labels to associated
              file descriptors, each containing ``details``,
              ``destination_file_name``, ``source_file_path``,
              ``file_format``, and ``file_content``. Each file is copied from
              ``source_file_path`` into ``out_dir``.
            - ``af2_protocol_name`` (str, optional): If present, used to
              assign software metadata to AlphaFold 2 QA metric classes.

        out_dir (:class:`str` | :class:`~pathlib.Path`): Directory to write the
            output file(s) to.
        mdl_fle_stem (str): Base name for the output file, without extension.
            Also used as base name of the ZIP archive with associated files.
        compress (bool): If ``True``, the mmCIF file is gzip-compressed after
            writing.

    Returns:
        str: File name of the written mmCIF file, relative to ``out_dir``.
        Ends with ``.cif`` or ``.cif.gz`` depending on ``compress``.
    """
    # allow more variables in Pylint, not gonna fix that atm
    # pylint: disable=too-many-locals
    logger.info("generating ModelCIF objects...")
    pstart = timer()
    # create system to gather all the data
    system = modelcif.System(
        title=mdl_data["title"],
        id=mdl_data["mdl_id"].upper(),
        model_details=mdl_data["model_details"],
    )

    # create an asymmetric unit and an entity per target sequence
    # (asym_units is filled in place, keyed by chain ID)
    asym_units = {}
    _get_modelcif_entities(mdl_data["target_entities"], asym_units, system)

    # audit_authors
    system.authors.extend(mdl_data["audit_authors"])

    # set up the models to produce coordinates
    has_pae = False  # True as soon as one model includes PAE
    qa_time = 0  # accumulated time spent in add_scores (for logging only)
    model_group = modelcif.model.ModelGroup()
    for ranked_mdl in mdl_data["ranked_mdls"]:
        model = _OST2ModelCIF(
            assembly=modelcif.Assembly(asym_units.values()),
            asym=asym_units,
            **ranked_mdl,
        )
        if model.incl_pae:
            has_pae = True
        qa_start = timer()
        model.add_scores()
        qa_time += timer() - qa_start
        model_group.append(model)

    system.model_groups.append(model_group)

    # handle additional files
    arc_files = []  # file objects to go into the zip archive
    acc_data = {}  # data objects by label, referenced by protocol steps
    if has_pae:
        arc_files.append(_get_assoc_pae_file(system.id, mdl_fle_stem))
    for af_label, af_dict in mdl_data.get("acc_files", {}).items():
        # needs data (for protocol) and file (for associated files) object
        acc_data[af_label] = modelcif.data.Data(af_dict["details"])
        arc_files.append(
            _get_associated_file(
                af_dict["destination_file_name"],
                acc_data[af_label],
                af_dict["file_format"],
                af_dict["file_content"],
            )
        )
        # need to copy file for zip to work later
        shutil.copyfile(
            af_dict["source_file_path"],
            os.path.join(out_dir, af_dict["destination_file_name"]),
        )
    if arc_files:
        system.repositories.append(
            _get_associated_files(mdl_fle_stem, arc_files)
        )

    # get data and steps
    ref_dbs = mdl_data["config_data"]["seq_dbs"]
    protocol = _get_modelcif_protocol(
        mdl_data["protocol"],
        system.entities,
        model_group,
        ref_dbs,
        acc_data,
    )
    system.protocols.append(protocol)

    # set SW for QE
    # NOTE(review): this is assigned on the classes, not on instances, so the
    # value stays set after this function returns - confirm this is intended
    if "af2_protocol_name" in mdl_data:
        sw4qe = _get_sw_for_qe(protocol.steps, mdl_data["af2_protocol_name"])
        for af2_class in [
            _GlobalPTM,
            GlobalIpTM,
            GlobalConfRankMultimer,
            _GlobalPLDDT,
            _LocalPLDDT,
            _LocalPairwisePAE,
        ]:
            af2_class.software = sw4qe

    logger.info("  (%.2fs; QA: %.2fs)", timer() - pstart, qa_time)

    # write modelcif System to file
    logger.info("write to disk...")
    pstart = timer()
    # NOTE: this will dump PAE on path provided in add_scores
    # -> hence we cheat by changing path and back while being exception-safe...
    oldpwd = Path.cwd()
    os.chdir(out_dir)
    mdl_fle = f"{mdl_fle_stem}.cif"
    try:
        with open(mdl_fle, "w", encoding="ascii") as mmcif_fh:
            modelcif.dumper.write(mmcif_fh, [system])
        # associated files are zipped (and the originals removed) after the
        # main file has been written
        if arc_files:
            _package_associated_files(system.repositories[0])
        if compress:
            _compress_cif_file(mdl_fle)
            mdl_fle += ".gz"
    finally:
        # always return to the original working directory
        os.chdir(oldpwd)
    logger.info("  (%.2fs)", timer() - pstart)
    return mdl_fle



global_ref_dbs = {}


def _get_ref_db_object(name, url, version=None, release_date=None):
    """Cached access to modelcif.ReferenceDatabase objects.
    Needed to remove duplicates in ModelCIF.
    """
    key = (name, url, version, release_date)
    if key not in global_ref_dbs:
        global_ref_dbs[key] = modelcif.ReferenceDatabase(
            name, url, version, release_date
        )
    return global_ref_dbs[key]



[docs]
def get_af2_sequence_dbs(config_data):
    """Get AF2 sequence databases and store them in ``config_data``.

    Assembles the :class:`modelcif.ReferenceDatabase` objects matching the
    given AlphaFold 2 configuration and stores them as list in
    ``config_data["seq_dbs"]``. Which databases are listed depends on the
    database preset, the AF2 version, and on the use of multimer mode and
    templates.

    Args:
        config_data (dict): AF2 configuration data, as returned by
            :func:`get_af2_config`. Relevant keys:

            - ``af_version`` (str): AlphaFold 2 version string; selects the
              MGnify and UniRef30/Uniclust30 variants.
            - ``use_small_bfd`` (bool): If ``True``, Reduced BFD is listed
              instead of full BFD.
            - ``use_multimer`` (bool): If ``True``, TrEMBL and Swiss-Prot are
              listed and PDB seqres is the template sequence database.
            - ``use_templates`` (bool): If ``True``, a PDB sequence database
              is listed (PDB seqres for multimer, PDB70 for monomer).
            - ``up_version`` (str or None): UniProt release version, used as
              ``version`` of the UniRef90, TrEMBL, and Swiss-Prot objects.
            - ``up_rel_date`` (datetime.date or None): UniProt release date,
              used as ``release_date`` of the UniRef90, TrEMBL, and
              Swiss-Prot objects.
            - ``pdb_rel_date`` (datetime.date or None): PDB release date,
              used as ``release_date`` of the PDB seqres object.

    Returns:
        None: Results are written to ``config_data["seq_dbs"]`` as a list of
        :class:`modelcif.ReferenceDatabase` objects.
    """
    # UniProt release info shared by UniRef90, TrEMBL and Swiss-Prot
    up_release = {
        "version": config_data["up_version"],
        "release_date": config_data["up_rel_date"],
    }
    # common URL prefixes
    gcs_root = "https://storage.googleapis.com/alphafold-databases/"
    uniprot_kb = (
        "ftp://ftp.ebi.ac.uk/pub/databases/uniprot/current_release/"
        "knowledgebase/complete/"
    )

    # BFD: reduced or full, depending on the DB preset
    if config_data["use_small_bfd"]:
        seq_dbs = [
            _get_ref_db_object(
                "Reduced BFD",
                f"{gcs_root}reduced_dbs/"
                "bfd-first_non_consensus_sequences.fasta.gz",
            )
        ]
    else:
        seq_dbs = [
            _get_ref_db_object(
                "BFD",
                f"{gcs_root}casp14_versions/"
                "bfd_metaclust_clu_complete_id30_c90_final_seq.sorted_opt.tar"
                ".gz",
                version="6a634dc6eb105c2e9b4cba7bbae93412",
            )
        ]

    # MGnify & UniRef30/Uniclust30: different releases before AF2 v2.3.0
    # NOTE: version strings are compared lexicographically here
    if config_data["af_version"] < "2.3.0":
        seq_dbs += [
            _get_ref_db_object(
                "MGnify",
                f"{gcs_root}casp14_versions/mgy_clusters_2018_12.fa.gz",
                version="2018_12",
                release_date=datetime.date(2018, 12, 6),
            ),
            _get_ref_db_object(
                "Uniclust30",
                f"{gcs_root}casp14_versions/uniclust30_2018_08_hhsuite.tar.gz",
                version="2018_08",
                release_date=None,
            ),
        ]
    else:
        # Don't add a linebreak to URLs, allow long line in Pylint
        # pylint: disable=line-too-long
        # NOTE: release date according to https://ftp.ebi.ac.uk/pub/databases/metagenomics/peptide_database/2022_05/
        # pylint: enable=line-too-long
        seq_dbs += [
            _get_ref_db_object(
                "MGnify",
                f"{gcs_root}v2.3/mgy_clusters_2022_05.fa.gz",
                version="2022_05",
                release_date=datetime.date(2022, 5, 6),
            ),
            _get_ref_db_object(
                "UniRef30",
                f"{gcs_root}v2.3/UniRef30_2021_03.tar.gz",
                version="2021_03",
                release_date=None,
            ),
        ]

    # UniProt knowledgebase only searched in multimer mode
    if config_data["use_multimer"]:
        seq_dbs += [
            _get_ref_db_object(
                "TrEMBL",
                f"{uniprot_kb}uniprot_trembl.fasta.gz",
                **up_release,
            ),
            _get_ref_db_object(
                "Swiss-Prot",
                f"{uniprot_kb}uniprot_sprot.fasta.gz",
                **up_release,
            ),
        ]

    # UniRef90 is always included
    seq_dbs.append(
        _get_ref_db_object(
            "UniRef90",
            "ftp://ftp.uniprot.org/pub/databases/uniprot/uniref/uniref90/"
            "uniref90.fasta.gz",
            **up_release,
        )
    )

    # sequence database for template search
    if config_data["use_templates"]:
        if config_data["use_multimer"]:
            # uses whatever is latest set of PDB sequences
            # see AF2 scripts/download_pdb_seqres.sh
            tpl_seq_db = _get_ref_db_object(
                "PDB seqres",
                "https://files.wwpdb.org/pub/pdb/derived_data/"
                "pdb_seqres.txt",
                release_date=config_data["pdb_rel_date"],
            )
        else:
            # fixed version used in AF2 scripts/download_pdb70.sh
            tpl_seq_db = _get_ref_db_object(
                "PDB70",
                "http://wwwuser.gwdg.de/~compbiol/data/hhsuite/databases/"
                "hhsuite_dbs/old-releases/pdb70_from_mmcif_200401.tar.gz",
                release_date=datetime.date(2020, 4, 1),
            )
        seq_dbs.append(tpl_seq_db)

    config_data["seq_dbs"] = seq_dbs




[docs]
def get_af2_config(
    af_version,
    af_params=None,
    custom_ranking=None,
    up_version=None,
    up_rel_date=None,
    pdb_rel_date=None,
):
    """Get configuration data for an AlphaFold 2 modelling run.

    Works out the modelling settings from the AlphaFold 2 version and the
    given parameters, composes a human-readable description of the run, and
    returns everything as a configuration dictionary for downstream
    functions.

    Args:
        af_version (str): AlphaFold 2 version string (e.g. ``"2.3.2"``).
        af_params (dict, optional): Non-default AlphaFold 2 parameters.
            Recognised keys include ``model_preset``, ``db_preset``,
            ``num_multimer_predictions_per_model``, ``models_to_relax``,
            ``run_relax``, ``max_template_date``, and ``num_ensemble``.
            An empty dict is used if not provided.
        custom_ranking (str, optional): Custom model ranking expression.
            Without it, ``"pLDDT"`` is used for monomer runs and
            ``"ipTM*0.8+pTM*0.2"`` for multimer runs.
        up_version (str, optional): UniProt release in ``"YYYY_MM"`` format
            current at the time of AF2 installation. (see
            https://www.uniprot.org/release-notes)
        up_rel_date (datetime.date, optional): Release date corresponding to
            ``up_version``.
        pdb_rel_date (datetime.date, optional): PDB release date current at
            the time of AF2 installation. Relevant for multimer runs using
            templates.

    Returns:
        dict: Configuration data for downstream functions. Keys:

            - ``af_params`` (dict): Parameters as passed (or empty dict).
            - ``af_version`` (str): AlphaFold 2 version string as passed.
            - ``description`` (str): Human-readable run description.
            - ``use_templates`` (bool): Whether templates were used.
            - ``use_small_bfd`` (bool): Whether the reduced BFD database
              setting was used.
            - ``use_multimer`` (bool): Whether multimer mode was used.
            - ``up_version`` (str or None): UniProt release as passed.
            - ``up_rel_date`` (datetime.date or None): UniProt release date
              as passed.
            - ``pdb_rel_date`` (datetime.date or None): PDB release date
              as passed.
            - ``seq_dbs`` (list[modelcif.ReferenceDatabase]): Sequence DB
              objects.
    """
    # Disable some Pylint warnings, not going to be fixed atm
    # pylint: disable=too-many-arguments,too-many-positional-arguments
    # pylint: disable=too-many-locals,too-many-branches,too-many-statements
    if af_params is None:
        af_params = {}
    # presets with AF2 defaults
    model_preset = af_params.get("model_preset", "monomer")
    db_preset = af_params.get("db_preset", "full_dbs")
    use_multimer = model_preset == "multimer"

    # 5 models per seed; more than one seed only for multimer
    if not use_multimer:
        num_seeds = 1
    elif "num_multimer_predictions_per_model" in af_params:
        num_seeds = af_params["num_multimer_predictions_per_model"]
    elif af_version >= "2.2.0":
        num_seeds = 5
    else:
        num_seeds = 1
    num_models = num_seeds * 5

    # default relax setting changed over time; translate to latest
    if "models_to_relax" in af_params:
        assert "run_relax" not in af_params
        models_to_relax = af_params["models_to_relax"]
    elif "run_relax" in af_params:
        models_to_relax = "all" if af_params["run_relax"] else "none"
    elif af_version < "2.3.2":
        models_to_relax = "all"
    else:
        models_to_relax = "best"

    # templates turned off if max_template_date older than oldest PDB
    has_tpl_cutoff = "max_template_date" in af_params
    use_templates = (
        not has_tpl_cutoff or af_params["max_template_date"] >= "1976-05-19"
    )

    # collect the pieces of the description text, joined at the end
    text = [
        f"Model generated using AlphaFold (v{af_version}) "
        f"producing {num_models} {model_preset} models "
    ]
    if num_seeds > 1:
        text.append(f"({num_seeds} random seeds per parameter set) ")
    if use_multimer and af_version >= "2.3.0":
        text.append("with up to 20 recycles ")
    else:
        text.append("with 3 recycles ")
    if "num_ensemble" in af_params:
        text.append(f"and {af_params['num_ensemble']} ensemble ")
    text.append("each, ")
    # relaxation
    if models_to_relax == "all":
        text.append("with AMBER relaxation, ")
    elif models_to_relax == "best":
        text.append("with AMBER relaxation on best model, ")
    else:
        text.append("without model relaxation, ")
    # templates
    if not use_templates:
        text.append("without templates, ")
    elif has_tpl_cutoff:
        max_tpl_date = af_params["max_template_date"]
        text.append(f"using templates up to date {max_tpl_date}, ")
    else:
        text.append("using templates, ")
    # ranking and MSA
    rank_str = custom_ranking or (
        "ipTM*0.8+pTM*0.2" if use_multimer else "pLDDT"
    )
    msa_str = "MSAs" if use_multimer else "an MSA"
    text.append(
        f"ranked by {rank_str}, starting from {msa_str} with "
        f"{db_preset} setting."
    )

    config_data = {
        "af_params": af_params,
        "af_version": af_version,
        "description": "".join(text),
        "use_templates": use_templates,
        "use_small_bfd": db_preset == "reduced_dbs",
        "use_multimer": use_multimer,
        "up_version": up_version,
        "up_rel_date": up_rel_date,
        "pdb_rel_date": pdb_rel_date,
    }
    # adds the "seq_dbs" entry
    get_af2_sequence_dbs(config_data)
    return config_data




[docs]
def get_galaxy_software(version):
    """Get Galaxy as a software dictionary for a ModelCIF file.

    Builds a dictionary suitable for creating a :class:`modelcif.Software`
    object, with citation and download URL derived from the provided version
    string.

    Args:
        version (str): Galaxy AlphaFold 2 version string in the format
            ``[AF2v]+galaxy[X]``, e.g. ``"2.3.2+galaxy1"``. Both the AF2
            version and the Galaxy suffix are used for the download URL; if
            no AF2 version precedes ``galaxy``, ``2.3.2`` is assumed.

    Returns:
        dict: Software descriptor with keys ``name``, ``classification``,
            ``description``, ``citation``, ``location``, ``type``, and
            ``version``.
    """
    # split at the last "galaxy"; without "galaxy" in version, everything
    # ends up in the suffix
    af2_part, _, galaxy_suffix = version.rpartition("galaxy")
    # the tool URL ends with the full tool version, so use the AF2 version
    # given instead of a fixed one (2.3.2 only as fallback)
    af2_version = af2_part.rstrip("+") or "2.3.2"
    return {
        "name": "Galaxy AlphaFold 2",
        "classification": "model building",
        "description": "Structure prediction",
        "citation": ihm.Citation(
            pmid="38769056",
            title="The Galaxy platform for accessible, reproducible, and "
            + "collaborative data analyses: 2024 update.",
            journal="Nucleic Acids Res",
            volume=52,
            page_range=["W83", "W94"],
            year=2024,
            authors=["Galaxy Community"],
            doi="10.1093/nar/gkae410",
        ),
        "location": (
            "https://usegalaxy.eu/root?tool_id=toolshed.g2.bx.psu.edu/repos/"
            "galaxy-australia/alphafold2/alphafold/"
            f"{af2_version}+galaxy{galaxy_suffix}"
        ),
        "type": "package",
        "version": version,
    }




[docs]
def get_cf_db_versions(dt, num_days_unk=1):
    """Get ColabFold database versions for a given date.

    Looks up which UniRef30 release, template database, and template database
    version the ColabFold MSA server used on a given date. Based on
    https://github.com/sokrypton/ColabFold/wiki/MSA-Server-Database-History.

    Args:
        dt (datetime.date): Date for which to look up the database versions.
        num_days_unk (int): Number of days around a database switch date
            within which the result is considered unknown. Defaults to 1.

    Returns:
        tuple: A 3-tuple of ``(ur30_db_version, tpl_db, tpl_db_version)``,
            each a :class:`str`. Values are set to ``"UNK"`` if ``dt`` falls
            within ``num_days_unk`` days of a switch date, if the template
            database version is unknown, or if no matching date range is
            found.
    """
    unknown = ("UNK", "UNK", "UNK")
    # switch dates, newest first, each with the databases in use from then
    # on as (ur30_db_version, tpl_db, tpl_db_version)
    history = (
        (datetime.date(2023, 7, 31), ("2023_02", "PDB100", "230517")),
        (datetime.date(2023, 7, 27), ("2022_02", "PDB70", "220313")),
        (datetime.date(2023, 6, 12), ("2023_02", "PDB100", "230517")),
        (datetime.date(2022, 7, 13), ("2022_02", "PDB70", "220313")),
        (datetime.date(2021, 1, 1), ("2021_03", "PDB70", "UNK")),
    )
    for switched_on, db_versions in history:
        days_since = (dt - switched_on).days
        # too close to the switch (before or after) to be sure
        if -num_days_unk <= days_since <= num_days_unk:
            return unknown
        # newest switch that happened clearly before dt
        if days_since > num_days_unk:
            return db_versions

    # dt is older than the oldest known switch
    return unknown



#  LocalWords:  LocalColabFold Args str bool ColabFold config func msa num ur
#  LocalWords:  localcolabfold UniRef tpl ValueError