# Source code for modelarchive.modelcif.fix_af3

"""ModelCIF files generated by AlphaFold 3 deviate from the official ModelCIF
definition dictionary in specific cases. In particular, for homomeric
assemblies, each molecular entity copy is written as a separate entity in the
CIF document, instead of defining a single entity referenced multiple times.
This module provides functionality to correct the deviations.
"""

# Copyright (c) 2026, SIB - Swiss Institute of Bioinformatics and
#                     Biozentrum - University of Basel
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from gemmi import cif

from . import access
from . import edit


def _is_null(value):
    """Tell whether ``value`` is a CIF null marker, i.e. ``?`` or ``.``.

    Borrowed from gemmi.
    """
    # ToDo: This may become a public function in the future.
    if len(value) != 1:
        return False
    only_char = value[0]
    return only_char == "?" or only_char == "."


# Character classes of the 128 ASCII code points, borrowed from gemmi.
# Class 1 marks characters that _quote() accepts in an unquoted value, class 2
# is used for tab, line feed, carriage return and blank, class 0 for the rest.
# fmt: off
_CHAR_CLASSES = (
    0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 2, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    2, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0,
)
# fmt: on


def _char_table(c):
    """Return the character class (0, 1 or 2) of the single character ``c``.

    Borrowed from gemmi. ASCII characters are looked up in ``_CHAR_CLASSES``.
    Every non-ASCII character is reported as class 0, so values containing
    such characters always get quoted by ``_quote()``.

    The lookup must not wrap code points around (``ord(c) % 256``): that would
    alias e.g. U+0141 onto ``A`` (class 1) and U+0120 onto a blank (class 2).
    """
    code = ord(c)
    if code >= len(_CHAR_CLASSES):
        return 0
    return _CHAR_CLASSES[code]


def _quote(v):
    """Quote ``v`` for use as a CIF value, if quoting is needed at all.

    Borrowed from gemmi, prefer double quotes above single quotes for AF.
    A value is returned untouched if it is non-empty, is not a null marker
    and consists of class 1 characters only (see ``_char_table()``).
    Otherwise it is wrapped in double quotes, in single quotes if it already
    contains a double quote, or in a ``;`` delimited text field if it
    contains a line break or both kinds of quotes.
    """
    needs_quoting = (
        len(v) == 0
        or _is_null(v)
        or any(_char_table(char) != 1 for char in v)
    )
    if not needs_quoting:
        return v

    if "\n" in v:
        delimiter = ";"
    elif '"' not in v:
        delimiter = '"'
    elif "'" not in v:
        delimiter = "'"
    else:
        delimiter = ";"

    if delimiter == ";":
        # text field: the closing ';' has to start a line of its own
        return f";{v}\n;"
    return f"{delimiter}{v}{delimiter}"


def fix_model_name(block, mdl_rank):
    """Normalise _ma_model_list.model_name for given rank.

    AlphaFold 3 sets _ma_model_list.model_name to "Top ranked model" for all
    models, regardless of their rank. This function rewrites the value such
    that only ``mdl_rank == 1`` is labelled "Top ranked model". All other ranks
    are renamed to "#<``mdl_rank``> ranked model".

    Examples:
        >>> from gemmi import cif
        >>> from modelarchive.modelcif import fix_af3
        >>> # get sample CIF data
        >>> cif_data = '''data_test
        ... _ma_model_list.data_id 1
        ... _ma_model_list.model_name "Top ranked model"
        ... _ma_model_list.model_type "Ab initio model"
        ... _ma_model_list.ordinal_id 1
        ... '''
        >>> block = cif.read_string(cif_data).sole_block()
        >>> fix_af3.fix_model_name(block, 2)
        >>> print(block.as_string())
        data_test
        _ma_model_list.data_id 1
        _ma_model_list.model_name "#2 ranked model"
        _ma_model_list.model_type "Ab initio model"
        _ma_model_list.ordinal_id 1
        <BLANKLINE>
        >>> fix_af3.fix_model_name(block, 1)
        >>> print(block.as_string())
        data_test
        _ma_model_list.data_id 1
        _ma_model_list.model_name "Top ranked model"
        _ma_model_list.model_type "Ab initio model"
        _ma_model_list.ordinal_id 1
        <BLANKLINE>

    Args:
        block (|gemmicifBlock|): CIF block to operate on.
        mdl_rank (int): Rank of the AlphaFold 3 model. If ``mdl_rank == 1``,
            the name is set to "Top ranked model".

    Returns:
        None

    Raises:
        RuntimeError: If the ``_ma_model_list`` category contains more than
            one row.
        edit.NotFoundItemError: If _ma_model_list.model_name can not be found
            in ``block``.
    """
    mdl_name = (
        "Top ranked model" if mdl_rank == 1 else f"#{mdl_rank} ranked model"
    )

    table = access.get_table(block, "_ma_model_list", items=["model_name"])
    if not table:
        raise edit.NotFoundItemError(
            msg="File is missing _ma_model_list.model_name, single model "
            "required"
        )
    if len(table) != 1:
        raise RuntimeError("File must have a single model in _ma_model_list.")

    # table cells hold raw CIF text, the name contains blanks: quote it
    only_row = table[0]
    only_row["model_name"] = _quote(mdl_name)
def _get_ordinal_ids(cur_ids, num_ids_needed):
    """Provide numeric IDs (as strings) that are not in use yet.

    - cur_ids: IDs already taken, as strings (hopefully something like 1, 2,
               3, ...)
    - num_ids_needed: number of IDs to provide (lowest numerals not in
                      cur_ids)
    """
    # Among the first len(cur_ids) + num_ids_needed numerals, at most
    # len(cur_ids) can be taken, so enough free ones are always found.
    highest_candidate = len(cur_ids) + num_ids_needed
    free_ids = []
    for number in range(1, highest_candidate + 1):
        candidate = str(number)
        if candidate not in cur_ids:
            free_ids.append(candidate)
    return free_ids[:num_ids_needed]
class NotIdentifiedRecordError(RuntimeError):
    """Common base class for records that can not be identified in a table.

    Do not raise this exception directly. It only exists so that the other
    "NotIdentified" exceptions have a shared parent to inherit from.

    Args:
        msg (str): Exception message.
    """

    def __init__(self, msg):
        super().__init__(msg)
class NotIdentifiedDuplicatedRecordError(NotIdentifiedRecordError):
    """Exception if a duplicated record is found in a table.

    The category is kept in the ``category`` attribute.

    Args:
        category (str): Category affected by the duplication.
        record_id (str): Identifier for the duplicated record. Not bound to a
            specific item on purpose.
    """

    def __init__(self, category, record_id):
        self.category = category
        super().__init__(
            f"Duplicated records found in category '{category}' for "
            f"'{record_id}'"
        )
class NotIdentifiedSingleRecordError(NotIdentifiedRecordError):
    """Exception if a specific record can not be identified in a table.

    Category and item are kept in the ``category`` and ``item`` attributes.

    Args:
        category (str): Affected category.
        item (str, optional): Missing item, extends the exception message.
        value (str, optional): Value, in case a record is found but with
            mismatching value. Extends the exception message, but only if
            ``item`` is given, too.
    """

    def __init__(self, category, item=None, value=None):
        self.category = category
        self.item = item
        if item is None:
            detail = ""
        elif value is None:
            detail = f", missing item '{item}'"
        else:
            detail = f", mismatch at item '{item}={value}'"
        super().__init__(
            f"Could not identify record in category '{category}'{detail}."
        )
class NotIdentifiedContextRecordError(NotIdentifiedRecordError):
    """Exception if a record for a specific context can not be identified.

    Category and item are kept in the ``category`` and ``item`` attributes.

    Args:
        category (str): Affected category.
        item (str, optional): Affected item.
        context (str, optional): Context, appended to the message as is (no
            separator is added).
    """

    def __init__(self, category, item=None, context=None):
        self.category = category
        self.item = item
        parts = [f"Could not identify record in category '{category}'"]
        if item is not None:
            parts.append(f", item '{item}'")
        if context is not None:
            parts.append(context)
        parts.append(".")
        super().__init__("".join(parts))
def _fix_citation_fallback(block, primary_citation_id):
    """Fix citation for AF3 if usual approach failed.

    Used when the AF3 citation can not be found by its PubMed ID. If there is
    no _citation record at all, the complete AF3 citation is written as a new
    record. Otherwise the first record is picked whose values either match
    the expected AF3 citation or are empty ('?', '.'), and its items are
    overwritten with the expected values.

    - block: CIF block to operate on
    - primary_citation_id: ID to use for the AF3 citation if the record found
                           has no usable ID (missing, '?', '.' or "primary")

    Returns old_af3_sw_cit_id, new_af3_sw_cit_id. The old ID is None if a new
    record had to be created.

    Raises NotIdentifiedSingleRecordError if an expected item is missing from
    the _citation table, or if no record is compatible with the AF3 citation.
    """
    exp_cit = {
        "country": "UK",
        "journal_full": "Nature",
        "journal_id_ASTM": "NATUAS",
        "journal_id_CSD": "0006",
        "journal_id_ISSN": "0028-0836",
        "journal_volume": "630",
        "page_first": "493",
        "page_last": "500",
        "pdbx_database_id_DOI": "10.1038/s41586-024-07487-w",
        "pdbx_database_id_PubMed": "38718835",
        "title": "Accurate structure prediction of biomolecular interactions "
        + "with AlphaFold 3",
        "year": "2024",
    }
    cat = "_citation."
    cit_table = access.get_table(block, cat)
    num_rows = len(cit_table) if cit_table else 0

    # no citation at all: add the AF3 one as a new record and be done
    if num_rows == 0:
        new_af3_sw_cit_id = primary_citation_id
        block.set_pairs(cat, {"id": new_af3_sw_cit_id, **exp_cit})
        return None, new_af3_sw_cit_id

    # check that all items exist
    for itm in exp_cit:
        if f"{cat}{itm}" not in cit_table.tags:
            raise NotIdentifiedSingleRecordError(cat[:-1], item=itm)

    # search for a record with all matching values or '?'
    match_idx = None
    mismatch = None
    for i in range(num_rows):
        mismatch = None
        for key, val in exp_cit.items():
            # Checking for empty string "" is because gemmi as_string()
            # translates "?" and "." to "".
            if cif.as_string(cit_table[i][f"{cat}{key}"]) not in ["", val]:
                mismatch = (key, val)
                break
        if mismatch is None:
            match_idx = i
            break
    if match_idx is None:
        # 'num_rows' is greater than 0 here, so 'mismatch' holds the first
        # deviating item of the last record inspected.
        raise NotIdentifiedSingleRecordError(
            cat[:-1], item=mismatch[0], value=mismatch[1]
        )

    old_af3_sw_cit_id = cit_table[match_idx]["id"]
    if old_af3_sw_cit_id in ["?", ".", "primary"]:
        new_af3_sw_cit_id = primary_citation_id
    else:
        new_af3_sw_cit_id = old_af3_sw_cit_id

    # fix record
    cit_table[match_idx]["id"] = new_af3_sw_cit_id
    for key, val in exp_cit.items():
        # Table cells hold raw CIF text. Values with blanks (the title) need
        # quoting, otherwise the CIF written from this block is broken.
        cit_table[match_idx][key] = _quote(val)

    return old_af3_sw_cit_id, new_af3_sw_cit_id


def _is_af3_sw_name(sw_name):
    """Check if given _software.name is for AF3."""
    # NOTE(review): compares the value as handed in; a name stored with
    # surrounding quotes would not be recognised - confirm against callers.
    return sw_name.lower().startswith("alphafold")


class _AF3ItemSetter:
    """Class as a callback for adding columns.

    Sets a fixed value in an AF3 row. Fails if there are multiple AF3 rows.
    Whether an AF3 row was seen at all can be checked via ``sw_found`` once
    all rows have been processed.
    """

    # This is a tiny helper-class with the purpose of storing a state, only
    # for local use, disable Pylint warning
    # pylint: disable=too-few-public-methods
    def __init__(self, value, item="citation_id"):
        self.item = item
        self.sw_found = False
        self.value = value

    def __call__(self, row, same=False):
        """Return the value to store in column ``item`` of ``row``.

        For the AF3 row this is the fixed value. For any other row it is the
        current value of the row if ``same`` is set, "?" otherwise.
        """
        if _is_af3_sw_name(row["name"]):
            if self.sw_found:
                raise NotIdentifiedDuplicatedRecordError(
                    "_software", "AlphaFold"
                )
            self.sw_found = True
            return self.value
        if same:
            return row[self.item]
        return "?"
def _fix_software(block, new_af3_sw_cit_id):
    """Update _software with _citation record ID.

    Stores ``new_af3_sw_cit_id`` in _software.citation_id of the AlphaFold
    row. If the column does not exist yet, it is created, otherwise the other
    rows keep their current value.

    Raises NotIdentifiedContextRecordError if no AlphaFold row is found and
    NotIdentifiedDuplicatedRecordError (from the setter) if there are several.
    """
    sw_table = access.get_table(block, "_software")
    af3_cid_setter = _AF3ItemSetter(new_af3_sw_cit_id)
    if sw_table:
        if "_software.citation_id" not in sw_table.tags:
            edit.add_column(block, "_software", "citation_id", af3_cid_setter)
        else:
            for row in sw_table:
                row["citation_id"] = af3_cid_setter(row, same=True)
    if not af3_cid_setter.sw_found:
        raise NotIdentifiedContextRecordError(
            "_software", context=": AlphaFold 3 not found"
        )


def _get_key_primary(row):
    """For edit.sort(), make sure 'primary' comes first.

    The keys put "primary" first, then numeric IDs in numeric order, then any
    other ID in string order.
    """
    if row["id"] == "primary":
        return (-1, "")
    try:
        return (0, int(row["id"]))
    except ValueError:
        return (1, row["id"])


def _ensure_citation_id_first(block):
    """Make sure _citation.id is the first tag of a table.

    Nothing is done if it already is. Otherwise the category is erased and
    added again with ``id`` as first item, followed by the remaining items in
    their previous order.
    """
    table = access.get_table(block, "_citation")
    if table.tags[0] == "_citation.id":
        return

    cif_dict = {"id": list(table.find_column("id"))}
    for tag in table.tags:
        # strip the category part from the tag
        itm = tag.split(".", maxsplit=1)[1]
        if itm == "id":
            continue
        cif_dict[itm] = list(table.find_column(itm))
    table.erase()
    # values were copied from the table as they are, so add them raw
    edit.add_category(
        block,
        "_citation",
        item_data=cif_dict,
        index="before:_citation_author",
        raw=True,
    )
def fix_citation(block):
    """Normalise the AlphaFold 3 citation in a `ModelCIF`_ ``block``.

    Ensures that the AlphaFold 3 publication (`PMID 38718835
    <https://pubmed.ncbi.nlm.nih.gov/38718835/>`_) is not marked as the
    "primary" citation, assigns a numeric citation ID instead. Fixes an
    incomplete AlphaFold 3 citation. Replaces the author list with the full
    curated list of names and updates its citation ID. Reorders citations so
    that the primary entry appears first and links the citation to the
    corresponding software record.

    This adjustment is not required for valid `ModelCIF`_ files, but follows
    `ModelArchive`_ conventions where the primary citation must refer to the
    deposited model rather than the software used to generate it.

    Examples:
        >>> from gemmi import cif
        >>> from modelarchive.modelcif import access, fix_af3
        >>> # sample CIF data, citation as written by AlphaFold 3
        >>> CIF_DATA = '''data_test
        ... _citation.id primary
        ... _citation.country UK
        ... _citation.journal_full Nature
        ... _citation.journal_id_ASTM NATUAS
        ... _citation.journal_id_CSD 0006
        ... _citation.journal_id_ISSN 0028-0836
        ... _citation.journal_volume 630
        ... _citation.page_first 493
        ... _citation.page_last 500
        ... _citation.pdbx_database_id_DOI 10.1038/s41586-024-07487-w
        ... _citation.pdbx_database_id_PubMed 38718835
        ... _citation.title 'Accurate structure prediction of biomolecular ...'
        ... _citation.year 2024
        ... #
        ... loop_
        ... _citation_author.citation_id
        ... _citation_author.name
        ... _citation_author.ordinal
        ... primary "Google DeepMind AlphaFold Team" 1
        ... primary "Isomorphic Labs Team" 2
        ... #
        ... loop_
        ... _software.classification
        ... _software.date
        ... _software.description
        ... _software.name
        ... _software.pdbx_ordinal
        ... _software.type
        ... _software.version
        ... other ? "Structure prediction" AlphaFold 1 package AlphaFold-beta
        ... '''
        >>> block = cif.read_string(CIF_DATA).sole_block()
        >>> fix_af3.fix_citation(block)
        >>> # The usual block.as_string() output would be too much for a
        >>> # docstring, just check some important values.
        >>> table = access.get_table(block, "_citation")
        >>> assert table[0]["id"] == "1"
        >>> table = access.get_table(block, "_citation_author")
        >>> assert table[0]["name"] != "Google DeepMind AlphaFold Team"
        >>> table = access.get_table(block, "_software")
        >>> assert table[0]["citation_id"] == "1"

    Args:
        block (|gemmicifBlock|): CIF block to operate on.

    Returns:
        None

    Raises:
        NotIdentifiedSingleRecordError: If required item is missing from
            _citation category. If item values are not as expected for
            _citation category.
        NotIdentifiedContextRecordError: If no entry for AlphaFold is found
            in _software category.
        NotIdentifiedDuplicatedRecordError: If multiple entries for AlphaFold
            are found in _software category. In that case, the "right" record
            can not be identified.
    """
    old_af3_sw_cit_id = None
    new_af3_sw_cit_id = None
    cat = "_citation"
    itms = ["id", "pdbx_database_id_PubMed"]
    table = access.get_table(block, cat, itms)

    # pick first numeric value not yet taken
    primary_citation_id = _quote(
        _get_ordinal_ids({row["id"] for row in table}, 1)[0]
    )

    # correct IDs and find AF3 citation
    for row in table:
        if row["pdbx_database_id_PubMed"] == "38718835":
            old_af3_sw_cit_id = row["id"]
            if row["id"] == "primary":
                row["id"] = primary_citation_id
                new_af3_sw_cit_id = primary_citation_id
            else:
                new_af3_sw_cit_id = old_af3_sw_cit_id

    if old_af3_sw_cit_id is None or _is_null(old_af3_sw_cit_id):
        # citation available w/o PMID? Fallback option replacing citation
        old_af3_sw_cit_id, new_af3_sw_cit_id = _fix_citation_fallback(
            block, primary_citation_id
        )

    if len(table) > 0:
        # _citation table may have changed, sort it to have "primary" first
        edit.sort(table, "id", key=_get_key_primary)
        # make sure _citation.id comes first
        _ensure_citation_id_first(block)

    # fix authors (completely replace ones for AF3 publication)
    cit_auth_dict = block.get_mmcif_category("_citation_author.")
    author_cit_ids = []
    author_names = []
    if cit_auth_dict:
        # keep the authors of all other citations
        for citation_id, name in zip(
            cit_auth_dict["citation_id"], cit_auth_dict["name"]
        ):
            if citation_id != old_af3_sw_cit_id:
                author_cit_ids.append(citation_id)
                author_names.append(name)

    # note: fixed to be in correct style and without special characters
    af3_authors = [
        "Abramson, J.",
        "Adler, J.",
        "Dunger, J.",
        "Evans, R.",
        "Green, T.",
        "Pritzel, A.",
        "Ronneberger, O.",
        "Willmore, L.",
        "Ballard, A.J.",
        "Bambrick, J.",
        "Bodenstein, S.W.",
        "Evans, D.A.",
        "Hung, C.C.",
        "O'Neill, M.",
        "Reiman, D.",
        "Tunyasuvunakool, K.",
        "Wu, Z.",
        "Zemgulyte, A.",
        "Arvaniti, E.",
        "Beattie, C.",
        "Bertolli, O.",
        "Bridgland, A.",
        "Cherepanov, A.",
        "Congreve, M.",
        "Cowen-Rivers, A.I.",
        "Cowie, A.",
        "Figurnov, M.",
        "Fuchs, F.B.",
        "Gladman, H.",
        "Jain, R.",
        "Khan, Y.A.",
        "Low, C.M.R.",
        "Perlin, K.",
        "Potapenko, A.",
        "Savy, P.",
        "Singh, S.",
        "Stecula, A.",
        "Thillaisundaram, A.",
        "Tong, C.",
        "Yakneen, S.",
        "Zhong, E.D.",
        "Zielinski, M.",
        "Zidek, A.",
        "Bapst, V.",
        "Kohli, P.",
        "Jaderberg, M.",
        "Hassabis, D.",
        "Jumper, J.M.",
    ]
    author_cit_ids.extend([new_af3_sw_cit_id] * len(af3_authors))
    author_names.extend(af3_authors)

    # Ordinals are always renumbered over the complete list. All columns
    # need the same number of values, also if the block had no authors yet.
    block.set_mmcif_category(
        "_citation_author.",
        {
            "citation_id": author_cit_ids,
            "name": author_names,
            "ordinal": list(range(1, len(author_names) + 1)),
        },
    )

    _fix_software(block, new_af3_sw_cit_id)
def _is_af3_server(block):
    """Check if block was produced with AF3 server or code.

    True means server, False means "from code". The decision is based on the
    texts in _pdbx_data_usage.details: "server" hints at the server, the URL
    of the AF3 GitHub repository at the code. If both or none of the hints
    are found, the record can not be identified and an exception
    (NotIdentifiedContextRecordError) is raised.
    """
    # this is a heuristic and may fail!
    github_url = "github.com/google-deepmind/alphafold3"
    table = access.get_table(block, "_pdbx_data_usage", items=["details"])
    details = [row["details"].lower() for row in table]
    is_server = any("server" in text for text in details)
    is_code = any(github_url in text for text in details)
    # exactly one of the two hints must be present
    if is_server == is_code:
        raise NotIdentifiedContextRecordError(
            "_pdbx_data_usage", context=": AlphaFold 3 license type"
        )
    return is_server
def fix_software_location(block):
    """Ensures the AlphaFold 3 _software entry has a correct location URL.

    Determines whether the `ModelCIF`_ ``block`` originates from the
    AlphaFold 3 server or a local installation and sets the corresponding URL
    in _software.location. If the column does not yet exist it is created;
    otherwise only the row for AlphaFold 3 is updated.

    Examples:
        >>> from gemmi import cif
        >>> from modelarchive.modelcif import access, fix_af3
        >>> # sample CIF data, looking like output of the AlphaFold 3 server
        >>> CIF_DATA = '''data_test
        ... _pdbx_data_usage.details "... alphafoldserver.com/output-terms."
        ... _pdbx_data_usage.id 1
        ... _pdbx_data_usage.type license
        ... _pdbx_data_usage.url ?
        ... #
        ... loop_
        ... _software.classification
        ... _software.date
        ... _software.description
        ... _software.name
        ... _software.pdbx_ordinal
        ... _software.type
        ... _software.version
        ... other ? "Structure prediction" AlphaFold 1 package AlphaFold-beta
        ... '''
        >>> block = cif.read_string(CIF_DATA).sole_block()
        >>> fix_af3.fix_software_location(block)
        >>> # Just check that _software.location exists and has the right value
        >>> table = access.get_table(block, "_software")
        >>> assert "_software.location" in table.tags
        >>> assert table[0]["location"] == "https://alphafoldserver.com/"
        >>> # Change block to look like ModelCIF file from local installation
        >>> table = access.get_table(block, "_pdbx_data_usage")
        >>> table[0]["details"] = "...github.com/google-deepmind/alphafold3..."
        >>> fix_af3.fix_software_location(block)
        >>> # Check _software.location to point to GitHub, now
        >>> table = access.get_table(block, "_software")
        >>> expected = "https://github.com/google-deepmind/alphafold3"
        >>> assert table[0]["location"] == expected

    Args:
        block (|gemmicifBlock|): CIF block to operate on.

    Returns:
        None

    Raises:
        NotIdentifiedContextRecordError: If no AlphaFold 3 entry is found in
            the _software table.
        NotIdentifiedContextRecordError: If the origin of the AlphaFold 3
            license could not be identified in the _pdbx_data_usage table.
        NotIdentifiedDuplicatedRecordError: If multiple entries for
            AlphaFold 3 are found in the _software table.
    """
    af3_sw_url = (
        "https://alphafoldserver.com/"
        if _is_af3_server(block)
        else "https://github.com/google-deepmind/alphafold3"
    )

    sw_table = access.get_table(block, "_software")
    af3_url_setter = _AF3ItemSetter(af3_sw_url, item="location")
    if sw_table:
        if "_software.location" in sw_table.tags:
            # column exists: other rows keep their value (same=True)
            for row in sw_table:
                row["location"] = af3_url_setter(row, same=True)
        else:
            edit.add_column(block, "_software", "location", af3_url_setter)

    # the setter remembers whether it came across the AlphaFold row
    if not af3_url_setter.sw_found:
        raise NotIdentifiedContextRecordError(
            "_software", context=": AlphaFold 3 not found"
        )
def _get_cat_dict(block, category):
    """Fetch category as dict, raise exception otherwise.

    ``category`` is given without the trailing dot. An empty result raises
    edit.NotFoundCategoryError.
    """
    cat_dict = block.get_mmcif_category(f"{category}.")
    if cat_dict:
        return cat_dict
    raise edit.NotFoundCategoryError(category)


def _get_af3_sw_group(block):
    """Get software_group_id for AF3 (to be used in protocols and QE).

    Assumption: single SW group exists which points to AF3. If there is more
    than one record in _ma_software_group, the one referring to the AlphaFold
    entry of _software is looked up.
    """
    sw_group_table = access.get_table(block, "_ma_software_group")
    if not sw_group_table:
        raise edit.NotFoundCategoryError("_ma_software_group")
    if len(sw_group_table) == 1:
        return sw_group_table[0]["group_id"]

    # Multiple SW groups, look for AF3 entry
    sw_table = access.get_table(block, "_software")
    if not sw_table:
        raise edit.NotFoundCategoryError("_software")
    af3_sw_ids = [
        row["pdbx_ordinal"] for row in sw_table if _is_af3_sw_name(row["name"])
    ]
    if len(af3_sw_ids) > 1:
        raise NotIdentifiedDuplicatedRecordError("_software", "AlphaFold")
    # None never equals a software_id, so no group matches w/o AF3 entry
    sw_id = af3_sw_ids[0] if af3_sw_ids else None

    for group_row in sw_group_table:
        if group_row["software_id"] == sw_id:
            return group_row["group_id"]

    raise NotIdentifiedContextRecordError(
        "_software", context=": AlphaFold 3 not found"
    )
def fix_protocol(block):
    """Fix the MA protocol to a single well-formed step.

    Rewrites _ma_data, _ma_data_group, and _ma_protocol_step from scratch
    based on the existing _ma_target_entity, _ma_model_list and
    _ma_software_group categories. Any prior content in those three categories
    is silently overwritten. The data_id items of _ma_target_entity and
    _ma_model_list are renumbered along the way.

    Data layout after the call:

    _ma_data:
        One record per target entity (content_type "target") followed by one
        record per model (content_type "model coordinates"). IDs are assigned
        sequentially starting at 1.
    _ma_data_group:
        Group 1 - all target data IDs (input side).
        Group 2 - all model data IDs (output side).
    _ma_protocol_step:
        A single step referencing the AF3 software group, group 1 as input,
        and group 2 as output.

    Examples:
        >>> from gemmi import cif
        >>> from modelarchive.modelcif import access, fix_af3
        >>> # sample CIF data with the categories needed
        >>> CIF_DATA = '''data_test
        ... #
        ... loop_
        ... _entity.id
        ... _entity.pdbx_description
        ... _entity.type
        ... 1 "bestest polymer in universe" polymer
        ... 2 "second best polythingi in universe" polymer
        ... #
        ... loop_
        ... _ma_target_entity.data_id
        ... _ma_target_entity.entity_id
        ... _ma_target_entity.origin
        ... 1 1 .
        ... 1 2 .
        ... #
        ... _ma_model_list.data_id 1
        ... _ma_model_list.model_group_id 1
        ... _ma_model_list.model_group_name "AlphaFold-beta-20231127 (...)"
        ... _ma_model_list.model_id 1
        ... _ma_model_list.model_name "Top ranked model"
        ... _ma_model_list.model_type "Ab initio model"
        ... _ma_model_list.ordinal_id 1
        ... #
        ... loop_
        ... _ma_software_group.group_id
        ... _ma_software_group.ordinal_id
        ... _ma_software_group.software_id
        ... 1 1 1
        ... #
        ... loop_
        ... _software.classification
        ... _software.date
        ... _software.description
        ... _software.name
        ... _software.pdbx_ordinal
        ... _software.type
        ... _software.version
        ... other ? "Structure prediction" AlphaFold 1 package AlphaFold-beta
        ... '''
        >>> block = cif.read_string(CIF_DATA).sole_block()
        >>> fix_af3.fix_protocol(block)
        >>> access.get_table(block, "_entity").erase()
        >>> access.get_table(block, "_ma_data").erase()
        >>> access.get_table(block, "_ma_data_group").erase()
        >>> access.get_table(block, "_ma_model_list").erase()
        >>> access.get_table(block, "_ma_software_group").erase()
        >>> access.get_table(block, "_ma_target_entity").erase()
        >>> access.get_table(block, "_software").erase()
        >>> print(block.as_string())
        data_test
        loop_
        _ma_protocol_step.ordinal_id
        _ma_protocol_step.protocol_id
        _ma_protocol_step.step_id
        _ma_protocol_step.method_type
        _ma_protocol_step.details
        _ma_protocol_step.software_group_id
        _ma_protocol_step.input_data_group_id
        _ma_protocol_step.output_data_group_id
        1 1 1 modeling 'Model generated with AlphaFold 3.' 1 1 2
        <BLANKLINE>

    Args:
        block (|gemmicifBlock|): CIF block to operate on.

    Returns:
        None

    Raises:
        edit.NotFoundCategoryError: If any required source category is absent:
            _entity, _ma_target_entity, _ma_model_list, or _ma_software_group.
        edit.NotFoundItemError: If _ma_target_entity.data_id,
            _ma_model_list.data_id or _ma_model_list.model_name are missing.
        NotIdentifiedDuplicatedRecordError: If multiple _ma_software_group
            records exist and the AF3 entry cannot be unambiguously identified
            in _software.
        NotIdentifiedContextRecordError: If multiple _ma_software_group
            records exist but no AF3 entry can be found in _software at all.
    """
    # targets: one data record per _ma_target_entity row, named after entity
    entity_dict = _get_cat_dict(block, "_entity")
    entity_descs = dict(zip(entity_dict["id"], entity_dict["pdbx_description"]))
    trg_ent_table = access.get_table(block, "_ma_target_entity")
    if not trg_ent_table:
        raise edit.NotFoundCategoryError("_ma_target_entity")
    # Raise when _ma_target_entity.data_id does not exist, that could mean
    # a lot has changed in the way AF writes ModelCIF files.
    if "_ma_target_entity.data_id" not in trg_ent_table.tags:
        raise edit.NotFoundItemError("_ma_target_entity.data_id")
    data_ids_in = []
    target_names = []
    for number, row in enumerate(trg_ent_table, start=1):
        data_id = str(number)
        row["data_id"] = data_id
        data_ids_in.append(data_id)
        target_names.append(entity_descs[row["entity_id"]])

    # models: data IDs continue after the ones of the targets
    mdl_list_table = access.get_table(block, "_ma_model_list")
    if not mdl_list_table:
        raise edit.NotFoundCategoryError("_ma_model_list")
    for itm in ["data_id", "model_name"]:
        if f"_ma_model_list.{itm}" not in mdl_list_table.tags:
            raise edit.NotFoundItemError(f"_ma_model_list.{itm}")
    data_ids_out = []
    model_names = []
    first_model_id = len(data_ids_in) + 1
    for number, row in enumerate(mdl_list_table, start=first_model_id):
        data_id = str(number)
        row["data_id"] = data_id
        data_ids_out.append(data_id)
        model_names.append(cif.as_string(row["model_name"]))

    # write data (need to be able to overwrite!)
    # Using set_mmcif_category() here is OK, there are at least two data
    # records, target & model, so it will always be a loop.
    all_data_ids = data_ids_in + data_ids_out
    block.set_mmcif_category(
        "_ma_data.",
        {
            "id": all_data_ids,
            "name": target_names + model_names,
            "content_type": ["target"] * len(data_ids_in)
            + ["model coordinates"] * len(data_ids_out),
        },
    )
    edit.move_category(block, "_ma_data", "after:_ma_model_list")

    # add 2 data groups (1 for input, 2 for output)
    block.set_mmcif_category(
        "_ma_data_group.",
        {
            "ordinal_id": list(range(1, len(all_data_ids) + 1)),
            "group_id": [1] * len(data_ids_in) + [2] * len(data_ids_out),
            "data_id": all_data_ids,
        },
    )
    edit.move_category(block, "_ma_data_group", "after:_ma_data")

    # add single protocol step, run by the AF3 SW group
    protocol_step = {
        "ordinal_id": [1],
        "protocol_id": [1],
        "step_id": [1],
        "method_type": ["modeling"],
        "details": ["Model generated with AlphaFold 3."],
        "software_group_id": [_get_af3_sw_group(block)],
        "input_data_group_id": [1],
        "output_data_group_id": [2],
    }
    block.set_mmcif_category("_ma_protocol_step.", protocol_step)
    edit.move_category(block, "_ma_protocol_step", "after:_ma_data_group")
# LocalWords: CIF homomeric mdl gemmi cif modelarchive modelcif af BLANKLINE
# LocalWords: Args gemmicifBlock NotFoundItemError RuntimeError str mmcif
# LocalWords: AlphaFold PMID ModelArchive ModelCIF NotFoundCategoryError
# LocalWords: NotIdentifiedSingleRecordError pdbx
# LocalWords: NotIdentifiedContextRecordError
# LocalWords: NotIdentifiedDuplicatedRecordError