"""ModelCIF files generated by AlphaFold 3 deviate from the official ModelCIF
definition dictionary in specific cases. In particular, for homomeric
assemblies, each molecular entity copy is written as a separate entity in the
CIF document, instead of defining a single entity referenced multiple times.
This module provides functionality to correct the deviations.
"""
# Copyright (c) 2026, SIB - Swiss Institute of Bioinformatics and
# Biozentrum - University of Basel
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
from gemmi import cif
from . import access
from . import edit
def _is_null(value):
    """Tell whether a raw CIF value is one of the two null markers.

    Borrowed from gemmi.

    Args:
        value (str): Raw CIF value.

    Returns:
        bool: True if ``value`` is exactly "?" or ".", False otherwise.
    """
    # ToDo: This may become a public function in the future.
    if len(value) != 1:
        return False
    return value[0] in ("?", ".")
def _char_table(c):
    """Classify a character for CIF quoting purposes (borrowed from gemmi).

    Args:
        c (str): Single character.

    Returns:
        int: 1 for characters that may appear in an unquoted CIF value, 2 for
        white space (tab, line feed, carriage return, space) and 0 for
        everything else, including every non-ASCII character.
    """
    # Classes of the 128 ASCII code points, 16 per row.
    # fmt: off
    table = (
        0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 0, 0, 2, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        2, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0,
    )
    # fmt: on
    code = ord(c)
    # The gemmi original indexes its table by byte, and every byte of a
    # multi-byte UTF-8 sequence is >= 128, i.e. class 0. Reducing a Python
    # code point modulo 256 would alias characters such as U+0141 onto ASCII
    # ones, so everything beyond the ASCII range is mapped to 0 explicitly.
    if code < len(table):
        return table[code]
    return 0
def _quote(v):
    """Quote a value for use as a raw CIF value.

    Borrowed from gemmi, but prefers double quotes above single quotes for AF.

    Args:
        v (str): Unquoted value.

    Returns:
        str: ``v`` unchanged if it is non-empty, not a null marker and made of
        characters of class 1 only. Otherwise ``v`` enclosed in double quotes,
        single quotes or, as last resort, a ";" delimited text field.
    """
    if v and all(_char_table(ch) == 1 for ch in v) and not _is_null(v):
        return v
    # Pick the delimiter: values with line breaks, or with both kinds of
    # quotes, can only be stored as text field.
    if "\n" in v:
        delimiter = ";"
    elif '"' not in v:
        delimiter = '"'
    elif "'" not in v:
        delimiter = "'"
    else:
        delimiter = ";"
    if delimiter == ";":
        return f";{v}\n;"
    return f"{delimiter}{v}{delimiter}"
[docs]
def fix_model_name(block, mdl_rank):
    """Normalise _ma_model_list.model_name for given rank.

    AlphaFold 3 sets _ma_model_list.model_name to "Top ranked model" for
    all models, regardless of their rank. This function rewrites the value such
    that only ``mdl_rank == 1`` is labelled "Top ranked model". All other ranks
    are renamed to "#<``mdl_rank``> ranked model".

    Examples:
        >>> from gemmi import cif
        >>> from modelarchive.modelcif import fix_af3
        >>> # get sample CIF data
        >>> cif_data = '''data_test
        ... _ma_model_list.data_id 1
        ... _ma_model_list.model_name "Top ranked model"
        ... _ma_model_list.model_type "Ab initio model"
        ... _ma_model_list.ordinal_id 1
        ... '''
        >>> block = cif.read_string(cif_data).sole_block()
        >>> fix_af3.fix_model_name(block, 2)
        >>> print(block.as_string())
        data_test
        _ma_model_list.data_id 1
        _ma_model_list.model_name "#2 ranked model"
        _ma_model_list.model_type "Ab initio model"
        _ma_model_list.ordinal_id 1
        <BLANKLINE>
        >>> fix_af3.fix_model_name(block, 1)
        >>> print(block.as_string())
        data_test
        _ma_model_list.data_id 1
        _ma_model_list.model_name "Top ranked model"
        _ma_model_list.model_type "Ab initio model"
        _ma_model_list.ordinal_id 1
        <BLANKLINE>

    Args:
        block (|gemmicifBlock|): CIF block to operate on.
        mdl_rank (int): Rank of the AlphaFold 3 model. If ``mdl_rank == 1``, the
            name is set to "Top ranked model".

    Returns:
        None

    Raises:
        RuntimeError: If the ``_ma_model_list`` category contains more than one
            row.
        edit.NotFoundItemError: If _ma_model_list.model_name can not be
            found in ``block``.
    """
    mdl_name = (
        "Top ranked model" if mdl_rank == 1 else f"#{mdl_rank} ranked model"
    )
    model_list = access.get_table(
        block, "_ma_model_list", items=["model_name"]
    )
    if not model_list:
        raise edit.NotFoundItemError(
            msg="File is missing _ma_model_list.model_name, single model "
            "required"
        )
    num_models = len(model_list)
    if num_models != 1:
        raise RuntimeError("File must have a single model in _ma_model_list.")
    # Rows take raw CIF values, the name contains spaces and needs quoting.
    model_list[0]["model_name"] = _quote(mdl_name)
def _get_ordinal_ids(cur_ids, num_ids_needed):
    """Provide unused numeric IDs, lowest numerals first.

    Args:
        cur_ids: Sized collection of IDs already taken, as strings (hopefully
            something like 1, 2, 3, ...).
        num_ids_needed (int): Number of IDs to provide.

    Returns:
        list of str: The next ``num_ids_needed`` numerals, counted from 1,
        which are not in ``cur_ids``.
    """
    # Among the first len(cur_ids) + num_ids_needed numerals, at most
    # len(cur_ids) can be taken, so enough free ones are always found.
    upper = len(cur_ids) + num_ids_needed
    free_ids = []
    for number in range(1, upper + 1):
        label = str(number)
        if label not in cur_ids:
            free_ids.append(label)
    return free_ids[:num_ids_needed]
[docs]
class NotIdentifiedRecordError(RuntimeError):
    """Base class for records that can not be identified in a table.

    Not meant to be raised directly, it only serves as common parent of the
    more specific "NotIdentified" exceptions.

    Args:
        msg (str): Exception message.
    """

    def __init__(self, msg):
        # Explicit signature: exactly one message argument is required.
        RuntimeError.__init__(self, msg)
[docs]
class NotIdentifiedDuplicatedRecordError(NotIdentifiedRecordError):
    """Exception if a duplicated record is found in a table.

    The category is kept in the ``category`` attribute.

    Args:
        category (str): Affected category.
        record_id (str): Identifier for the duplicated record. Not bound to a
            specific item on purpose.
    """

    def __init__(self, category, record_id):
        self.category = category
        super().__init__(
            f"Duplicated records found in category '{category}' for "
            f"'{record_id}'"
        )
[docs]
class NotIdentifiedSingleRecordError(NotIdentifiedRecordError):
    """Exception if a specific record can not be identified in a table.

    Category and item are kept in the ``category`` and ``item`` attributes.

    Args:
        category (str): Affected category.
        item (str, optional): Missing item, extends the exception message.
        value (str, optional): Value, in case a record is found but with
            mismatching value. Extends the exception message, but only if
            ``item`` is given, too.
    """

    def __init__(self, category, item=None, value=None):
        self.category = category
        self.item = item
        if item is None:
            detail = ""
        elif value is None:
            detail = f", missing item '{item}'"
        else:
            detail = f", mismatch at item '{item}={value}'"
        super().__init__(
            f"Could not identify record in category '{category}'{detail}."
        )
[docs]
class NotIdentifiedContextRecordError(NotIdentifiedRecordError):
    """Exception if a record for a specific context can not be identified.

    Category and item are kept in the ``category`` and ``item`` attributes.

    Args:
        category (str): Affected category.
        item (str, optional): Affected item.
        context (str, optional): Context, appended verbatim to the message
            (bring your own separator, e.g. ": AlphaFold 3 not found").
    """

    def __init__(self, category, item=None, context=None):
        self.category = category
        self.item = item
        parts = [f"Could not identify record in category '{category}'"]
        if item is not None:
            parts.append(f", item '{item}'")
        if context is not None:
            parts.append(context)
        parts.append(".")
        super().__init__("".join(parts))
def _fix_citation_fallback(block, primary_citation_id):
    """Fix citation for AF3 if the usual approach (search by PMID) failed.

    Without any _citation record, a complete AF3 citation is created. Otherwise
    the first record is picked whose values for the expected items are either
    unset or match the AF3 publication, and it is completed in place.

    Args:
        block (|gemmicifBlock|): CIF block to operate on.
        primary_citation_id (str): ID to be used for the AF3 citation if the
            record has no usable ID, yet (none, null or "primary").

    Returns:
        tuple: ``(old_af3_sw_cit_id, new_af3_sw_cit_id)``, the ID of the AF3
        citation before and after the fix. The old ID is None if the citation
        had to be created from scratch.

    Raises:
        NotIdentifiedSingleRecordError: If an expected item is missing from the
            _citation category or if no record is compatible with the AF3
            publication.
    """
    exp_cit = {
        "country": "UK",
        "journal_full": "Nature",
        "journal_id_ASTM": "NATUAS",
        "journal_id_CSD": "0006",
        "journal_id_ISSN": "0028-0836",
        "journal_volume": "630",
        "page_first": "493",
        "page_last": "500",
        "pdbx_database_id_DOI": "10.1038/s41586-024-07487-w",
        "pdbx_database_id_PubMed": "38718835",
        "title": "Accurate structure prediction of biomolecular interactions "
        + "with AlphaFold 3",
        "year": "2024",
    }
    cat = "_citation."
    cit_table = access.get_table(block, cat)
    num_rows = len(cit_table) if cit_table else 0
    # No citation at all: write the complete AF3 record as new category.
    if num_rows == 0:
        new_af3_sw_cit_id = primary_citation_id
        exp_cit = {"id": new_af3_sw_cit_id, **exp_cit}
        block.set_pairs(cat, exp_cit)
        return None, new_af3_sw_cit_id
    # check that all items exist
    for itm in exp_cit:
        if f"{cat}{itm}" not in cit_table.tags:
            raise NotIdentifiedSingleRecordError(cat[:-1], item=itm)
    # search for a record with all matching values or '?'
    for i in range(num_rows):
        found = True
        for key, val in exp_cit.items():
            # Checking for empty string "" is because gemmi as_string()
            # translates "?" and "." to "".
            if cif.as_string(cit_table[i][f"{cat}{key}"]) not in ["", val]:
                found = False
                break
        if found:
            break
    if not found:
        # At this point, 'key' and 'val' are defined as 'num_rows' must be
        # greater than 0. They describe the first mismatch of the last record
        # checked. Silence Pylint warning.
        # pylint: disable=undefined-loop-variable
        raise NotIdentifiedSingleRecordError(cat[:-1], item=key, value=val)
    old_af3_sw_cit_id = cit_table[i]["id"]
    if old_af3_sw_cit_id in ["?", "."] or old_af3_sw_cit_id == "primary":
        new_af3_sw_cit_id = primary_citation_id
    else:
        new_af3_sw_cit_id = old_af3_sw_cit_id
    # Complete the record. Rows store raw CIF values, so values have to be
    # quoted where needed, the title contains spaces and would corrupt the
    # file if written as is.
    cit_table[i]["id"] = new_af3_sw_cit_id
    for key, val in exp_cit.items():
        cit_table[i][key] = _quote(val)
    return old_af3_sw_cit_id, new_af3_sw_cit_id
def _is_af3_sw_name(sw_name):
    """Check if given _software.name is for AF3.

    Args:
        sw_name (str): Value of _software.name.

    Returns:
        bool: True if the name starts with "alphafold", ignoring case.
    """
    prefix = "alphafold"
    return sw_name.lower()[: len(prefix)] == prefix
class _AF3ItemSetter:
    """Callback providing the value of one item for every _software row.

    The AF3 row gets a fixed value, other rows keep their value or get "?".
    Fails if there are multiple AF3 rows. After use, ``sw_found`` tells
    whether an AF3 row was seen.

    Args:
        value (str): Value for the AF3 row.
        item (str): Item to read existing values from (``same=True``).
    """

    # This is a tiny helper-class with the purpose of storing a state, only
    # for local use, disable Pylint warning
    # pylint: disable=too-few-public-methods
    def __init__(self, value, item="citation_id"):
        self.value = value
        self.item = item
        self.sw_found = False

    def __call__(self, row, same=False):
        """Return the value to store for ``row``.

        Args:
            row: Row of the _software table, must provide "name".
            same (bool): For rows other than AF3, return the existing value of
                the item instead of "?".

        Raises:
            NotIdentifiedDuplicatedRecordError: On the second AF3 row.
        """
        if not _is_af3_sw_name(row["name"]):
            return row[self.item] if same else "?"
        if self.sw_found:
            raise NotIdentifiedDuplicatedRecordError("_software", "AlphaFold")
        self.sw_found = True
        return self.value
def _fix_software(block, new_af3_sw_cit_id):
    """Link the AF3 _software record to its _citation record.

    Args:
        block (|gemmicifBlock|): CIF block to operate on.
        new_af3_sw_cit_id (str): ID of the AF3 citation.

    Raises:
        NotIdentifiedContextRecordError: If no AF3 record was found in
            _software.
        NotIdentifiedDuplicatedRecordError: If there are multiple AF3 records.
    """
    software = access.get_table(block, "_software")
    setter = _AF3ItemSetter(new_af3_sw_cit_id)
    if software:
        if "_software.citation_id" in software.tags:
            # Column exists, keep values of the other software records.
            for record in software:
                record["citation_id"] = setter(record, same=True)
        else:
            edit.add_column(block, "_software", "citation_id", setter)
    if setter.sw_found:
        return
    raise NotIdentifiedContextRecordError(
        "_software", context=": AlphaFold 3 not found"
    )
def _get_key_primary(row):
    """Sort key for edit.sort(), makes sure 'primary' comes first.

    Args:
        row: Record providing "id".

    Returns:
        tuple: ``(-1, "")`` for "primary", ``(0, <int>)`` for numeric IDs and
        ``(1, <id>)`` for everything else.
    """
    ident = row["id"]
    if ident == "primary":
        return (-1, "")
    try:
        numeric = int(ident)
    except ValueError:
        # non-numeric IDs go last, in lexical order
        return (1, ident)
    return (0, numeric)
def _ensure_citation_id_first(block):
    """Make sure _citation.id is the first tag of the _citation table.

    Nothing happens if the ID already comes first. Otherwise the category is
    erased and added again with "id" as first item, followed by the other
    items in their original order.

    Args:
        block (|gemmicifBlock|): CIF block to operate on.
    """
    table = access.get_table(block, "_citation")
    if table.tags[0] == "_citation.id":
        return
    # Copy the raw column values before the table is erased.
    cif_dict = {"id": list(table.find_column("id"))}
    for tag in table.tags:
        itm = tag.split(".", maxsplit=1)[1]
        if itm == "id":
            continue
        cif_dict[itm] = list(table.find_column(itm))
    table.erase()
    # Values were copied raw (including quotes), so they are written raw.
    edit.add_category(
        block,
        "_citation",
        item_data=cif_dict,
        index="before:_citation_author",
        raw=True,
    )
[docs]
def fix_citation(block):
    """Normalise the AlphaFold 3 citation in a `ModelCIF`_ ``block``.

    Ensures that the AlphaFold 3 publication
    (`PMID 38718835 <https://pubmed.ncbi.nlm.nih.gov/38718835/>`_) is not marked
    as the "primary" citation, assigns a numeric citation ID instead. Fixes an
    incomplete AlphaFold 3 citation. Replaces the author list with the full
    curated list of names and updates its citation ID. Reorders citations so
    that the primary entry appears first and links the citation to the
    corresponding software record.

    This adjustment is not required for valid `ModelCIF`_ files, but follows
    `ModelArchive`_ conventions where the primary citation must refer to
    the deposited model rather than the software used to generate it.

    Examples:
        >>> from gemmi import cif
        >>> from modelarchive.modelcif import access, fix_af3
        >>> # start with a minimal sample CIF document
        >>> CIF_DATA = '''data_test
        ... _citation.id primary
        ... _citation.country UK
        ... _citation.journal_full Nature
        ... _citation.journal_id_ASTM NATUAS
        ... _citation.journal_id_CSD 0006
        ... _citation.journal_id_ISSN 0028-0836
        ... _citation.journal_volume 630
        ... _citation.page_first 493
        ... _citation.page_last 500
        ... _citation.pdbx_database_id_DOI 10.1038/s41586-024-07487-w
        ... _citation.pdbx_database_id_PubMed 38718835
        ... _citation.title 'Accurate structure prediction of biomolecular ...'
        ... _citation.year 2024
        ... #
        ... loop_
        ... _citation_author.citation_id
        ... _citation_author.name
        ... _citation_author.ordinal
        ... primary "Google DeepMind AlphaFold Team" 1
        ... primary "Isomorphic Labs Team" 2
        ... #
        ... loop_
        ... _software.classification
        ... _software.date
        ... _software.description
        ... _software.name
        ... _software.pdbx_ordinal
        ... _software.type
        ... _software.version
        ... other ? "Structure prediction" AlphaFold 1 package AlphaFold-beta
        ... '''
        >>> block = cif.read_string(CIF_DATA).sole_block()
        >>> fix_af3.fix_citation(block)
        >>> # The usual block.as_string() output would be too much for a
        >>> # docstring, just check some important values.
        >>> table = access.get_table(block, "_citation")
        >>> assert table[0]["id"] == "1"
        >>> table = access.get_table(block, "_citation_author")
        >>> assert table[0]["name"] != "Google DeepMind AlphaFold Team"
        >>> table = access.get_table(block, "_software")
        >>> assert table[0]["citation_id"] == "1"

    Args:
        block (|gemmicifBlock|): CIF block to operate on.

    Returns:
        None

    Raises:
        edit.NotFoundCategoryError: Documented for a missing _software
            category; presumably raised by the ``edit`` helpers, to be
            verified.
        NotIdentifiedSingleRecordError: If required item is missing from
            _citation category. If item values are not as expected for
            _citation category.
        NotIdentifiedContextRecordError: If no entry for AlphaFold is found in
            _software category.
        NotIdentifiedDuplicatedRecordError: If multiple entries for AlphaFold
            are found in _software category. In that case, the "right"
            record can not be identified.
    """
    old_af3_sw_cit_id = None
    new_af3_sw_cit_id = None
    cat = "_citation"
    itms = ["id", "pdbx_database_id_PubMed"]
    table = access.get_table(block, cat, itms)
    # pick first numeric value not yet taken
    primary_citation_id = _quote(
        _get_ordinal_ids({row["id"] for row in table}, 1)[0]
    )
    # correct IDs and find AF3 citation
    for row in table:
        if row["pdbx_database_id_PubMed"] == "38718835":
            old_af3_sw_cit_id = row["id"]
            if row["id"] == "primary":
                row["id"] = primary_citation_id
                new_af3_sw_cit_id = primary_citation_id
            else:
                new_af3_sw_cit_id = old_af3_sw_cit_id
    if old_af3_sw_cit_id is None or _is_null(old_af3_sw_cit_id):
        # citation available w/o PMID? Fallback option replacing citation
        old_af3_sw_cit_id, new_af3_sw_cit_id = _fix_citation_fallback(
            block, primary_citation_id
        )
    if len(table) > 0:
        # _citation table may have changed, sort it to have "primary" first
        edit.sort(table, "id", key=_get_key_primary)
    # make sure _citation.id comes first
    _ensure_citation_id_first(block)

    # fix authors (completely replace ones for AF3 publication)
    cit_auth_dict = block.get_mmcif_category("_citation_author.")
    kept_ids = []
    kept_names = []
    if cit_auth_dict:
        # keep the authors of all other citations
        for citation_id, name in zip(
            cit_auth_dict["citation_id"], cit_auth_dict["name"]
        ):
            if citation_id != old_af3_sw_cit_id:
                kept_ids.append(citation_id)
                kept_names.append(name)
    # note: fixed to be in correct style and without special characters
    af3_authors = [
        "Abramson, J.",
        "Adler, J.",
        "Dunger, J.",
        "Evans, R.",
        "Green, T.",
        "Pritzel, A.",
        "Ronneberger, O.",
        "Willmore, L.",
        "Ballard, A.J.",
        "Bambrick, J.",
        "Bodenstein, S.W.",
        "Evans, D.A.",
        "Hung, C.C.",
        "O'Neill, M.",
        "Reiman, D.",
        "Tunyasuvunakool, K.",
        "Wu, Z.",
        "Zemgulyte, A.",
        "Arvaniti, E.",
        "Beattie, C.",
        "Bertolli, O.",
        "Bridgland, A.",
        "Cherepanov, A.",
        "Congreve, M.",
        "Cowen-Rivers, A.I.",
        "Cowie, A.",
        "Figurnov, M.",
        "Fuchs, F.B.",
        "Gladman, H.",
        "Jain, R.",
        "Khan, Y.A.",
        "Low, C.M.R.",
        "Perlin, K.",
        "Potapenko, A.",
        "Savy, P.",
        "Singh, S.",
        "Stecula, A.",
        "Thillaisundaram, A.",
        "Tong, C.",
        "Yakneen, S.",
        "Zhong, E.D.",
        "Zielinski, M.",
        "Zidek, A.",
        "Bapst, V.",
        "Kohli, P.",
        "Jaderberg, M.",
        "Hassabis, D.",
        "Jumper, J.M.",
    ]
    author_ids = kept_ids + [new_af3_sw_cit_id] * len(af3_authors)
    author_names = kept_names + af3_authors
    # Ordinals are always renumbered, also if there was no _citation_author
    # category before. All columns of the category need the same length.
    cit_auth_dict_new = {
        "citation_id": author_ids,
        "name": author_names,
        "ordinal": list(range(1, len(author_names) + 1)),
    }
    block.set_mmcif_category("_citation_author.", cit_auth_dict_new)

    _fix_software(block, new_af3_sw_cit_id)
def _is_af3_server(block):
    """Check if block was produced with AF3 server or code.

    Args:
        block (|gemmicifBlock|): CIF block to operate on.

    Returns:
        bool: True means server, False means "from code".

    Raises:
        NotIdentifiedContextRecordError: If _pdbx_data_usage.details hints at
            both or at none of the two origins.
    """
    # this is a heuristic and may fail!
    table = access.get_table(block, "_pdbx_data_usage", items=["details"])
    github_url = "github.com/google-deepmind/alphafold3"
    details = [record["details"].lower() for record in table]
    is_server = any("server" in text for text in details)
    is_code = any(github_url in text for text in details)
    # Exactly one of the two hints must be there to make a decision.
    if is_server == is_code:
        raise NotIdentifiedContextRecordError(
            "_pdbx_data_usage", context=": AlphaFold 3 license type"
        )
    return is_server
[docs]
def fix_software_location(block):
    """Ensures the AlphaFold 3 _software entry has a correct location URL.

    Determines whether the `ModelCIF`_ ``block`` originates from the AlphaFold 3
    server or a local installation and sets the corresponding URL in
    _software.location. If the column does not yet exist it is created;
    otherwise only the row for AlphaFold 3 is updated.

    Examples:
        >>> from gemmi import cif
        >>> from modelarchive.modelcif import access, fix_af3
        >>> # start with a minimal sample CIF document
        >>> CIF_DATA = '''data_test
        ... _pdbx_data_usage.details "... alphafoldserver.com/output-terms."
        ... _pdbx_data_usage.id 1
        ... _pdbx_data_usage.type license
        ... _pdbx_data_usage.url ?
        ... #
        ... loop_
        ... _software.classification
        ... _software.date
        ... _software.description
        ... _software.name
        ... _software.pdbx_ordinal
        ... _software.type
        ... _software.version
        ... other ? "Structure prediction" AlphaFold 1 package AlphaFold-beta
        ... '''
        >>> block = cif.read_string(CIF_DATA).sole_block()
        >>> fix_af3.fix_software_location(block)
        >>> # Just check that _software.location exists and has the right value
        >>> table = access.get_table(block, "_software")
        >>> assert "_software.location" in table.tags
        >>> assert table[0]["location"] == "https://alphafoldserver.com/"
        >>> # Change block to look like ModelCIF file from local installation
        >>> table = access.get_table(block, "_pdbx_data_usage")
        >>> table[0]["details"] = "...github.com/google-deepmind/alphafold3..."
        >>> fix_af3.fix_software_location(block)
        >>> # Check _software.location to point to GitHub, now
        >>> table = access.get_table(block, "_software")
        >>> assert table[0]["location"] == \
            "https://github.com/google-deepmind/alphafold3"

    Args:
        block (|gemmicifBlock|): CIF block to operate on.

    Returns:
        None

    Raises:
        NotIdentifiedContextRecordError: If no AlphaFold 3 entry is found in
            the _software table.
        NotIdentifiedContextRecordError: If the origin of the AlphaFold 3
            license could not be identified in the _pdbx_data_usage table.
        NotIdentifiedDuplicatedRecordError: If multiple entries for AlphaFold
            3 are found in the _software table.
    """
    if _is_af3_server(block):
        af3_sw_url = "https://alphafoldserver.com/"
    else:
        af3_sw_url = "https://github.com/google-deepmind/alphafold3"
    software = access.get_table(block, "_software")
    setter = _AF3ItemSetter(af3_sw_url, item="location")
    if software:
        if "_software.location" in software.tags:
            # Column exists, keep locations of the other software records.
            for record in software:
                record["location"] = setter(record, same=True)
        else:
            edit.add_column(block, "_software", "location", setter)
    if setter.sw_found:
        return
    raise NotIdentifiedContextRecordError(
        "_software", context=": AlphaFold 3 not found"
    )
def _get_cat_dict(block, category):
    """Fetch a category as dictionary, insist on it being there.

    Args:
        block (|gemmicifBlock|): CIF block to read from.
        category (str): Category name without trailing ".".

    Returns:
        dict: Result of ``block.get_mmcif_category()`` for the category.

    Raises:
        edit.NotFoundCategoryError: If the result is empty.
    """
    cat_dict = block.get_mmcif_category(f"{category}.")
    if cat_dict:
        return cat_dict
    raise edit.NotFoundCategoryError(category)
def _get_af3_sw_group(block):
    """Get software_group_id for AF3 (to be used in protocols and QE).

    A single software group is taken as the AF3 one without further checks.
    With multiple groups, the one referring to the AF3 _software record is
    looked up.

    Args:
        block (|gemmicifBlock|): CIF block to read from.

    Returns:
        str: Value of _ma_software_group.group_id for AF3.

    Raises:
        edit.NotFoundCategoryError: If _ma_software_group or (with multiple
            groups) _software is missing.
        NotIdentifiedDuplicatedRecordError: If _software lists AF3 more than
            once.
        NotIdentifiedContextRecordError: If no group refers to AF3.
    """
    groups = access.get_table(block, "_ma_software_group")
    if not groups:
        raise edit.NotFoundCategoryError("_ma_software_group")
    if len(groups) == 1:
        return groups[0]["group_id"]
    # Multiple SW groups, look for AF3 entry
    software = access.get_table(block, "_software")
    if not software:
        raise edit.NotFoundCategoryError("_software")
    af3_ordinals = [
        record["pdbx_ordinal"]
        for record in software
        if _is_af3_sw_name(record["name"])
    ]
    if len(af3_ordinals) > 1:
        raise NotIdentifiedDuplicatedRecordError("_software", "AlphaFold")
    sw_id = af3_ordinals[0] if af3_ordinals else None
    for group in groups:
        if group["software_id"] == sw_id:
            return group["group_id"]
    raise NotIdentifiedContextRecordError(
        "_software", context=": AlphaFold 3 not found"
    )
[docs]
def fix_protocol(block):
    """Fix the MA protocol to a single well-formed step.

    Rewrites _ma_data, _ma_data_group, and _ma_protocol_step from scratch based
    on the existing _ma_target_entity, _ma_model_list and _ma_software_group
    categories. Any prior content in those three categories is silently
    overwritten. The data_id items of _ma_target_entity and _ma_model_list are
    renumbered along the way.

    Data layout after the call:

    _ma_data:
        One record per target entity (content_type "target")
        followed by one record per model (content_type
        "model coordinates"). IDs are assigned sequentially
        starting at 1.
    _ma_data_group:
        Group 1 - all target data IDs (input side).
        Group 2 - all model data IDs (output side).
    _ma_protocol_step:
        A single step referencing the AF3 software group, group 1
        as input, and group 2 as output.

    Examples:
        >>> from gemmi import cif
        >>> from modelarchive.modelcif import access, fix_af3
        >>> # start with a minimal sample CIF document
        >>> CIF_DATA = '''data_test
        ... #
        ... loop_
        ... _entity.id
        ... _entity.pdbx_description
        ... _entity.type
        ... 1 "bestest polymer in universe" polymer
        ... 2 "second best polythingi in universe" polymer
        ... #
        ... loop_
        ... _ma_target_entity.data_id
        ... _ma_target_entity.entity_id
        ... _ma_target_entity.origin
        ... 1 1 .
        ... 1 2 .
        ... #
        ... _ma_model_list.data_id 1
        ... _ma_model_list.model_group_id 1
        ... _ma_model_list.model_group_name "AlphaFold-beta-20231127 (...)"
        ... _ma_model_list.model_id 1
        ... _ma_model_list.model_name "Top ranked model"
        ... _ma_model_list.model_type "Ab initio model"
        ... _ma_model_list.ordinal_id 1
        ... #
        ... loop_
        ... _ma_software_group.group_id
        ... _ma_software_group.ordinal_id
        ... _ma_software_group.software_id
        ... 1 1 1
        ... #
        ... loop_
        ... _software.classification
        ... _software.date
        ... _software.description
        ... _software.name
        ... _software.pdbx_ordinal
        ... _software.type
        ... _software.version
        ... other ? "Structure prediction" AlphaFold 1 package AlphaFold-beta
        ... '''
        >>> block = cif.read_string(CIF_DATA).sole_block()
        >>> fix_af3.fix_protocol(block)
        >>> access.get_table(block, "_entity").erase()
        >>> access.get_table(block, "_ma_data").erase()
        >>> access.get_table(block, "_ma_data_group").erase()
        >>> access.get_table(block, "_ma_model_list").erase()
        >>> access.get_table(block, "_ma_software_group").erase()
        >>> access.get_table(block, "_ma_target_entity").erase()
        >>> access.get_table(block, "_software").erase()
        >>> print(block.as_string())
        data_test
        loop_
        _ma_protocol_step.ordinal_id
        _ma_protocol_step.protocol_id
        _ma_protocol_step.step_id
        _ma_protocol_step.method_type
        _ma_protocol_step.details
        _ma_protocol_step.software_group_id
        _ma_protocol_step.input_data_group_id
        _ma_protocol_step.output_data_group_id
        1 1 1 modeling 'Model generated with AlphaFold 3.' 1 1 2
        <BLANKLINE>

    Args:
        block (|gemmicifBlock|): CIF block to operate on.

    Returns:
        None

    Raises:
        edit.NotFoundCategoryError: If any required source category is
            absent: _entity, _ma_target_entity, _ma_model_list, or
            _ma_software_group.
        edit.NotFoundItemError: If _ma_target_entity.data_id,
            _ma_model_list.data_id or _ma_model_list.model_name are missing.
        NotIdentifiedDuplicatedRecordError: If multiple _ma_software_group
            records exist and the AF3 entry cannot be unambiguously identified
            in _software.
        NotIdentifiedContextRecordError: If multiple _ma_software_group records
            exist but no AF3 entry can be found in _software at all.
    """
    # Columns of the new _ma_data category plus the IDs of both data groups.
    ids = []
    names = []
    content_types = []
    input_ids = []
    output_ids = []

    # Targets: one data record per target entity, named by its description.
    entity_dict = _get_cat_dict(block, "_entity")
    descriptions = dict(
        zip(entity_dict["id"], entity_dict["pdbx_description"])
    )
    targets = access.get_table(block, "_ma_target_entity")
    if not targets:
        raise edit.NotFoundCategoryError("_ma_target_entity")
    # A missing _ma_target_entity.data_id could mean that a lot has changed in
    # the way AF writes ModelCIF files, better stop here.
    if "_ma_target_entity.data_id" not in targets.tags:
        raise edit.NotFoundItemError("_ma_target_entity.data_id")
    for number, target in enumerate(targets, start=1):
        target["data_id"] = str(number)
        data_id = target["data_id"]
        ids.append(data_id)
        input_ids.append(data_id)
        names.append(descriptions[target["entity_id"]])
        content_types.append("target")

    # Models: numbering continues after the targets.
    models = access.get_table(block, "_ma_model_list")
    if not models:
        raise edit.NotFoundCategoryError("_ma_model_list")
    for required in ("data_id", "model_name"):
        if f"_ma_model_list.{required}" not in models.tags:
            raise edit.NotFoundItemError(f"_ma_model_list.{required}")
    for number, model in enumerate(models, start=len(ids) + 1):
        model["data_id"] = str(number)
        data_id = model["data_id"]
        ids.append(data_id)
        names.append(cif.as_string(model["model_name"]))
        content_types.append("model coordinates")
        output_ids.append(data_id)

    # Write data (need to be able to overwrite!). Using set_mmcif_category()
    # is OK here: with at least two data records, target & model, the
    # category will always be a loop.
    block.set_mmcif_category(
        "_ma_data.",
        {"id": ids, "name": names, "content_type": content_types},
    )
    edit.move_category(block, "_ma_data", "after:_ma_model_list")

    # Two data groups: 1 for the input, 2 for the output.
    num_inputs = len(input_ids)
    num_outputs = len(output_ids)
    block.set_mmcif_category(
        "_ma_data_group.",
        {
            "ordinal_id": list(range(1, num_inputs + num_outputs + 1)),
            "group_id": [1] * num_inputs + [2] * num_outputs,
            "data_id": input_ids + output_ids,
        },
    )
    edit.move_category(block, "_ma_data_group", "after:_ma_data")

    # Single protocol step, carried out by the AF3 software group.
    af3_sw_group = _get_af3_sw_group(block)
    block.set_mmcif_category(
        "_ma_protocol_step.",
        {
            "ordinal_id": [1],
            "protocol_id": [1],
            "step_id": [1],
            "method_type": ["modeling"],
            "details": ["Model generated with AlphaFold 3."],
            "software_group_id": [af3_sw_group],
            "input_data_group_id": [1],
            "output_data_group_id": [2],
        },
    )
    edit.move_category(block, "_ma_protocol_step", "after:_ma_data_group")
# LocalWords: CIF homomeric mdl gemmi cif modelarchive modelcif af BLANKLINE
# LocalWords: Args gemmicifBlock NotFoundItemError RuntimeError str mmcif
# LocalWords: AlphaFold PMID ModelArchive ModelCIF NotFoundCategoryError
# LocalWords: NotIdentifiedSingleRecordError pdbx
# LocalWords: NotIdentifiedContextRecordError
# LocalWords: NotIdentifiedDuplicatedRecordError