Source code for modelarchive.modelcif.access

"""Functionality to access data in a ModelCIF file."""

# Copyright (c) 2026, SIB - Swiss Institute of Bioinformatics and
#                     Biozentrum - University of Basel
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from pathlib import Path
import gzip

from gemmi import cif
import gemmi


def get_table(block, category, items=None):
    """Fetch a category of a |gemmicifBlock| as a |gemmi.cif.Table|_.

    A ModelCIF category may be stored either as a loop or as a list of
    pairs; both are valid. |gemmi.cif.Table|_ wraps the two representations
    so that a single code path can deal with either of them.

    Gemmi has two functions to retrieve tables: :func:`find_mmcif_category()`
    takes a category name only, :func:`find()` takes a category name plus
    the list of columns to fetch. :func:`get_table()` picks the right one
    depending on whether ``items`` is given. If a list of items is given,
    the resulting table contains only those columns.

    When the lookup yields a table of length 0, an empty :class:`list` is
    returned instead of the empty table, which keeps looping over the result
    simple.

    Examples:
        >>> from gemmi import cif
        >>> from modelarchive.modelcif import access
        >>> # get sample CIF data
        >>> cif_data = '''data_test
        ... _ma_qa_metric.id 1
        ... _ma_qa_metric.description test_score
        ... loop_
        ... _ma_qa_metric_local.ordinal_id
        ... _ma_qa_metric_local.metric_value
        ... _ma_qa_metric_local.metric_id
        ... 1 1.0 1
        ... 2 1.5 1
        ... '''
        >>> block = cif.read_string(cif_data).sole_block()
        >>> table = access.get_table(block, "_ma_qa_metric")
        >>> len(table)
        1
        >>> table[-1]["description"]
        'test_score'
        >>> table = access.get_table(
        ...     block,
        ...     "_ma_qa_metric_local",
        ...     items=["metric_id", "metric_value"],
        ... )
        >>> # table should have 2 columns and 2 rows
        >>> table
        <gemmi.cif.Table 2 x 2>
        >>> # columns are sorted as requested, not as stored
        >>> table.tags[0]
        '_ma_qa_metric_local.metric_id'
        >>> table.tags[1]
        '_ma_qa_metric_local.metric_value'

    Args:
        block (|gemmicifBlock|): CIF data block holding the categories of
            the CIF document.
        category (str): Category to fetch from ``block``, single category
            only, no joins. Gemmi requires category names to end with
            ``.``, so it is appended here if missing.
        items (list[str]): Items to fetch as columns, in the order given.
            If ``None``, the whole category is fetched with all its items
            as columns, in the order they appear in the CIF document.

    Returns:
        |gemmi.cif.Table|_ | :class:`list`: The requested table, or an empty
        list if the lookup produced a table of length 0.
    """
    # Gemmi expects the trailing '.' on category names.
    category_name = category if category.endswith(".") else f"{category}."

    if items is not None:
        found = block.find(category_name, items)
    else:
        found = block.find_mmcif_category(category_name)

    # Hand back a plain empty list rather than a zero-length table.
    return found if len(found) != 0 else []
def _gemmi_quote(value):
    """Quote a string value for CIF output if it needs it.

    Only strings containing a space are quoted, and only if they are not
    already wrapped in a matching pair of single or double quotes. Anything
    else (including non-string values) is returned unchanged.
    """
    if not isinstance(value, str) or " " not in value:
        return value
    # Leave values alone that already carry their own pair of quotes.
    for mark in ("'", '"'):
        if value.startswith(mark) and value.endswith(mark):
            return value
    return cif.quote(value)


def _fix_category_name(category_name):
    """Add leading '_' and trailing '.' if needed.

    Allows to easily use "struct" instead of "_struct." in calls.
    """
    prefix = "" if category_name.startswith("_") else "_"
    suffix = "" if category_name.endswith(".") else "."
    return f"{prefix}{category_name}{suffix}"
class MABlock:
    """Wrapper around |gemmicifBlock| for mmCIF/ ModelCIF structure files.

    Reads a single mmCIF block from a file or string and exposes the full
    |gemmicifBlock| interface via attribute delegation, extended by
    convenience methods for common ModelCIF operations.

    Args:
        model_data (str | pathlib.Path): Path to the mmCIF input file or CIF
            data as text.

    Attributes:
        source (str | None): Path to the mmCIF input file, ``None`` if
            ``model_data`` provides CIF data as string.
        doc (|gemmi.cif.Document|_): The parsed CIF document.
        block (|gemmicifBlock|): The sole block of the CIF document.
    """

    def __init__(self, model_data):
        self.source = None
        if isinstance(model_data, Path):
            # A Path can only mean "read this file".
            model_data = str(model_data)
            self.doc = cif.read(model_data)
            self.source = model_data
        else:
            # A plain string is tried as a file name first; if it can not be
            # opened it is treated as CIF text. FileNotFoundError is a
            # subclass of OSError, so catching OSError covers both.
            try:
                self.doc = cif.read(model_data)
            except OSError:
                self.doc = cif.read_string(model_data)
            else:
                self.source = model_data
        self.block = self.doc.sole_block()
        # Lazily filled caches, see `targets` and `polymer_targets`.
        self._targets = None
        self._polymer_targets = None

    def __getattr__(self, name):
        """Delegate attribute lookup to the wrapped |gemmicifBlock|.

        Called only when normal attribute resolution has already failed
        (default Python behaviour). Any attribute present on :attr:`block`
        is transparently forwarded; anything else raises
        :exc:`AttributeError` as usual.

        Args:
            name (str): Name of the attribute to look up.

        Returns:
            object: The attribute value from :attr:`block`.

        Raises:
            AttributeError: If ``name`` is not found on :attr:`block`
                either, or if :attr:`block` itself has not been set yet.
        """
        missing = f"'{type(self).__name__}' object has no attribute '{name}'"
        # If `block` is not set yet (instance created without __init__, e.g.
        # by copy/pickle), `self.block` would re-enter this method forever.
        if name == "block":
            raise AttributeError(missing)
        try:
            return getattr(self.block, name)
        except AttributeError:
            raise AttributeError(missing) from None

    def __iter__(self):
        """Iterate over the wrapped |gemmicifBlock|.

        Returns:
            iterator: An iterator over the block's contents.
        """
        return iter(self.block)

    def _invalidate_caches(self):
        """Drop the lazily built target caches after the block was edited."""
        self._targets = None
        self._polymer_targets = None

    @property
    def targets(self):
        """Mapping of target entities keyed by their entity ID.

        Lazily populated on first access from the _ma_target_entity and
        _entity categories and cached afterwards. Each value is a dict with
        the keys ``"entity_id"`` and ``"sequence"``, plus ``"type"`` if the
        entity is listed in _entity.

        Returns:
            dict: Target entity information keyed by entity ID string.

        Raises:
            RuntimeError: If an entity ID appears more than once in
                _ma_target_entity, or if a required category is missing
                (via :meth:`find_strict`).
        """
        if self._targets is not None:
            return self._targets

        # Build into a local dict and only cache it once complete, so an
        # exception half-way does not leave a partial result behind.
        targets = {}
        for row in self.find_strict("_ma_target_entity.", ["entity_id"]):
            entity_id = row["entity_id"]
            if entity_id in targets:
                raise RuntimeError(
                    f"Target with entity_id '{entity_id}' is duplicated."
                )
            targets[entity_id] = {
                "entity_id": entity_id,
                "sequence": self.get_sequence(entity_id),
            }

        for row in self.find_strict("_entity.", ["id", "type"]):
            if row["id"] in targets:
                targets[row["id"]]["type"] = row["type"]

        self._targets = targets
        return self._targets

    @property
    def polymer_targets(self):
        """Subset of :attr:`targets` whose _entity.type is ``'polymer'``.

        Lazily populated on first access by filtering :attr:`targets`.
        Targets without a type (not listed in _entity) are skipped.

        Returns:
            list[dict]: Dicts of target entity information for polymer
            entities.
        """
        if self._polymer_targets is None:
            # The list is complete before it is cached, so a failure in
            # `targets` does not leave an empty list behind.
            self._polymer_targets = [
                target
                for target in self.targets.values()
                if target.get("type") == "polymer"
            ]
        return self._polymer_targets

    def find_strict(self, name, columns):
        """Return a table from the block, raising if it is absent or empty.

        Args:
            name (str): Category name, e.g. ``'_entity.'``.
            columns (list[str]): Column names to select.

        Returns:
            gemmi.cif.Table: The requested table.

        Raises:
            RuntimeError: If the table is not found or contains no rows.
        """
        table = self.block.find(name, columns)
        if len(table) == 0:
            column_list = "', '".join(columns)
            raise RuntimeError(
                f"Table '{name}' with columns '{column_list}' not found."
            )
        return table

    def get_sequence(self, entity):
        """Return the one-letter sequence for a polymer entity.

        Reads residue entries from _entity_poly_seq and converts each
        monomer name to a one-letter code via
        :func:`gemmi.find_tabulated_residue`. Reading is done without
        caching, so calling this method repeatedly for many entities is
        inefficient.

        Args:
            entity (str): Entity ID as a string, compared verbatim to
                _entity_poly_seq.entity_id.

        Returns:
            str: One-letter sequence for the entity; empty if the entity
            has no rows in _entity_poly_seq.

        Raises:
            RuntimeError: If _entity_poly_seq is absent or empty.
            RuntimeError: If the entity's _entity_poly_seq.num values do not
                count 1, 2, 3, ... in row order (a repeated number is
                rejected as well).
        """
        table = self.find_strict(
            "_entity_poly_seq.", ["entity_id", "num", "mon_id"]
        )
        letters = []
        expected_num = 0
        for row in table:
            if row["entity_id"] != entity:
                continue
            expected_num += 1
            if int(row["num"]) != expected_num:
                raise RuntimeError("Residue numbering not sequential.")
            letters.append(
                gemmi.find_tabulated_residue(row["mon_id"]).one_letter_code
            )
        return "".join(letters)

    def write_file(self, filename, compress=False, style=cif.Style.Simple):
        """Write the CIF document to disk, with optional gzip compression.

        If ``compress`` is :obj:`True`, or if ``filename`` already ends with
        ``'.gz'``, the output is written as a gzip-compressed file. The
        ``'.gz'`` suffix is appended automatically when ``compress`` is
        :obj:`True` but the suffix is missing.

        Args:
            filename (str | pathlib.Path): Destination file path.
            compress (bool): Whether to gzip-compress the output. Defaults
                to :obj:`False`.
            style (gemmi.cif.Style): Formatting style for the CIF output.
                Defaults to :attr:`gemmi.cif.Style.Simple`.

        Returns:
            None
        """
        # The constructor accepts pathlib.Path, so accept it here as well.
        if isinstance(filename, Path):
            filename = str(filename)

        if compress or filename.endswith(".gz"):
            if not filename.endswith(".gz"):
                filename += ".gz"
            # Fix the encoding so the output does not depend on the locale.
            with gzip.open(
                filename, mode="wt", compresslevel=9, encoding="utf-8"
            ) as gfh:
                gfh.write(self.doc.as_string(style))
        else:
            self.doc.write_file(filename, style)

    def add_to_category(self, category, match=None, silent=False, **kwargs):
        """Update item values in an existing mmCIF category.

        Locates a row in ``category`` and overwrites the values of the items
        named by ``kwargs``. When ``match`` is given, the first row whose
        item equals the given value is used; otherwise the category must
        contain exactly one row. Replacing an existing non-placeholder value
        (i.e. not ``'.'`` or ``'?'``) is reported on *stdout* unless
        ``silent`` is :obj:`True`.

        Args:
            category (str): mmCIF category name, e.g. ``'_entry'``.
            match (tuple[str, str] | None): A ``(item_name, value)`` pair
                used to identify the target row. If :obj:`None`, the
                category must have exactly one row.
            silent (bool): Suppress replacement messages. Defaults to
                :obj:`False`.
            **kwargs: Item names mapped to their new values.

        Returns:
            None

        Raises:
            RuntimeError: If ``category`` is absent or empty (via
                :meth:`find_strict`), if no row matches the ``match``
                criterion, or if ``match`` is :obj:`None` and the category
                does not have exactly one row.
        """
        category = _fix_category_name(category)
        items = list(kwargs)

        if match is not None:
            key, wanted = match[0], match[1]
            table = self.find_strict(category, items + [key])
            row = next((cand for cand in table if cand[key] == wanted), None)
            if row is None:
                raise RuntimeError(
                    f"No item {key}=={wanted} found in category {category}."
                )
        else:
            table = self.find_strict(category, items)
            # Explicit check instead of `assert`, which vanishes under -O.
            if len(table) != 1:
                raise RuntimeError(
                    f"Category {category} has {len(table)} rows, expected "
                    + "exactly 1 when no match is given."
                )
            row = table[0]

        for item, new_value in kwargs.items():
            current = row[item]
            if current not in (".", "?") and not silent:
                print(
                    f" replacing '{cif.as_string(current)}' with "
                    + f"'{new_value}' ({item})"
                )
            row[item] = _gemmi_quote(new_value)

        # Cached target information may be based on the edited category.
        self._invalidate_caches()

    def add_category(self, category, after=None, **kwargs):
        """Add a new category to the block.

        If the category already exists it is overwritten. The new category
        can be positioned immediately after an existing one by passing its
        name as ``after``. The value lists passed in are not modified.

        Args:
            category (str): mmCIF category name to create.
            after (str | None): Name of an existing category after which the
                new one should be placed. If :obj:`None`, the category is
                left where Gemmi puts it.
            **kwargs: Item names mapped to lists of values (one value per
                row — pass single-element lists for a one-row category).

        Returns:
            None

        Raises:
            TypeError: If a value is a string instead of a list of values.
            RuntimeError: If ``after`` names a category that is not present
                in the block.
        """
        category = _fix_category_name(category)
        if after is not None:
            after = _fix_category_name(after)
            # Fail before touching the block if the anchor is missing.
            if after != category and not self.has_category(after):
                raise RuntimeError(f"Category {after} not found.")

        # Quote into fresh lists so the caller's data stays untouched.
        quoted = {}
        for item, values in kwargs.items():
            if isinstance(values, (str, bytes)):
                raise TypeError(
                    f"Values for '{item}' must be a list, not a string."
                )
            quoted[item] = [_gemmi_quote(val) for val in values]
        self.block.set_mmcif_category(category, quoted, raw=True)
        self._invalidate_caches()

        if after is None:
            return

        anchor = self.block.find_mmcif_category(after)
        anchor_idx = self.block.get_index(anchor.tags[-1])
        added = self.block.find_mmcif_category(category)
        old_idx = self.block.get_index(added.tags[-1])
        # NOTE(review): assumes Block.move_item() puts the item at the given
        # position in the final order - confirm against the Gemmi docs.
        # Moving an item that sits before the anchor shifts the anchor one
        # slot to the left, so the target is one lower in that case.
        new_idx = anchor_idx + 1 if old_idx > anchor_idx else anchor_idx
        self.block.move_item(old_idx, new_idx)

    def get_category(self, category):
        """Return a category as a dict, compatible with :meth:`add_category`.

        Wraps :meth:`gemmi.cif.Block.get_mmcif_category`. Returns an empty
        dict when the category is not present in the block. The returned
        dict can be modified and passed directly to :meth:`add_category` to
        round-trip a category::

            entity_dict = block.get_category("entity")
            # ... modify entity_dict ...
            block.add_category("entity", **entity_dict)

        Args:
            category (str): mmCIF category name, e.g. ``'_entity'``.

        Returns:
            dict: Mapping of item names to lists of values, or an empty
            dict if the category is absent.
        """
        return self.block.get_mmcif_category(_fix_category_name(category))

    def has_category(self, category):
        """Check whether a category is present in the block.

        Args:
            category (str): mmCIF category name to look up.

        Returns:
            bool: :obj:`True` if the category exists in the block,
            :obj:`False` otherwise.
        """
        known = self.block.get_mmcif_category_names()
        return _fix_category_name(category) in known
# LocalWords: mmcif param str gemmi cif pythonic Gemmi's func