Source code for modelarchive.modelcif.edit

"""Functionality to extend and modify ModelCIF files."""

# Copyright (c) 2026, SIB - Swiss Institute of Bioinformatics and
#                     Biozentrum - University of Basel
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# Pylint complains about too many lines. Since we have lots of documentation
# inline, let's ignore this.
# pylint: disable=too-many-lines

import sys

from gemmi import cif

from .. import _utils
from . import access


class NotFoundError(RuntimeError):
    """General exception for 'things' that can not be found.

    If ``msg`` is omitted, a message of form
    "<SUBJECT> '<VALUE>' does not exist." is generated. If ``value`` is a list
    or tuple with more than one element, the message is written in plural
    mode ("<SUBJECTS> '<VALUE1>', '<VALUE2>' do not exist."). If ``subject``
    is a list or tuple, its first element is used as singular and its second
    element, if there is one, as plural of the subject.

    This exception should not be raised directly, it exists to define other
    "NotFound" exceptions inheriting from it.

    Args:
        subject (str|list|tuple): The 'thing' that can not be found, used in
            the generated message. If :class:`list` or :class:`tuple`, the
            second element is used as plural.
        value (str|list|tuple): The name of what can not be found, used in
            the generated message. Provide a list of values to get a message
            fitting plural.
        msg (str): Optional alternative error message. If given, ``subject``
            and ``value`` are not used for the message.
    """

    def __init__(self, subject, value, msg=None):
        if msg is None:
            plural = False
            if isinstance(value, (list, tuple)):
                plural = len(value) > 1
                # str() so non-string names do not break the join
                value = "', '".join(str(val) for val in value)
            if not isinstance(subject, str):
                # Always reduce a (singular, plural) pair to a single word,
                # also for single-element value lists. Fall back to the
                # singular if no plural form was provided.
                if plural and len(subject) > 1:
                    subject = subject[1]
                else:
                    subject = subject[0]
            verb = "do" if plural else "does"
            msg = f"{subject} '{value}' {verb} not exist."
        super().__init__(msg)
class NotFoundCategoryError(NotFoundError):
    """Exception if a category can not be found.

    To be raised when a function expects a specific category to exist in the
    corresponding |gemmicifBlock|, but the category cannot be retrieved.

    Attributes:
        category (tuple): Tuple of category names that could not be found.

    Args:
        category (str|list): Name of the category that could not be found.
            Using a list of categories writes the generated message in plural.
        msg (str): Optional alternative error message.
    """

    def __init__(self, category=None, msg=None):
        # Store the names as tuple, whatever came in. The untouched input is
        # handed to the base class for assembling the message.
        if isinstance(category, tuple):
            names = category
        elif isinstance(category, list):
            names = tuple(category)
        else:
            names = (category,)
        self.category = names
        super().__init__(("Category", "Categories"), category, msg)
class NotFoundItemError(NotFoundError):
    """Exception if an item can not be found.

    To be raised when a function expects a specific item to exist in the
    corresponding CIF category, but the item cannot be retrieved.

    Attributes:
        item (tuple): Tuple of item names that could not be found.

    Args:
        item (str|list): Name of the item that could not be found. Use as
            "<CATEGORY>.<ITEM>" for clarity. Using a list of items writes the
            generated message in plural.
        msg (str): Optional alternative error message.
    """

    def __init__(self, item=None, msg=None):
        # Store the names as tuple, whatever came in. The untouched input is
        # handed to the base class for assembling the message.
        if isinstance(item, tuple):
            names = item
        elif isinstance(item, list):
            names = tuple(item)
        else:
            names = (item,)
        self.item = names
        super().__init__(("Item", "Items"), item, msg)
def _get_idx_for_placement(plcmnt, tbmvd, block):
    """Turn a relative placement into an index in a block.

    Args:
        plcmnt (str): Placement string of form
            ``[after|before]:<CATEGORY>``. The direction is matched
            case-insensitively.
        tbmvd: Table of the category to be moved.
        block: CIF block holding both categories.

    Returns:
        int|None: Index to move the category to, ``None`` if ``<CATEGORY>``
        can not be found in ``block`` (a warning is issued in that case).

    Raises:
        ValueError: If ``plcmnt`` has no ':' or the direction is neither
            'after' nor 'before'.
    """
    try:
        pos, cat = plcmnt.split(":", maxsplit=1)
    except ValueError:
        # ToDo: turn into own exception if needed
        raise ValueError(
            f"Couldn't split placement string '{plcmnt}', maybe the "
            + "':' is missing. Placement string needs to be of form "
            + "'[after|before]:<CATEGORY>'."
        ) from None

    table = access.get_table(block, cat)
    if not table:
        _utils.warn_msg(
            f"Category '{cat}' for relative placement not found. "
            + "Skipping relocation."
        )
        return None

    direction = pos.upper()
    if direction == "BEFORE":
        return block.get_index(table.tags[0])
    if direction == "AFTER":
        # get the index of the first tag of the category to be moved
        tbm_idx = block.get_index(tbmvd.tags[0])
        idx = block.get_index(table.tags[-1])
        # when moving from before category, hit the same index to push it down
        if tbm_idx < idx:
            return idx
        # when pushing from behind category, move to index + 1
        if idx < sum(1 for _ in block) - 1:
            return idx + 1
        # The reference category ends the block and the category to be moved
        # does not sit in front of it, so it already is at its destination.
        # Returning idx keeps it in place instead of falling through to the
        # "not recognised" error below.
        return idx

    # ToDo: turn into own exception if needed
    raise ValueError(
        f"Relative placement string '{pos}' not recognised. "
        + "Valid directions are 'after' and 'before'."
    )
class MoveIdxToFarError(RuntimeError):
    """Exception if repositioning exceeds the size of document-category-list.

    Primarily used by :func:`move_category`, on the attempt to move a category
    to a position that does not exist within the corresponding
    |gemmicifBlock|. For example, if the |gemmicifBlock| object contains 10
    categories, trying to move a category to position 15 will fail and should
    raise this exception.

    Args:
        category (str): Name of the category that could not be moved.
        idx (int): Target position to which the category was to be moved.
    """

    def __init__(self, category, idx):
        message = (
            f"Cannot move '{category}' to position '{idx}', exceeds range."
        )
        super().__init__(message)
def move_category(block, cat, idx):
    """Move a category to a new position in a |gemmicifBlock|.

    By design, ModelCIF files are not intended to be read or edited manually.
    Instead, dedicated applications should handle the format, providing
    functionality to view and modify the data. However, at `ModelArchive`_ we
    occasionally need to open ModelCIF files in an editor to inspect specific
    details. In such cases, it is helpful to have related categories grouped
    together, reducing the need to jump back and forth between different
    categories. This asks for a function to reposition categories within a
    ModelCIF file.

    :func:`move_category` takes category ``cat`` and moves it to position
    ``idx`` in the CIF block ``block``.

    The parameter ``idx`` is somewhat special: it can be just an integer
    index, specifying the exact position to move ``cat`` to. That comes in
    handy placing categories at the beginning (``idx=0``) or at the end
    (``idx=-1``) of ``block``. However, specifying an absolute index is often
    less useful in practice, as categories are typically organised relative to
    related categories. For this purpose, ``idx`` provides a special syntax:
    ``[after|before]:<CATEGORY>``. For example, if you want to put category
    ``_ma_qa_metric`` in front of category ``_ma_qa_metric_local``, you can use
    ``idx="before:_ma_qa_metric_local"`` for ``cat=_ma_qa_metric``...

    Examples:
        >>> from gemmi import cif
        >>> from modelarchive.modelcif import edit
        >>> # get sample CIF data
        >>> cif_data = '''data_test
        ... _ma_qa_metric.id 1
        ... _ma_qa_metric.description test_score
        ... loop_
        ... _ma_qa_metric_local.ordinal_id
        ... _ma_qa_metric_local.metric_value
        ... _ma_qa_metric_local.metric_id
        ... 1 1.0 1
        ... 2 1.5 1
        ... '''
        >>> block = cif.read_string(cif_data).sole_block()
        >>> # move _ma_qa_metric_local to BEFORE _ma_qa_metric
        >>> edit.move_category(
        ...     block,
        ...     "_ma_qa_metric_local",
        ...     "before:_ma_qa_metric",
        ... )
        >>> print(block.as_string())
        data_test
        loop_
        _ma_qa_metric_local.ordinal_id
        _ma_qa_metric_local.metric_value
        _ma_qa_metric_local.metric_id
        1 1.0 1
        2 1.5 1
        <BLANKLINE>
        _ma_qa_metric.id 1
        _ma_qa_metric.description test_score
        <BLANKLINE>
        >>> # move _ma_qa_metric to the front
        >>> edit.move_category(block, "_ma_qa_metric", 0)
        >>> print(block.as_string())
        data_test
        _ma_qa_metric.id 1
        _ma_qa_metric.description test_score
        <BLANKLINE>
        loop_
        _ma_qa_metric_local.ordinal_id
        _ma_qa_metric_local.metric_value
        _ma_qa_metric_local.metric_id
        1 1.0 1
        2 1.5 1
        <BLANKLINE>

    Args:
        block (|gemmicifBlock|): CIF block to operate on.
        cat (str): Name of the CIF category to be moved.
        idx (int|str): Position to move ``cat`` to. This can be an integer for
            exact positioning, or a string of form
            ``[after|before]:<CATEGORY>`` for relative positioning. In
            relative positioning, ``<CATEGORY>`` specifies the name of the
            category before or after which ``cat`` will be placed. If
            ``<CATEGORY>`` can not be found, ``cat`` will not be relocated.
            With ``None``, nothing is moved.

    Returns:
        None

    Raises:
        NotFoundCategoryError: If ``cat`` can not be found in ``block``.
        MoveIdxToFarError: If the target position is outside ``block``. For
            example, if ``block`` contains 10 categories, trying to move a
            category to position 15 will raise this error.
    """
    table = access.get_table(block, cat)
    if not table:
        raise NotFoundCategoryError(cat)

    # Translate a relative placement into an absolute index. The result may be
    # None if the reference category is missing, which is treated like having
    # no index at all: nothing to do.
    target = idx
    if isinstance(target, str):
        target = _get_idx_for_placement(target, table, block)
    if target is None:
        return

    # named pairs are individual items in the block, they get moved one by one
    if table.loop is None:
        _move_pairs(block, cat, target, table)
        return

    # a loop is a single item in the block
    try:
        block.move_item(block.get_index(table.tags[0]), target)
    except RuntimeError as rexc:
        if str(rexc) != "move_item: new_pos out of range":
            raise  # pragma: no cover (fallback for general RuntimeError)
        raise MoveIdxToFarError(cat, target) from None
def _move_pairs(block, cat, idx, table):
    """Move a named-pair category.

    Every pair of the category is an item of its own in ``block``, so the
    pairs are moved one after another to end up next to each other at ``idx``.

    Raises:
        MoveIdxToFarError: If the target position is outside ``block``.
    """
    # Take a snapshot of the tags, the item list of the block changes while
    # items are moved around. Negative indices walk the tags front to back,
    # all others back to front.
    tags = list(table.tags)
    if idx >= 0:
        tags.reverse()

    for offset, tag in enumerate(tags):
        cur_idx = block.get_index(tag)
        # adapt the target to moving before/ after a category
        target = idx - offset if idx > cur_idx else idx
        try:
            block.move_item(cur_idx, target)
        except RuntimeError as rexc:
            if str(rexc) != "move_item: new_pos out of range":
                raise  # pragma: no cover (fallback for general RuntimeError)
            raise MoveIdxToFarError(cat, idx) from None
def make_copy_value_in_row(src_item):
    """Create a callback that returns a value from the row it is called with.

    Supposed to be used in functions that require a callback, e.g.
    :func:`add_column`. Meant to copy values over from the same row. That is
    handy in case a missing column needs to be populated with values, e.g. if
    author defined values are missing but required, copy over the "label"
    fields.

    Examples:
        >>> from gemmi import cif
        >>> from modelarchive.modelcif import edit
        >>> # Add _atom_site.auth_comp_id from _atom_site.label_comp_id
        >>> # Note: category _atom_site is heavily cropped in this example to
        >>> # keep it concise
        >>> CIF_DATA = '''data_test
        ... #
        ... loop_
        ... _atom_site.group_PDB
        ... _atom_site.type_symbol
        ... _atom_site.label_atom_id
        ... _atom_site.label_comp_id
        ... _atom_site.label_asym_id
        ... _atom_site.auth_seq_id
        ... ATOM C CA MET A 1
        ... ATOM C CA ALA A 2
        ... ATOM C CA THR A 3
        ... ATOM C CA ALA A 4
        ... ATOM C CA ALA A 5
        ... ATOM C CA TYR A 6
        ... '''
        >>> block = cif.read_string(CIF_DATA).sole_block()
        >>> edit.add_column(
        ...     block,
        ...     "_atom_site",
        ...     "auth_comp_id",
        ...     edit.make_copy_value_in_row("label_comp_id"),
        ... )
        >>> print(block.as_string())
        data_test
        loop_
        _atom_site.group_PDB
        _atom_site.type_symbol
        _atom_site.label_atom_id
        _atom_site.label_comp_id
        _atom_site.label_asym_id
        _atom_site.auth_seq_id
        _atom_site.auth_comp_id
        ATOM C CA MET A 1 MET
        ATOM C CA ALA A 2 ALA
        ATOM C CA THR A 3 THR
        ATOM C CA ALA A 4 ALA
        ATOM C CA ALA A 5 ALA
        ATOM C CA TYR A 6 TYR
        <BLANKLINE>

    Args:
        src_item (str): the column name to copy from (source).

    Returns:
        Callable[[:class:`gemmi.cif.Table.Row`], str]: Callback function
        usable as ``callback`` in :func:`add_column`.

    Note:
        This function may be outsourced to a supporting module, if
        :mod:`~modelarchive.modelcif.edit` gets too big.
    """

    def _copy_from_row(row):
        # plain lookup of the source column in the row handed in
        return row[src_item]

    return _copy_from_row
def make_res_per_chain_counter(asym_id_item):
    """Create a stateful callback function counting residues per chain.

    :func:`make_res_per_chain_counter` returns a function that can be used as
    ``callback`` in :func:`add_column`. The returned callback assigns
    consecutive residue numbers within each chain of a table, starting at 1.
    When the chain identifier changes between two rows while iterating over
    the table, the counter is reset to 1.

    Examples:
        >>> # Add item "ndb_seq_num" to category "_pdbx_nonpoly_scheme"
        >>> # Reminder: "ndb_seq_num" -> column, "_pdbx_nonpoly_scheme" -> table
        >>> from gemmi import cif
        >>> from modelarchive.modelcif import edit
        >>> cif_data = '''data_test
        ... loop_
        ... _pdbx_nonpoly_scheme.asym_id
        ... _pdbx_nonpoly_scheme.auth_seq_num
        ... _pdbx_nonpoly_scheme.entity_id
        ... _pdbx_nonpoly_scheme.mon_id
        ... _pdbx_nonpoly_scheme.pdb_seq_num
        ... C 1 3 ATP 1
        ... D 1 4 HEM 1
        ... E 1 5 HOH 1
        ... E 2 5 HOH 2
        ... '''
        >>> block = cif.read_string(cif_data).sole_block()
        >>> # Using make_res_per_chain_counter() in add_column() will add a
        >>> # column to the loop_ and populate it with values:
        >>> edit.add_column(
        ...     block,
        ...     "_pdbx_nonpoly_scheme",
        ...     "ndb_seq_num",
        ...     edit.make_res_per_chain_counter("asym_id"),  # CALLBACK
        ...     pos=5,
        ... )
        >>> print(block.as_string())
        data_test
        loop_
        _pdbx_nonpoly_scheme.asym_id
        _pdbx_nonpoly_scheme.auth_seq_num
        _pdbx_nonpoly_scheme.entity_id
        _pdbx_nonpoly_scheme.mon_id
        _pdbx_nonpoly_scheme.ndb_seq_num
        _pdbx_nonpoly_scheme.pdb_seq_num
        C 1 3 ATP 1 1
        D 1 4 HEM 1 1
        E 1 5 HOH 1 1
        E 2 5 HOH 2 2
        <BLANKLINE>
        >>> # "ndb_seq_num" is inserted as fifth column. The ATP in chain C
        >>> # ("asym_id") gets "ndb_seq_num" 1 and the HEM in chain D also gets
        >>> # "ndb_seq_num" 1. But the HOH, both live in chain E together, get
        >>> # "ndb_seq_num" 1 and 2. So for each chain, counting starts at 1
        >>> # and per compound in a chain, the counter is increased by 1.

    Args:
        asym_id_item (str): Item name hosting the chain name.

    Returns:
        Callable[[:class:`gemmi.cif.Table.Row`], int]: Callback function
        usable as ``callback`` in :func:`add_column`.

    Note:
        This function may be outsourced to a supporting module, if
        :mod:`~modelarchive.modelcif.edit` gets too big.
    """
    # Memory of the callback, survives between calls: the chain seen in the
    # previous row and the number handed out for it.
    state = {"chain": None, "count": 0}

    def _count_in_chain(row):
        chain = row[asym_id_item]
        if state["chain"] != chain:
            # chain changed, restart counting
            state["count"] = 1
        else:
            state["count"] += 1
        state["chain"] = chain
        return state["count"]

    return _count_in_chain
def _add_column(cat_itm, pos, table, block):
    """Add a new item to a category, named pairs or loop, with value ``?``.

    ``pos`` is 1-based with -1 meaning "at the end", see :func:`add_column`.
    """
    append = pos == -1

    if table.loop is not None:
        # loops insert columns themselves, just translate to 0-based
        table.loop.add_columns(
            [cat_itm],
            value="?",
            pos=pos if append else pos - 1,
        )
        return

    # Named pairs: create the pair, then move it to the requested place,
    # counted from the first pair of the category.
    block.set_pair(cat_itm, "?")
    offset = len(table.tags) if append else pos - 1
    block.move_item(
        block.get_index(cat_itm),
        block.get_index(table.tags[0]) + offset,
    )
def add_column(
    block,
    category,
    item,
    callback,
    pos=-1,
    # mod_cat_itms=None,
    raw=False,
):
    # No clue how to reduce no. of arguments, so allow it
    # pylint: disable=too-many-arguments,too-many-positional-arguments
    """Extend a category with a new item and populate it using a callback.

    Thinking of ModelCIF categories as tables, this function adds a new column
    (item) to a table that already exists in ``block``. A ``callback``
    function, to be provided, is executed with each row to compute the value
    for the new column. This avoids having a static list to fetch the values
    from. :func:`make_res_per_chain_counter()` is an example of a stateful
    implementation of a working callback.

    The callback has to be of form ``function(row)`` and return the value to
    be set for the ``item`` in the given ``row``. Returned values are turned
    into strings before they are stored.

    If the item already exists in the category, a warning is issued and
    nothing is changed.

    Examples:
        >>> # Add "ndb_seq_num" to "_pdbx_nonpoly_scheme" including values
        >>> # Reminder: "ndb_seq_num" -> column, "_pdbx_nonpoly_scheme" -> table
        >>> from gemmi import cif
        >>> from modelarchive.modelcif import edit
        >>> cif_data = '''data_test
        ... loop_
        ... _pdbx_nonpoly_scheme.asym_id
        ... _pdbx_nonpoly_scheme.entity_id
        ... _pdbx_nonpoly_scheme.mon_id
        ... _pdbx_nonpoly_scheme.pdb_seq_num
        ... C 1 ATP 1
        ... D 2 HEM 1
        ... E 3 HOH 1
        ... E 3 HOH 2
        ... '''
        >>> block = cif.read_string(cif_data).sole_block()
        >>> edit.add_column(
        ...     block,
        ...     "_pdbx_nonpoly_scheme",
        ...     "ndb_seq_num",
        ...     edit.make_res_per_chain_counter("asym_id"),
        ...     pos=-1,
        ... )
        >>> print(block.as_string())
        data_test
        loop_
        _pdbx_nonpoly_scheme.asym_id
        _pdbx_nonpoly_scheme.entity_id
        _pdbx_nonpoly_scheme.mon_id
        _pdbx_nonpoly_scheme.pdb_seq_num
        _pdbx_nonpoly_scheme.ndb_seq_num
        C 1 ATP 1 1
        D 2 HEM 1 1
        E 3 HOH 1 1
        E 3 HOH 2 2
        <BLANKLINE>
        >>> # "ndb_seq_num" was appended as last column according to pos=-1

    Args:
        block (|gemmicifBlock|): block holding the categories of the CIF
            document.
        category (str): The CIF category (table) to add the item to.
        item (str): The item (column) to be added.
        callback (Callable[[:class:`gemmi.cif.Table.Row`], Any]): Function to
            be executed to compute values for each row of the new column.
        pos (int): Position to insert the column at. Default is at the end
            (-1). Inserting at the beginning requires ``pos=1``.
        raw (bool): Force to not quote strings containing white-spaces.

    Returns:
        None

    Raises:
        NotFoundCategoryError: If ``category`` can not be found in ``block``.
    """
    # NOTE: bookkeeping of modified items (mod_cat_itms, as done by
    # add_category()/ add_rows()) is not active for this function yet.

    # fetch original data
    table = access.get_table(block, category)
    if not table:
        raise NotFoundCategoryError(category)

    # find_column() raises if the item is unknown, that is the signal to go
    # on and add it
    cat_itm = f"{category}.{item}"
    try:
        table.find_column(cat_itm)
        exists = True
    except RuntimeError:
        exists = False
    if exists:
        # NOTE(review): the message carries a stray quote after "exists",
        # left untouched here since it is emitted at runtime.
        _utils.warn_msg(f"'{category}.{item}' already exists', not updated.")
        return

    # add column, independent of category being a loop or name-value pairs
    _add_column(cat_itm, pos, table, block)

    # Re-fetch table: easier to work with but is not updated after adding a
    # column. Instead, the last column goes missing. That means if a column is
    # inserted in the middle of the table, the last column from the original
    # table becomes invisible.
    table = access.get_table(block, category)

    # add values
    for row in table:
        val = str(callback(row))
        if not raw and " " in val:
            val = cif.quote(val)
        row[item] = val
# # Register item for revision annotation # if item not in mod_cat_itms[category]: # mod_cat_itms[category].add(item) # return mod_cat_itms def _add_loop(block, cat, itms, mod_cat_itms, raw): """Add a loop to block - supporter for `add_category()`.""" block.set_mmcif_category(cat, itms, raw=raw) for itm in itms.keys(): mod_cat_itms[cat].add(itm) def _add_or_init_mod_cat_itms(mod_cat_itms, category): """Init modified categories/ items counter or add the category.""" if mod_cat_itms is None: mod_cat_itms = {category: set()} else: if category not in mod_cat_itms: # This needs to be unit tested once in use mod_cat_itms[category] = set() # pragma: no cover return mod_cat_itms def _add_pairs(block, cat, itms, mod_cat_itms, raw): """Add named pairs to block - supporter for `add_category()`.""" pyld = {} # payload for the set_pairs() call for k, v in itms.items(): if isinstance(v, list): pyld[k] = v[0] else: pyld[k] = v block.set_pairs(f"{cat}.", pyld, raw=raw) for itm in itms.keys(): mod_cat_itms[cat].add(itm)
def add_category(
    block,
    category,
    item_data,
    index=None,
    mod_cat_itms=None,
    raw=False,
):
    # No clue how to reduce no. of arguments, so allow it
    # pylint: disable=too-many-positional-arguments,too-many-arguments
    """Introduce a new category to a |gemmicifblock| and populate it.

    Add ``category`` to ``block`` using data from ``item_data``. ``item_data``
    is a dictionary with the CIF item names as keys and values as values to
    the items. On single values, named-pairs will be created, on lists with
    more than one value, a loop will be created.

    If ``category`` already exists in ``block`` or ``item_data`` is empty, a
    warning is issued and ``block`` is left as it is.

    ``index`` can be used to place the category at a certain position. Use an
    integer for a specific place in the category list or a string of form
    ``[after|before]:<CATEGORY>`` for relative positioning.

    Examples:
        >>> from gemmi import cif
        >>> from modelarchive.modelcif import edit
        >>> # start with an empty CIF document
        >>> cif_data = '''data_test
        ... '''
        >>> block = cif.read_string(cif_data).sole_block()
        >>> # lets add entities
        >>> _ = edit.add_category(
        ...     block,
        ...     "_entity",
        ...     {
        ...         "id": [1, 2, 3],
        ...         "type": ["polymer", "non-polymer", "water"],
        ...     },
        ... )
        >>> print(block.as_string())
        data_test
        loop_
        _entity.id
        _entity.type
        1 polymer
        2 non-polymer
        3 water
        <BLANKLINE>
        >>> # lets add an "_entry" ID before the entities
        >>> _ = edit.add_category(
        ...     block, "_entry", {"id": "1FOO"}, index="before:_entity"
        ... )
        >>> print(block.as_string())
        data_test
        _entry.id 1FOO
        <BLANKLINE>
        loop_
        _entity.id
        _entity.type
        1 polymer
        2 non-polymer
        3 water
        <BLANKLINE>

    Args:
        block (|gemmicifblock|): CIF data block holding the categories of the
            CIF document.
        category (str): Name of the new category to be created.
        item_data (dict[str, list[Any]|Any]): Attributes and values to be
            added to the new category. Dictionary with item names as keys.
            Values are either a list of values or a single value. If a single
            value is provided (or a list containing only one element), a named
            key-value pair is created instead of a loop. The first value of
            the dictionary decides on pairs or loop.
        index (int|str): Placement of the new category within ``block``. This
            can be an integer for exact positioning, or a string of form
            ``[after|before]:<CATEGORY>`` for relative positioning. In
            relative positioning, ``<CATEGORY>`` specifies the name of the
            category before or after which ``category`` will be placed.
        mod_cat_itms (dict[str, set[str]] | None): A record of what has been
            modified. Dictionary of category assigned a set of items changed.
            Items which already have the value of the update, are not
            recorded. This is meant for the revision history, most likely you
            can ignore it.
        raw (bool, optional): If True, do not force quoting strings containing
            whitespace.

    Returns:
        dict[str, set[str]]: A record of what has been modified. To be used
        with a revision history, most likely you can ignore it.

    Raises:
        MoveIdxToFarError: If the target position is outside ``block``. For
            example, if ``block`` contains 10 categories, trying to create a
            category at position 15 will raise this error.
    """
    mod_cat_itms = _add_or_init_mod_cat_itms(mod_cat_itms, category)

    # Reasons to not add anything, runtime messages are kept as they are.
    skip_msg = None
    if access.get_table(block, category):
        skip_msg = f"Category '{category} already exists, will not be added.'"
    elif len(item_data) == 0:
        skip_msg = f"No itmes provided, not adding category '{category}'."

    if skip_msg is not None:
        _utils.warn_msg(skip_msg)
    else:
        # figure out if we got pair or loop: the first value decides
        first = next(iter(item_data.values()))
        as_pairs = not isinstance(first, list) or len(first) == 1
        adder = _add_pairs if as_pairs else _add_loop
        adder(block, category, item_data, mod_cat_itms, raw)
        move_category(block, category, index)

    # do not report categories without changes
    if len(mod_cat_itms[category]) == 0:
        del mod_cat_itms[category]
    return mod_cat_itms
def _get_next_ordinal(table, category, item): """From the ordinal (row ID) column, get the highest number+1.""" max_val = -sys.maxsize - 1 for val in table.find_column(category + "." + item): val = int(val) max_val = max(max_val, val) return max_val + 1 def _validate_row_dict(row_dict): """Check that the input of add_rows() is valid and return a copy.""" data = {} itm_count = next(iter(row_dict.values())) if isinstance(itm_count, list): itm_count = len(itm_count) else: itm_count = 1 for itm, val in row_dict.items(): if isinstance(val, list): n_count = len(val) data[itm] = val else: n_count = 1 data[itm] = [val] if n_count != itm_count: raise ValueError("Lists of row_dict are not of equal length.") itm_count = n_count return data, itm_count def _ensure_category( block, category, itm_count, data, ordinal_item, mod_cat_itms, raw ): # Just a helper function removing complexity from add_rows(), allow all args # pylint: disable=too-many-positional-arguments,too-many-arguments """Create a new cateogry with ordinal, if needed.""" # add ordinal to data if ordinal_item is not None: if ordinal_item not in data: data = { ordinal_item: list(range(1, itm_count + 1)), **data, } add_category(block, category, data, mod_cat_itms=mod_cat_itms, raw=raw) return mod_cat_itms def _build_loop_row(i, itm_names, data, ordinal_item, next_ordinal, raw): # Just a helper function removing complexity from add_rows(), allow all args # pylint: disable=too-many-positional-arguments,too-many-arguments """Assemble a row to be added with add_rows().""" loop_row = [] for itm in itm_names: try: data[itm][i] = str(data[itm][i]) if " " in data[itm][i] and not raw: loop_row.append(cif.quote(data[itm][i])) else: loop_row.append(data[itm][i]) except KeyError: if itm == ordinal_item: loop_row.append(str(next_ordinal)) next_ordinal += 1 else: loop_row.append(".") return loop_row, next_ordinal
def add_rows(
    block,
    category,
    row_dict,
    ordinal_item="ordinal",
    mod_cat_itms=None,
    raw=False,
):
    # No clue how to reduce no. of arguments, so allow it
    # pylint: disable=too-many-positional-arguments,too-many-arguments
    """Add rows to a ``category`` in ``block`` using an item-dictionary.

    Thinking of ModelCIF categories as tables, this function adds new rows
    (items) to a table (``category``) in ``block``. If ``category`` does not
    yet exist, it will be created. If multiple rows are provided, the new
    ``category`` will be created as loop, pairs otherwise. When adding row(s)
    to an existing pairs-category, the function will convert the ``category``
    into a loop.

    Input data is provided via ``row_dict``. It must be a :class:`dict` of
    :class:`list` (for a single row, values may be single elements instead of
    lists). Item names are used as keys in ``row_dict``. Missing items that
    exist in ``category`` will be added as ``.`` in new rows. The order of
    items in ``row_dict`` can be arbitrary; this function will align them with
    the existing order in ``category``.

    ``ordinal_item`` describes a unique numerical ID for each row. If
    provided, the function will automatically increment it for new rows. In
    `ModelCIF`_, this column is often called ``ordinal`` though some
    categories use different names.

    Examples:
        >>> from gemmi import cif
        >>> from modelarchive.modelcif import edit
        >>> # start with an empty CIF document
        >>> cif_data = '''data_test
        ... '''
        >>> block = cif.read_string(cif_data).sole_block()
        >>> # Lets add an entity to create a category in block. ordinal_item
        >>> # is set to None on purpose to show how it works later.
        >>> _ = edit.add_rows(
        ...     block,
        ...     "_entity",
        ...     {"id": 1, "details": "Protein", "type": "polymer"},
        ...     ordinal_item=None,
        ... )
        >>> # see how the _entity category is created as couple of pairs
        >>> print(block.as_string())
        data_test
        _entity.id 1
        _entity.details Protein
        _entity.type polymer
        <BLANKLINE>
        >>> # Add a second row (pairs will turn into a loop). This time, include
        >>> # ordinal_item to let the function take care of incrementing IDs.
        >>> _ = edit.add_rows(
        ...     block,
        ...     "_entity",
        ...     {"details": ["H2O"], "type": ["water"]},
        ...     ordinal_item="id",
        ... )
        >>> # Now _entity is a loop and _entity.id was incremented automatically
        >>> print(block.as_string())
        data_test
        loop_
        _entity.id
        _entity.details
        _entity.type
        1 Protein polymer
        2 H2O water
        <BLANKLINE>
        >>> # As a last example, add multiple new rows at once but skip the
        >>> # 'details' column.
        >>> _ = edit.add_rows(
        ...     block,
        ...     "_entity",
        ...     {"type": ["polymer", "polymer"]},
        ...     ordinal_item="id",
        ... )
        >>> # Now there are two more polymer entities in the loop but since
        >>> # the 'details' information was missing, the function added '.' in
        >>> # those fields.
        >>> print(block.as_string())
        data_test
        loop_
        _entity.id
        _entity.details
        _entity.type
        1 Protein polymer
        2 H2O water
        3 . polymer
        4 . polymer
        <BLANKLINE>

    Args:
        block (|gemmicifblock|): CIF data block holding the categories of the
            CIF document.
        category (str): Name of the category to which row(s) will be added. A
            trailing '.' is ignored.
        row_dict (dict[str, list | Any]): Row data to be added to
            ``category``. Keys are item names of the category. Values must be
            lists when adding multiple rows. For a single row, values may be
            provided as scalars instead of lists. If an item is missing from
            ``row_dict`` but exists in the category, '.' will be assigned for
            that item in the new row(s).
        ordinal_item (str | None): If the category includes an ordinal (in
            database terms a primary key), this identifies the item name of
            it. If ``ordinal_item`` is provided, the latest ordinal will be
            read from the category and automatically incremented for new
            rows. Use ``None`` in case the category does not have an ordinal
            or if the ordinal should be set explicitly. The ordinal does not
            need to be included in ``row_dict``.
        mod_cat_itms (dict[str, set[str]] | None): A record of what has been
            modified. Dictionary of category assigned a set of items changed.
            Items which already have the value of the update, are not
            recorded. This is meant for the revision history, most likely you
            can ignore it.
        raw (bool, optional): If True, do not force quoting strings containing
            whitespace.

    Returns:
        dict[str, set[str]]: A record of what has been modified. To be used
        with a revision history, most likely you can ignore it. With an empty
        ``row_dict``, ``mod_cat_itms`` is returned as handed in.

    Raises:
        ValueError: In case item lists in ``row_dict`` are not of equal
            length.
    """
    if category.endswith("."):
        category = category[:-1]

    if not row_dict:
        _utils.warn_msg(f"No data to be added for '{category}', skipped.")
        return mod_cat_itms

    # check that all value lists are of same length
    data, itm_count = _validate_row_dict(row_dict)
    mod_cat_itms = _add_or_init_mod_cat_itms(mod_cat_itms, category)

    # a category not in the block yet is created from scratch
    table = access.get_table(block, category)
    if not table:
        return _ensure_category(
            block, category, itm_count, data, ordinal_item, mod_cat_itms, raw
        )

    # get the next row ID before turning table into a loop
    if ordinal_item is None:
        next_ordinal = 1
    else:
        next_ordinal = _get_next_ordinal(table, category, ordinal_item)

    # if the category exists but as pairs, turn into loop to add more rows
    table.ensure_loop()
    loop = table.loop

    # Rows are lists of (string) values ordered like the tags of the loop, so
    # take the item names (tags without category) from there.
    itm_names = [tag.split(".")[1] for tag in loop.tags]
    mod_cat_itms[category].update(itm_names)

    for row_no in range(itm_count):
        loop_row, next_ordinal = _build_loop_row(
            row_no, itm_names, data, ordinal_item, next_ordinal, raw
        )
        loop.add_row(loop_row)

    return mod_cat_itms
def sort(table_or_block, item, category=None, key=None):
    """Sort a |gemmi.cif.Table|_ or |gemmicifBlock| in-place by the given item.

    This may be useful after editing a table, to sort it by a selected column
    (e.g. the ordinal). By default, values that can be read as integers are
    sorted numerically and come first, all others are sorted
    lexicographically. ``key`` can take a function to extract a comparison key
    from each row. This is helpful for cases like ``_citation.id``, where
    special values (e.g. ``id=primary``) might need to be placed first.

    Works on an already loaded |gemmi.cif.Table|_, or on a |gemmicifBlock|
    (requires ``category``) to sort many categories one after another in less
    code.

    Examples:
        >>> from gemmi import cif
        >>> from modelarchive.modelcif import access, edit
        >>> # start with an unsorted list of citations
        >>> CIF_DATA = '''data_test
        ... loop_
        ... _citation.id
        ... _citation.journal_full
        ... _citation.title
        ... _citation.year
        ... _citation.journal_volume
        ... 3 "The Lord of the Rings" "Return of the King" 1955 3
        ... 1 "The Lord of the Rings" "The Fellowship of the Ring" 1954 2
        ... 2 "The Lord of the Rings" "The Two Towers" 1954 1
        ... primary . "The Hobbit or There and Back Again" 1937 .
        ... '''
        >>> block = cif.read_string(CIF_DATA).sole_block()
        >>> table = access.get_table(block, "_citation")
        >>> # first sort without a key function
        >>> edit.sort(table, "id")
        >>> # This sorts the LOTR books properly, but the 'primary' book is at
        >>> # the bottom
        >>> print(block.as_string())
        data_test
        loop_
        _citation.id
        _citation.journal_full
        _citation.title
        _citation.year
        _citation.journal_volume
        1 "The Lord of the Rings" "The Fellowship of the Ring" 1954 2
        2 "The Lord of the Rings" "The Two Towers" 1954 1
        3 "The Lord of the Rings" "Return of the King" 1955 3
        primary . "The Hobbit or There and Back Again" 1937 .
        <BLANKLINE>
        >>> # sort again (this time by block), with a lambda that puts
        >>> # 'primary' first
        >>> edit.sort(
        ...     block,
        ...     "id",
        ...     category="_citation",
        ...     key=lambda row: (
        ...         (0, "") if row["id"] == "primary" else (1, row["id"])
        ...     ),
        ... )
        >>> print(block.as_string())
        data_test
        loop_
        _citation.id
        _citation.journal_full
        _citation.title
        _citation.year
        _citation.journal_volume
        primary . "The Hobbit or There and Back Again" 1937 .
        1 "The Lord of the Rings" "The Fellowship of the Ring" 1954 2
        2 "The Lord of the Rings" "The Two Towers" 1954 1
        3 "The Lord of the Rings" "Return of the King" 1955 3
        <BLANKLINE>

    Args:
        table_or_block (|gemmi.cif.Table|_ | |gemmicifBlock|): Object to be
            sorted. On |gemmicifBlock|, the corresponding table will be loaded
            using ``category``.
        item (str): Name of the column (item) in the table to sort by.
        category (str, optional): Name of the category when sorting a
            |gemmicifBlock|.
        key (callable, optional): Function taking a row and returning a
            sortable value. Defaults to lexicographic ``row[item]`` with a fix
            for numerical sorting.

    Returns:
        None

    Raises:
        ValueError: If ``table_or_block`` is a |gemmicifBlock| object but no
            ``category`` was provided.
    """
    table = table_or_block
    if isinstance(table_or_block, cif.Block):
        if category is None:
            raise ValueError(
                "Arg 'category' required for sorting gemmi.cif.Block"
            )
        table = access.get_table(table_or_block, category)

    if key is None:

        def _numbers_first(row):
            # Create tuples 0/ 1 for number/ string so numbers are compared
            # with numbers, strings with strings and if a string hits a
            # number, string is pushed down.
            try:
                return (0, int(row[item]))
            except ValueError:
                return (1, row[item])

        key = _numbers_first

    # Prepare guidance for re-ordering the table. Uses indices as gemmi.cif.Row
    # objects can not be used to identify list-items by index().
    n_rows = len(table)
    order = sorted(range(n_rows), key=lambda i: key(table[i]))

    # 'cur' mirrors where each original row sits right now, it is updated
    # along with every move in the table.
    cur = list(range(n_rows))
    for new_pos, row_id in enumerate(order):
        old_pos = cur.index(row_id)
        if old_pos == new_pos:
            continue
        table.move_row(old_pos=old_pos, new_pos=new_pos)
        cur.insert(new_pos, cur.pop(old_pos))
# LocalWords: gemmicifBlock func idx CIF qa str ValueError ModelArchive ndb # LocalWords: MoveIdxToFarError Args num pdbx nonpoly gemmi cif modelarchive # LocalWords: modelcif asym auth mon pdb HOH pos BLANKLINE bool itms msg # LocalWords: NotFoundCategoryError gemmicifblock NotFound iterable