"""Functionality to extend and modify ModelCIF files."""
# Copyright (c) 2026, SIB - Swiss Institute of Bioinformatics and
# Biozentrum - University of Basel
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# Pylint complains about too many lines. Since we have lots of documentation
# inline, lets ignore this.
# pylint: disable=too-many-lines
import sys
from gemmi import cif
from .. import _utils
from . import access
[docs]
class NotFoundError(RuntimeError):
    """General exception for 'things' that can not be found.

    If ``msg`` is omitted, generates a message
    "<SUBJECT> '<VALUE>' does not exist". If ``value`` is a list with more than
    one element, the message will be written in plural mode. If ``subject`` is
    a list or tuple, its first element is used as singular and its second
    element as plural of the subject.

    This exception should not be raised directly, it exists to define other
    "NotFound" exceptions inheriting from it.

    Args:
        subject (str|list|tuple): The 'thing' that can not be found, used in the
            generated message. If :class:`list` or :class:`tuple`, the first
            element is the singular and the second element the plural form.
        value (str|list|tuple): The name of what can not be found, used in the
            generated message. Provide a list of values to get a message
            fitting plural.
        msg (str): Optional alternative error message.
    """

    def __init__(self, subject, value, msg=None):
        if msg is None:
            plural = False
            if isinstance(value, (list, tuple)):
                # Only more than one name switches the message to plural.
                plural = len(value) > 1
                # str() every entry so non-string names do not break join().
                value = "', '".join(str(val) for val in value)
            if not isinstance(subject, str):
                # (singular, plural) pair: pick the form matching the count,
                # for plain strings and one-element lists alike.
                subject = subject[1] if plural else subject[0]
            dos = "do" if plural else "does"
            msg = f"{subject} '{value}' {dos} not exist."
        super().__init__(msg)
[docs]
class NotFoundCategoryError(NotFoundError):
    """Raised when an expected category is missing.

    Meant for functions that rely on a specific category being present in the
    corresponding |gemmicifBlock| but fail to retrieve it.

    Attributes:
        category (tuple): Tuple of category names that could not be found.

    Args:
        category (str|list): Name of the category that could not be found.
            Using a list of categories writes the generated message in plural.
        msg (str): Optional alternative error message.
    """

    def __init__(self, category=None, msg=None):
        if isinstance(category, tuple):
            names = category
        elif isinstance(category, list):
            names = tuple(category)
        else:
            # Single name (or None): wrap it, the attribute is always a tuple.
            names = (category,)
        self.category = names
        # The message is generated from the argument as it was passed in.
        super().__init__(("Category", "Categories"), category, msg)
[docs]
class NotFoundItemError(NotFoundError):
    """Raised when an expected item is missing.

    Meant for functions that rely on a specific item being present in the
    corresponding CIF category but fail to retrieve it.

    Attributes:
        item (tuple): Tuple of item names that could not be found.

    Args:
        item (str|list): Name of the item that could not be found. Use as
            "<CATEGORY>.<ITEM>" for clarity. Using a list of items writes the
            generated message in plural.
        msg (str): Optional alternative error message.
    """

    def __init__(self, item=None, msg=None):
        if isinstance(item, tuple):
            names = item
        elif isinstance(item, list):
            names = tuple(item)
        else:
            # Single name (or None): wrap it, the attribute is always a tuple.
            names = (item,)
        self.item = names
        # The message is generated from the argument as it was passed in.
        super().__init__(("Item", "Items"), item, msg)
def _get_idx_for_placement(plcmnt, block):
    """Translate ``[after|before]:<CATEGORY>`` into an item index of ``block``.

    Args:
        plcmnt (str): Relative placement of form ``[after|before]:<CATEGORY>``.
            The direction is compared case-insensitively.
        block (|gemmicifBlock|): Block hosting the reference category.

    Returns:
        int|None: Index to move to, or ``None`` (after issuing a warning) if
        the reference category can not be found in ``block``.

    Raises:
        ValueError: If ``plcmnt`` contains no ':' or if the direction is
            neither 'after' nor 'before'.
    """
    direction, colon, ref_cat = plcmnt.partition(":")
    if not colon:
        # ToDo: turn into own exception if needed
        raise ValueError(
            f"Couldn't split placement string '{plcmnt}', maybe the "
            + "':' is missing. Placement string needs to be of form "
            + "'[after|before]:<CATEGORY>'."
        )

    ref_table = access.get_table(block, ref_cat)
    if not ref_table:
        _utils.warn_msg(
            f"Category '{ref_cat}' for relative placement not found. "
            + "Skipping relocation."
        )
        return None

    keyword = direction.upper()
    if keyword == "BEFORE":
        # index of the first item of the reference category
        return block.get_index(ref_table.tags[0])
    if keyword == "AFTER":
        # Index of the last item of the reference category; step one further
        # unless that item is already the last one in the block.
        last_idx = block.get_index(ref_table.tags[-1])
        n_items = sum(1 for _ in block)
        return last_idx + 1 if last_idx < n_items - 1 else last_idx

    # ToDo: turn into own exception if needed
    raise ValueError(
        f"Relative placement string '{direction}' not recognised. "
        + "Valid directions are 'after' and 'before'."
    )
[docs]
class MoveIdxToFarError(RuntimeError):
    """Raised when a target position lies outside the document-category-list.

    Primarily used by :func:`move_category`, on the attempt to move a category
    to a position that does not exist within the corresponding |gemmicifBlock|.
    For example, if the |gemmicifBlock| object contains 10 categories, trying
    to move a category to position 15 will fail and should raise this
    exception.

    Args:
        category (str): Name of the category that could not be moved.
        idx (int): Target position to which the category was to be moved.
    """

    def __init__(self, category, idx):
        what = f"'{category}' to position '{idx}'"
        super().__init__(f"Cannot move {what}, exceeds range.")
[docs]
def move_category(block, cat, idx):
    """Relocate a category inside a |gemmicifBlock|.

    By design, ModelCIF files are not intended to be read or edited manually.
    Instead, dedicated applications should handle the format, providing
    functionality to view and modify the data. However, at `ModelArchive`_ we
    occasionally need to open ModelCIF files in an editor to inspect specific
    details. In such cases, it is helpful to have related categories grouped
    together, reducing the need to jump back and forth between different
    categories. This asks for a function to reposition categories within a
    ModelCIF file.

    :func:`move_category` takes category ``cat`` and moves it to position
    ``idx`` in the CIF block ``block``. ``idx`` comes in two flavours. As an
    integer it names the exact position to move ``cat`` to, which is handy
    for placing categories at the beginning (``idx=0``) or at the end
    (``idx=-1``) of ``block``. Since categories are typically organised
    relative to related categories, ``idx`` also understands the syntax
    ``[after|before]:<CATEGORY>``. For example, to put category
    ``_ma_qa_metric`` in front of category ``_ma_qa_metric_local``, use
    ``idx="before:_ma_qa_metric_local"`` for ``cat=_ma_qa_metric``.

    Examples:
        >>> from gemmi import cif
        >>> from modelarchive.modelcif import edit
        >>> # get sample CIF data
        >>> cif_data = '''data_test
        ... _ma_qa_metric.id 1
        ... _ma_qa_metric.description test_score
        ... loop_
        ... _ma_qa_metric_local.ordinal_id
        ... _ma_qa_metric_local.metric_value
        ... _ma_qa_metric_local.metric_id
        ... 1 1.0 1
        ... 2 1.5 1
        ... '''
        >>> block = cif.read_string(cif_data).sole_block()
        >>> # move _ma_qa_metric_local to BEFORE _ma_qa_metric
        >>> edit.move_category(
        ...     block,
        ...     "_ma_qa_metric_local",
        ...     "before:_ma_qa_metric",
        ... )
        >>> print(block.as_string())
        data_test
        loop_
        _ma_qa_metric_local.ordinal_id
        _ma_qa_metric_local.metric_value
        _ma_qa_metric_local.metric_id
        1 1.0 1
        2 1.5 1
        <BLANKLINE>
        _ma_qa_metric.id 1
        _ma_qa_metric.description test_score
        <BLANKLINE>
        >>> # move _ma_qa_metric to the front
        >>> edit.move_category(block, "_ma_qa_metric", 0)
        >>> print(block.as_string())
        data_test
        _ma_qa_metric.id 1
        _ma_qa_metric.description test_score
        <BLANKLINE>
        loop_
        _ma_qa_metric_local.ordinal_id
        _ma_qa_metric_local.metric_value
        _ma_qa_metric_local.metric_id
        1 1.0 1
        2 1.5 1
        <BLANKLINE>

    Args:
        block (|gemmicifBlock|): CIF block to operate on.
        cat (str): Name of the CIF category to be moved.
        idx (int|str): Position to move ``cat`` to. This can be an integer for
            exact positioning, or a string of form
            ``[after|before]:<CATEGORY>`` for relative positioning. In
            relative positioning, ``<CATEGORY>`` specifies the name of the
            category before or after which ``cat`` will be placed. If
            ``<CATEGORY>`` can not be found, ``cat`` will not be relocated.
            ``None`` leaves ``block`` untouched.

    Returns:
        None

    Raises:
        NotFoundCategoryError: If ``cat`` can not be found in ``block``.
        MoveIdxToFarError: If the target position is outside ``block``. For
            example, if ``block`` contains 10 categories, trying to move a
            category to position 15 will raise this error.
    """
    # Resolve a relative placement first; it may come back as None when the
    # reference category is missing.
    if isinstance(idx, str):
        idx = _get_idx_for_placement(idx, block)
    if idx is None:
        return

    table = access.get_table(block, cat)
    if not table:
        raise NotFoundCategoryError(cat)

    if table.loop is None:
        # name-value pairs are separate items, they get moved one by one
        _move_pairs(block, cat, idx, table)
        return

    # A loop is relocated as a whole, addressed via its first tag.
    try:
        block.move_item(block.get_index(table.tags[0]), idx)
    except RuntimeError as rexc:
        if str(rexc) != "move_item: new_pos out of range":
            raise  # pragma: no cover (fallback for general RuntimeError)
        raise MoveIdxToFarError(cat, idx) from None
def _move_pairs(block, cat, idx, table):
    """Move all name-value pairs of a category to position ``idx``.

    Each pair is moved on its own with ``block.move_item()``.

    Args:
        block (|gemmicifBlock|): Block hosting the category.
        cat (str): Category name, only used for the error message.
        idx (int): Target position inside ``block``.
        table: Table of the category; its ``tags`` name the pairs to move.

    Raises:
        MoveIdxToFarError: If ``block.move_item()`` reports the new position
            as out of range.
    """
    # Snapshot the tags first, block.move_item() rearranges the block's item
    # list while we are looping.
    tags = list(table.tags)
    if idx >= 0:
        # non-negative targets are served last tag first
        tags.reverse()

    for n_moved, tag in enumerate(tags):
        src = block.get_index(tag)
        # If the target lies behind the pair's current position, lower the
        # destination by the number of pairs already moved.
        dst = idx - n_moved if idx > src else idx
        try:
            block.move_item(src, dst)
        except RuntimeError as rexc:
            if str(rexc) != "move_item: new_pos out of range":
                raise  # pragma: no cover (fallback for general RuntimeError)
            raise MoveIdxToFarError(cat, idx) from None
[docs]
def make_res_per_chain_counter(asym_id_item):
    """Create a stateful callback numbering residues per chain.

    The function returned by :func:`make_res_per_chain_counter` can be used as
    ``callback`` in :func:`add_column`.

    Called row by row, the callback hands out consecutive residue numbers
    starting at 1. Whenever the chain identifier of a row differs from the one
    of the previous row, counting restarts at 1.

    Examples:
        >>> # Add item "ndb_seq_num" to category "_pdbx_nonpoly_scheme"
        >>> # Reminder: "ndb_seq_num" -> column, "_pdbx_nonpoly_scheme" -> table
        >>> from gemmi import cif
        >>> from modelarchive.modelcif import edit
        >>> cif_data = '''data_test
        ... loop_
        ... _pdbx_nonpoly_scheme.asym_id
        ... _pdbx_nonpoly_scheme.auth_seq_num
        ... _pdbx_nonpoly_scheme.entity_id
        ... _pdbx_nonpoly_scheme.mon_id
        ... _pdbx_nonpoly_scheme.pdb_seq_num
        ... C 1 3 ATP 1
        ... D 1 4 HEM 1
        ... E 1 5 HOH 1
        ... E 2 5 HOH 2
        ... '''
        >>> block = cif.read_string(cif_data).sole_block()
        >>> # Using make_res_per_chain_counter() in add_column() will add a
        >>> # column to the loop_ and populate it with values:
        >>> edit.add_column(
        ...     block,
        ...     "_pdbx_nonpoly_scheme",
        ...     "ndb_seq_num",
        ...     edit.make_res_per_chain_counter("asym_id"),  # CALLBACK
        ...     pos=5,
        ... )
        >>> print(block.as_string())
        data_test
        loop_
        _pdbx_nonpoly_scheme.asym_id
        _pdbx_nonpoly_scheme.auth_seq_num
        _pdbx_nonpoly_scheme.entity_id
        _pdbx_nonpoly_scheme.mon_id
        _pdbx_nonpoly_scheme.ndb_seq_num
        _pdbx_nonpoly_scheme.pdb_seq_num
        C 1 3 ATP 1 1
        D 1 4 HEM 1 1
        E 1 5 HOH 1 1
        E 2 5 HOH 2 2
        <BLANKLINE>
        >>> # "ndb_seq_num" is inserted as fifth column. The ATP in chain C
        >>> # ("asym_id") gets "ndb_seq_num" 1 and the HEM in chain D also gets
        >>> # "ndb_seq_num" 1. But the HOH, both live in chain E together, get
        >>> # "ndb_seq_num" 1 and 2. So for each chain, counting starts at 1
        >>> # and per compound in a chain, the counter is increased by 1.

    Args:
        asym_id_item (str): Item name hosting the chain name.

    Returns:
        Callable[[:class:`gemmi.cif.Table.Row`], int]: Callback function usable
        as ``callback`` in :func:`add_column`.

    Note:
        This function may be outsourced to a supporting module, if
        :mod:`~modelarchive.modelcif.edit` gets too big.
    """
    prev_chain = None
    counter = 0

    def callback(row):
        nonlocal prev_chain, counter
        chain = row[asym_id_item]
        # a change of chain restarts the numbering
        counter = 1 if prev_chain != chain else counter + 1
        prev_chain = chain
        return counter

    return callback
def _add_column(cat_itm, pos, table, block):
    """Add item ``cat_itm`` to a category, filled with the placeholder "?".

    Works for both layouts of a category, loops and name-value pairs.

    Args:
        cat_itm (str): Full name of the new item, "<CATEGORY>.<ITEM>".
        pos (int): 1-based position of the new item inside the category,
            -1 appends it.
        table: Table of the category as it was before adding the item.
        block (|gemmicifBlock|): Block hosting the category.
    """
    append = pos == -1
    loop = table.loop
    if loop is not None:
        # -1 is handed through as is, everything else becomes 0-based
        loop.add_columns([cat_itm], value="?", pos=pos if append else pos - 1)
        return

    # Name-value pairs: create the pair, then move it to the requested offset
    # relative to the first item of the category.
    block.set_pair(cat_itm, "?")
    offset = len(table.tags) if append else pos - 1
    block.move_item(
        block.get_index(cat_itm),
        block.get_index(table.tags[0]) + offset,
    )
[docs]
def add_column(
    block,
    category,
    item,
    callback,
    pos=-1,
    # mod_cat_itms=None,
    raw=False,
):
    # No clue how to reduce no. of arguments, so allow it
    # pylint: disable=too-many-arguments,too-many-positional-arguments
    """Extend a category with a new item and populate it using a callback.

    Thinking of ModelCIF categories as tables, this function adds a new column
    (item) to a table that already exists in ``block``. A ``callback``
    function, to be provided, is executed with each row to compute the value
    for the new column. This avoids having a static list to fetch the values
    from.

    :func:`make_res_per_chain_counter()` is an example of a stateful
    implementation of a working callback.

    The callback has to be of form ``function(row)`` and return the value to be
    set for the ``item`` in the given ``row``. If ``item`` already exists in
    ``category``, a warning is issued and nothing is changed.

    Examples:
        >>> # Add "ndb_seq_num" to "_pdbx_nonpoly_scheme" including values
        >>> # Reminder: "ndb_seq_num" -> column, "_pdbx_nonpoly_scheme" -> table
        >>> from gemmi import cif
        >>> from modelarchive.modelcif import edit
        >>> cif_data = '''data_test
        ... loop_
        ... _pdbx_nonpoly_scheme.asym_id
        ... _pdbx_nonpoly_scheme.entity_id
        ... _pdbx_nonpoly_scheme.mon_id
        ... _pdbx_nonpoly_scheme.pdb_seq_num
        ... C 1 ATP 1
        ... D 2 HEM 1
        ... E 3 HOH 1
        ... E 3 HOH 2
        ... '''
        >>> block = cif.read_string(cif_data).sole_block()
        >>> edit.add_column(
        ...     block,
        ...     "_pdbx_nonpoly_scheme",
        ...     "ndb_seq_num",
        ...     edit.make_res_per_chain_counter("asym_id"),
        ...     pos=-1,
        ... )
        >>> print(block.as_string())
        data_test
        loop_
        _pdbx_nonpoly_scheme.asym_id
        _pdbx_nonpoly_scheme.entity_id
        _pdbx_nonpoly_scheme.mon_id
        _pdbx_nonpoly_scheme.pdb_seq_num
        _pdbx_nonpoly_scheme.ndb_seq_num
        C 1 ATP 1 1
        D 2 HEM 1 1
        E 3 HOH 1 1
        E 3 HOH 2 2
        <BLANKLINE>
        >>> # "ndb_seq_num" was appended as last column according to pos=-1

    Args:
        block (|gemmicifBlock|): block holding the categories of the CIF
            document.
        category (str): The CIF category (table) to add the item to.
        item (str): The item (column) to be added.
        callback (Callable[[:class:`gemmi.cif.Table.Row`], Any]): Function to
            be executed to compute values for each row of the new column. The
            returned value is converted with :class:`str` before it is stored.
        pos (int): Position to insert the column at. Default is at the end (-1).
            Inserting at the beginning requires ``pos=1``.
        raw (bool): Force to not quote strings containing white-spaces.

    Returns:
        None

    Raises:
        NotFoundCategoryError: If ``category`` can not be found in ``block``.
    """
    # fetch original data
    table = access.get_table(block, category)
    if not table:
        raise NotFoundCategoryError(category)

    # init mod_cat_itms if needed
    # mod_cat_itms = _add_or_init_mod_cat_itms(mod_cat_itms, category)

    cat_itm = f"{category}.{item}"
    # check if column exists
    try:
        table.find_column(cat_itm)
    except RuntimeError:
        # If we get an exception, the item does not exist and we add it
        pass
    else:
        # if we do *NOT* get an exception, the item already exists
        _utils.warn_msg(f"'{cat_itm}' already exists, not updated.")
        # if len(mod_cat_itms[category]) == 0:
        #     del mod_cat_itms[category]
        # return mod_cat_itms
        return

    # add column, independent of category being a loop or name-value pairs
    _add_column(cat_itm, pos, table, block)

    # Re-fetch table: easier to work with but is not updated after adding a
    # column. Instead, the last column goes missing. That means if a column is
    # inserted in the middle of the table, the last column from the original
    # table becomes invisible.
    table = access.get_table(block, category)

    # add values
    for row in table:
        val = str(callback(row))
        if " " in val and not raw:
            val = cif.quote(val)
        row[item] = val

    # # Register item for revision annotation
    # if item not in mod_cat_itms[category]:
    #     mod_cat_itms[category].add(item)
    # return mod_cat_itms
def _add_loop(block, cat, itms, mod_cat_itms, raw):
    """Create category ``cat`` as a loop - supporter for `add_category()`.

    Hands ``itms`` to ``block.set_mmcif_category()`` and records every item
    name in ``mod_cat_itms[cat]``.
    """
    block.set_mmcif_category(cat, itms, raw=raw)
    mod_cat_itms[cat].update(itms.keys())
def _add_or_init_mod_cat_itms(mod_cat_itms, category):
    """Make sure the record of modified items has an entry for ``category``.

    Returns a fresh record if ``mod_cat_itms`` is ``None``; otherwise the
    passed-in dictionary, extended in place by an empty set for ``category``
    if it had no entry yet.
    """
    if mod_cat_itms is None:
        return {category: set()}
    # Adding to an existing record needs to be unit tested once in use.
    mod_cat_itms.setdefault(category, set())
    return mod_cat_itms
def _add_pairs(block, cat, itms, mod_cat_itms, raw):
    """Create ``cat`` as name-value pairs - supporter for `add_category()`.

    Of values given as list, only the first element is used. Every item name
    is recorded in ``mod_cat_itms[cat]``.
    """
    # payload for the set_pairs() call
    payload = {
        name: (val[0] if isinstance(val, list) else val)
        for name, val in itms.items()
    }
    block.set_pairs(f"{cat}.", payload, raw=raw)
    mod_cat_itms[cat].update(itms.keys())
[docs]
def add_category(
    block,
    category,
    item_data,
    index=None,
    mod_cat_itms=None,
    raw=False,
):
    # No clue how to reduce no. of arguments, so allow it
    # pylint: disable=too-many-positional-arguments,too-many-arguments
    """Introduce a new category to a |gemmicifblock| and populate it.

    Add ``category`` to ``block`` using data from ``item_data``. ``item_data``
    is a dictionary with the CIF item names as keys and values as values to the
    items. On single values, named-pairs will be created, on lists with more
    than one value, a loop will be created. The decision is taken by looking
    at the first value of ``item_data``. ``index`` can be used to place the
    category at a certain position. Use an integer for a specific place in the
    category list or a string of form ``[after|before]:<CATEGORY>`` for relative
    positioning.

    If ``category`` already exists in ``block`` or ``item_data`` is empty, a
    warning is issued and ``block`` is left untouched.

    Examples:
        >>> from gemmi import cif
        >>> from modelarchive.modelcif import edit
        >>> # start with an empty CIF document
        >>> cif_data = '''data_test
        ... '''
        >>> block = cif.read_string(cif_data).sole_block()
        >>> # lets add entities
        >>> _ = edit.add_category(
        ...     block,
        ...     "_entity",
        ...     {
        ...         "id": [1, 2, 3],
        ...         "type": ["polymer", "non-polymer", "water"],
        ...     },
        ... )
        >>> print(block.as_string())
        data_test
        loop_
        _entity.id
        _entity.type
        1 polymer
        2 non-polymer
        3 water
        <BLANKLINE>
        >>> # lets add an "_entry" ID before the entities
        >>> _ = edit.add_category(
        ...     block, "_entry", {"id": "1FOO"}, index="before:_entity"
        ... )
        >>> print(block.as_string())
        data_test
        _entry.id 1FOO
        <BLANKLINE>
        loop_
        _entity.id
        _entity.type
        1 polymer
        2 non-polymer
        3 water
        <BLANKLINE>

    Args:
        block (|gemmicifblock|): CIF data block holding the categories of the
            CIF document.
        category (str): Name of the new category to be created.
        item_data (dict[str, list[Any]|Any]): Attributes and values to be
            added to the new category. Dictionary with item names as keys.
            Values are either a list of values or a single value. If a single
            value is provided (or a list containing only one element), a named
            key-value pair is created instead of a loop.
        index (int|str): Placement of the new category within ``block``. This
            can be an integer for exact positioning, or a string of form
            ``[after|before]:<CATEGORY>`` for relative positioning. In relative
            positioning, ``<CATEGORY>`` specifies the name of the category
            before or after which ``category`` will be placed.
        mod_cat_itms (dict[str, set[str]] | None): A record of what has been
            modified. Dictionary of category assigned a set of items changed.
            Items which already have the value of the update, are not recorded.
            This is meant for the revision history, most likely you can ignore
            it.
        raw (bool, optional): If True, do not force quoting strings containing
            whitespace.

    Returns:
        dict[str, set[str]]: A record of what has been modified. To be used
        with a revision history, most likely you can ignore it.

    Raises:
        MoveIdxToFarError: If the target position is outside ``block``. For
            example, if ``block`` contains 10 categories, trying to create a
            category at position 15 will raise this error.
    """
    mod_cat_itms = _add_or_init_mod_cat_itms(mod_cat_itms, category)

    if access.get_table(block, category):
        _utils.warn_msg(
            f"Category '{category}' already exists, will not be added."
        )
    elif len(item_data) == 0:
        _utils.warn_msg(
            f"No items provided, not adding category '{category}'."
        )
    else:
        # The first value decides about the layout: anything but a list, or a
        # list of exactly one element, gives pairs; other lists give a loop.
        first = next(iter(item_data.values()))
        if isinstance(first, list) and len(first) != 1:
            _add_loop(block, category, item_data, mod_cat_itms, raw)
        else:
            _add_pairs(block, category, item_data, mod_cat_itms, raw)
        move_category(block, category, index)

    # Do not report a category for which no item has been recorded.
    if len(mod_cat_itms[category]) == 0:
        del mod_cat_itms[category]
    return mod_cat_itms
def _get_next_ordinal(table, category, item):
    """From the ordinal (row ID) column, get the highest number+1.

    Args:
        table: Table of ``category``, must provide ``find_column()``.
        category (str): Name of the category, without trailing '.'.
        item (str): Name of the ordinal item within ``category``.

    Returns:
        int: Highest value found in the column plus 1. If the column holds no
        values at all, counting starts at 1.

    Raises:
        ValueError: If a value of the column can not be converted by
            :class:`int`.
    """
    column = table.find_column(category + "." + item)
    # default=0 lets an empty column start at 1 instead of yielding a huge
    # negative number.
    return max((int(val) for val in column), default=0) + 1
def _validate_row_dict(row_dict):
    """Check that the input of add_rows() is valid and return a copy.

    Single values are wrapped into one-element lists, lists are copied so
    later modifications of the returned data never reach the caller's
    ``row_dict``.

    Args:
        row_dict (dict[str, list | Any]): Item names assigned a list of values
            or a single value.

    Returns:
        tuple[dict[str, list], int]: The normalised copy of ``row_dict`` and
        the number of rows it describes (0 for an empty ``row_dict``).

    Raises:
        ValueError: If the value lists are not all of the same length.
    """
    data = {
        itm: (list(val) if isinstance(val, list) else [val])
        for itm, val in row_dict.items()
    }
    lengths = {len(vals) for vals in data.values()}
    if len(lengths) > 1:
        raise ValueError("Lists of row_dict are not of equal length.")
    itm_count = lengths.pop() if lengths else 0
    return data, itm_count
def _ensure_category(
    block, category, itm_count, data, ordinal_item, mod_cat_itms, raw
):
    # Just a helper function removing complexity from add_rows(), allow all args
    # pylint: disable=too-many-positional-arguments,too-many-arguments
    """Create ``category`` from ``data`` - supporter for `add_rows()`.

    If an ordinal item is requested but not part of ``data``, it is put in
    front of the other items, counting from 1 to ``itm_count``. Creation is
    delegated to :func:`add_category`; ``mod_cat_itms`` is returned.
    """
    if ordinal_item is not None and ordinal_item not in data:
        ordinals = list(range(1, itm_count + 1))
        # new dict, the ordinal goes first
        data = {ordinal_item: ordinals, **data}
    add_category(block, category, data, mod_cat_itms=mod_cat_itms, raw=raw)
    return mod_cat_itms
def _build_loop_row(i, itm_names, data, ordinal_item, next_ordinal, raw):
    # Just a helper function removing complexity from add_rows(), allow all args
    # pylint: disable=too-many-positional-arguments,too-many-arguments
    """Assemble a row to be added with add_rows().

    ``data`` is only read, never modified.

    Args:
        i (int): Index of the row within the value lists of ``data``.
        itm_names (list[str]): Item names of the loop, in loop order.
        data (dict[str, list]): Values per item name.
        ordinal_item (str | None): Name of the ordinal item, if any.
        next_ordinal (int): Ordinal to use if ``ordinal_item`` is not covered
            by ``data``.
        raw (bool): If True, values containing spaces are not quoted.

    Returns:
        tuple[list[str], int]: The row as list of strings, ordered like
        ``itm_names``, and the ordinal to be used for the following row.
    """
    loop_row = []
    for itm in itm_names:
        if itm in data:
            val = str(data[itm][i])
            if " " in val and not raw:
                val = cif.quote(val)
        elif itm == ordinal_item:
            # ordinal not provided by the caller, continue counting
            val = str(next_ordinal)
            next_ordinal += 1
        else:
            # item exists in the loop but got no value for the new row
            val = "."
        loop_row.append(val)
    return loop_row, next_ordinal
[docs]
def add_rows(
    block,
    category,
    row_dict,
    ordinal_item="ordinal",
    mod_cat_itms=None,
    raw=False,
):
    # No clue how to reduce no. of arguments, so allow it
    # pylint: disable=too-many-positional-arguments,too-many-arguments
    """Add rows to a ``category`` in ``block`` using an item-dictionary.

    Thinking of ModelCIF categories as tables, this function adds new rows
    (items) to a table (``category``) in ``block``. If ``category`` does not yet
    exist, it will be created. If multiple rows are provided, the new
    ``category`` will be created as loop, pairs otherwise. When adding row(s)
    to an existing pairs-category, the function will convert the ``category``
    into a loop.

    Input data is provided via ``row_dict``. It must be a :class:`dict` of
    :class:`list` (for a single row, values may be single elements instead of
    lists). Item names are used as keys in ``row_dict``. Missing items that
    exist in ``category`` will be added as ``.`` in new rows. The order of
    items in ``row_dict`` can be arbitrary; this function will align them with
    the existing order in ``category``.

    ``ordinal_item`` describes a unique numerical ID for each row. If provided,
    the function will automatically increment it for new rows. In `ModelCIF`_,
    this column is often called ``ordinal`` though some categories use
    different names.

    Examples:
        >>> from gemmi import cif
        >>> from modelarchive.modelcif import edit
        >>> # start with an empty CIF document
        >>> cif_data = '''data_test
        ... '''
        >>> block = cif.read_string(cif_data).sole_block()
        >>> # Lets add an entity to create a category in block. ordinal_item
        >>> # is set to None on purpose to show how it works later.
        >>> _ = edit.add_rows(
        ...     block,
        ...     "_entity",
        ...     {"id": 1, "details": "Protein", "type": "polymer"},
        ...     ordinal_item=None,
        ... )
        >>> # see how the _entity category is created as couple of pairs
        >>> print(block.as_string())
        data_test
        _entity.id 1
        _entity.details Protein
        _entity.type polymer
        <BLANKLINE>
        >>> # Add a second row (pairs will turn into a loop). This time, include
        >>> # ordinal_item to let the function take care of incrementing IDs.
        >>> _ = edit.add_rows(
        ...     block,
        ...     "_entity",
        ...     {"details": ["H2O"], "type": ["water"]},
        ...     ordinal_item="id",
        ... )
        >>> # Now _entity is a loop and _entity.id was incremented automatically
        >>> print(block.as_string())
        data_test
        loop_
        _entity.id
        _entity.details
        _entity.type
        1 Protein polymer
        2 H2O water
        <BLANKLINE>
        >>> # As a last example, add multiple new rows at once but skip the
        >>> # 'details' column.
        >>> _ = edit.add_rows(
        ...     block,
        ...     "_entity",
        ...     {"type": ["polymer", "polymer"]},
        ...     ordinal_item="id",
        ... )
        >>> # Now there are two more polymer entities in the loop but since
        >>> # the 'details' information was missing, the function added '.' in
        >>> # those fields.
        >>> print(block.as_string())
        data_test
        loop_
        _entity.id
        _entity.details
        _entity.type
        1 Protein polymer
        2 H2O water
        3 . polymer
        4 . polymer
        <BLANKLINE>

    Args:
        block (|gemmicifblock|): CIF data block holding the categories of the
            CIF document.
        category (str): Name of the category to which row(s) will be added. A
            trailing '.' is ignored.
        row_dict (dict[str, list | Any]): Row data to be added to ``category``.
            Keys are item names of the category. Values must be lists when
            adding multiple rows. For a single row, values may be provided as
            scalars instead of lists. If an item is missing from ``row_dict``
            but exists in the category, '.' will be assigned for that item in
            the new row(s).
        ordinal_item (str | None): If the category includes an ordinal (in
            database terms a primary key), this identifies the item name of
            it. If ``ordinal_item`` is provided, the latest ordinal will be
            read from the category and automatically incremented for new rows.
            Use ``None`` in case the category does not have an ordinal or if
            the ordinal should be set explicitly. The ordinal does not need to
            be included in ``row_dict``.
        mod_cat_itms (dict[str, set[str]] | None): A record of what has been
            modified. Dictionary of category assigned a set of items changed.
            Items which already have the value of the update, are not recorded.
            This is meant for the revision history, most likely you can ignore
            it.
        raw (bool, optional): If True, do not force quoting strings containing
            whitespace.

    Returns:
        dict[str, set[str]]: A record of what has been modified. To be used
        with a revision history, most likely you can ignore it. If ``row_dict``
        is empty, ``mod_cat_itms`` is handed back exactly as it was passed in.

    Raises:
        ValueError: In case item lists in ``row_dict`` are not of equal length.
    """
    if category.endswith("."):
        category = category[:-1]

    if not row_dict:
        _utils.warn_msg(f"No data to be added for '{category}', skipped.")
        return mod_cat_itms

    # check that all value lists are of same length
    data, itm_count = _validate_row_dict(row_dict)

    mod_cat_itms = _add_or_init_mod_cat_itms(mod_cat_itms, category)

    # check if category already exists
    table = access.get_table(block, category)
    if not table:
        return _ensure_category(
            block, category, itm_count, data, ordinal_item, mod_cat_itms, raw
        )

    # get the next row ID before turning table into a loop
    next_ordinal = 1
    if ordinal_item is not None:
        next_ordinal = _get_next_ordinal(table, category, ordinal_item)

    # if exists but is a pair, turn into loop to add more rows
    table.ensure_loop()
    loop = table.loop

    # Item names in loop order, stripped of the "<CATEGORY>." prefix. Splitting
    # only at the first '.' keeps item names containing further dots intact.
    # NOTE(review): keys of row_dict not present in the loop are not written
    # anywhere - confirm that dropping them silently is intended.
    itm_names = [tag.split(".", 1)[1] for tag in loop.tags]
    mod_cat_itms[category].update(itm_names)

    # Rows are lists of (string) values, ordered like the loop's tags.
    for i in range(itm_count):
        loop_row, next_ordinal = _build_loop_row(
            i, itm_names, data, ordinal_item, next_ordinal, raw
        )
        loop.add_row(loop_row)

    return mod_cat_itms
[docs]
def sort(table_or_block, item, category=None, key=None):
    """Sort a |gemmi.cif.Table|_ or |gemmicifBlock| in-place by the given item.

    This may be useful after editing a table, to sort it by a selected column
    (e.g. the ordinal). By default, values that read as integers are sorted
    numerically and come first, all others follow in lexicographic order.
    ``key`` can take a function to extract a comparison key from each row.
    This is helpful for cases like ``_citation.id``, where special values
    (e.g. ``id=primary``) might need to be placed first.

    Works on an already loaded |gemmi.cif.Table|_, or on a |gemmicifBlock|
    (requires ``category``) to sort many categories one after another in less
    code.

    Examples:
        >>> from gemmi import cif
        >>> from modelarchive.modelcif import access, edit
        >>> # start with an unsorted citation loop
        >>> CIF_DATA = '''data_test
        ... loop_
        ... _citation.id
        ... _citation.journal_full
        ... _citation.title
        ... _citation.year
        ... _citation.journal_volume
        ... 3 "The Lord of the Rings" "Return of the King" 1955 3
        ... 1 "The Lord of the Rings" "The Fellowship of the Ring" 1954 2
        ... 2 "The Lord of the Rings" "The Two Towers" 1954 1
        ... primary . "The Hobbit or There and Back Again" 1937 .
        ... '''
        >>> block = cif.read_string(CIF_DATA).sole_block()
        >>> table = access.get_table(block, "_citation")
        >>> # first sort without a key function
        >>> edit.sort(table, "id")
        >>> # This sorts the LOTR books properly, but the 'primary' book is at
        >>> # the bottom
        >>> print(block.as_string())
        data_test
        loop_
        _citation.id
        _citation.journal_full
        _citation.title
        _citation.year
        _citation.journal_volume
        1 "The Lord of the Rings" "The Fellowship of the Ring" 1954 2
        2 "The Lord of the Rings" "The Two Towers" 1954 1
        3 "The Lord of the Rings" "Return of the King" 1955 3
        primary . "The Hobbit or There and Back Again" 1937 .
        <BLANKLINE>
        >>> # sort again (this time by block), with a lambda that puts
        >>> # 'primary' first
        >>> edit.sort(
        ...     block,
        ...     "id",
        ...     category="_citation",
        ...     key=lambda row: (
        ...         (0, "") if row["id"] == "primary" else (1, row["id"])
        ...     ),
        ... )
        >>> print(block.as_string())
        data_test
        loop_
        _citation.id
        _citation.journal_full
        _citation.title
        _citation.year
        _citation.journal_volume
        primary . "The Hobbit or There and Back Again" 1937 .
        1 "The Lord of the Rings" "The Fellowship of the Ring" 1954 2
        2 "The Lord of the Rings" "The Two Towers" 1954 1
        3 "The Lord of the Rings" "Return of the King" 1955 3
        <BLANKLINE>

    Args:
        table_or_block (|gemmi.cif.Table|_ | |gemmicifBlock|):
            Object to be sorted. On |gemmicifBlock|, the corresponding table
            will be loaded using ``category``.
        item (str): Name of the column (item) in the table to sort by.
        category (str, optional): Name of the category when sorting a
            |gemmicifBlock|.
        key (callable, optional): Function taking a row and returning a
            sortable value. Defaults to lexicographic ``row[item]`` with a fix
            for sorting integers numerically.

    Returns:
        None

    Raises:
        ValueError: If ``table_or_block`` is a |gemmicifBlock| object but no
            ``category`` was provided.
    """
    if not isinstance(table_or_block, cif.Block):
        table = table_or_block
    elif category is None:
        raise ValueError(
            "Arg 'category' required for sorting gemmi.cif.Block"
        )
    else:
        table = access.get_table(table_or_block, category)

    if key is None:

        def key(row):
            # Tag values with 0 (number) or 1 (string): numbers are compared
            # with numbers, strings with strings, and whenever a string meets
            # a number, the string is pushed down.
            cell = row[item]
            try:
                return (0, int(cell))
            except ValueError:
                return (1, cell)

    # Work on row indices, gemmi.cif.Row objects can not be used to identify
    # list-items by index().
    n_rows = len(table)
    wanted = sorted(range(n_rows), key=lambda i: key(table[i]))
    # 'layout' mirrors the current row order of the table in terms of the
    # original row indices.
    layout = list(range(n_rows))
    for target, row_id in enumerate(wanted):
        current = layout.index(row_id)
        if current == target:
            continue
        table.move_row(old_pos=current, new_pos=target)
        layout.insert(target, layout.pop(current))
# LocalWords: gemmicifBlock func idx CIF qa str ValueError ModelArchive ndb
# LocalWords: MoveIdxToFarError Args num pdbx nonpoly gemmi cif modelarchive
# LocalWords: modelcif asym auth mon pdb HOH pos BLANKLINE bool itms msg
# LocalWords: NotFoundCategoryError gemmicifblock NotFound iterable