Source code for modelarchive.modelcif.edit

"""Functionality to extend and modify ModelCIF files."""

# Copyright (c) 2026, SIB - Swiss Institute of Bioinformatics and
#                     Biozentrum - University of Basel
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# Pylint complains about too many lines. Since we have lots of documentation
# inline, let's ignore this.
# pylint: disable=too-many-lines

import sys

from gemmi import cif

from .. import _utils
from . import access



[docs]
class NotFoundError(RuntimeError):
    """General exception for 'things' that can not be found.

    If ``msg`` is omitted (``None``), a message of the form
    "<SUBJECT> '<VALUE>' does not exist." is generated. If ``value`` is a
    list or tuple with more than one element, the message is written in plural
    mode ("... do not exist."). If ``subject`` is a list or tuple, its first
    element is used as singular and its second element as plural of the
    subject.

    This exception should not be raised directly, it exists to define other
    "NotFound" exceptions inheriting from it.

    Args:
        subject (str|list|tuple): The 'thing' that can not be found, used in the
            generated message. If :class:`list` or :class:`tuple`, the first
            element is the singular and the second element the plural form.
        value (str|list|tuple): The name of what can not be found, used in the
            generated message. Provide a list of values to get a message
            fitting plural.
        msg (str): Optional alternative error message. If given, ``subject``
            and ``value`` are not used for the message.
    """

    def __init__(self, subject, value, msg=None):
        if msg is None:
            plural = False
            if isinstance(value, (list, tuple)):
                plural = len(value) > 1
                # str() keeps non-string entries (e.g. None) from breaking
                # the join
                value = "', '".join(str(val) for val in value)
            if not isinstance(subject, str):
                # (singular, plural) pair: always reduce it to one word, also
                # for a list holding just a single value
                subject = subject[1] if plural else subject[0]
            verb = "do" if plural else "does"
            msg = f"{subject} '{value}' {verb} not exist."
        super().__init__(msg)




[docs]
class NotFoundCategoryError(NotFoundError):
    """Exception for a category that is missing from a CIF block.

    Raise this when a function relies on a specific category being present
    in the corresponding |gemmicifBlock|, but the category can not be
    retrieved.

    Attributes:
        category (tuple): Tuple of category names that could not be found.

    Args:
        category (str|list): Name of the category that could not be found.
            Using a list of categories writes the generated message in plural.
        msg (str): Optional alternative error message.
    """

    def __init__(self, category=None, msg=None):
        # Always expose the names as a tuple, whatever came in
        if isinstance(category, tuple):
            names = category
        elif isinstance(category, list):
            names = tuple(category)
        else:
            names = (category,)
        self.category = names
        super().__init__(("Category", "Categories"), category, msg)




[docs]
class NotFoundItemError(NotFoundError):
    """Exception for an item that is missing from a CIF category.

    Raise this when a function relies on a specific item being present in
    the corresponding CIF category, but the item can not be retrieved.

    Attributes:
        item (tuple): Tuple of item names that could not be found.

    Args:
        item (str|list): Name of the item that could not be found. Use as
            "<CATEGORY>.<ITEM>" for clarity. Using a list of items writes the
            generated message in plural.
        msg (str): Optional alternative error message.
    """

    def __init__(self, item=None, msg=None):
        # Always expose the names as a tuple, whatever came in
        if isinstance(item, tuple):
            names = item
        elif isinstance(item, list):
            names = tuple(item)
        else:
            names = (item,)
        self.item = names
        super().__init__(("Item", "Items"), item, msg)



def _get_idx_for_placement(plcmnt, tbmvd, block):
    """Turn a relative placement into an index in a block.

    Args:
        plcmnt (str): Placement string of form ``[after|before]:<CATEGORY>``.
            The direction is matched case-insensitively.
        tbmvd: Table of the category "to be moved"; only its first tag is
            used to look up the current position.
        block: CIF block holding both categories.

    Returns:
        int|None: Index to move the category to, as reported by
        ``block.get_index()``. ``None`` if ``<CATEGORY>`` does not exist in
        ``block`` (a warning is issued, the category should not be moved).

    Raises:
        ValueError: If ``plcmnt`` has no ``:`` separator or the direction is
            neither ``after`` nor ``before``.
    """
    try:
        pos, cat = plcmnt.split(":", maxsplit=1)
    except ValueError:
        # ToDo: turn into own exception if needed
        raise ValueError(
            f"Couldn't split placement string '{plcmnt}', maybe the "
            + "':' is missing. Placement string needs to be of form "
            + "'[after|before]:<CATEGORY>'."
        ) from None
    table = access.get_table(block, cat)
    if not table:
        _utils.warn_msg(
            f"Category '{cat}' for relative placement not found. "
            + "Skipping relocation."
        )
        return None

    direction = pos.upper()
    if direction == "BEFORE":
        return block.get_index(table.tags[0])
    if direction == "AFTER":
        # get the index of the first tag of the category to be moved
        tbm_idx = block.get_index(tbmvd.tags[0])
        idx = block.get_index(table.tags[-1])
        # when moving from before category, hit the same index to push it down
        if tbm_idx < idx:
            return idx
        # when pushing from behind category, move to index + 1
        if idx < sum(1 for _ in block) - 1:
            return idx + 1
        # The anchor is the last item of the block and not located behind the
        # item to be moved: there is no position further back, so stay at the
        # anchor index instead of falling through to the "not recognised"
        # error below.
        return idx

    # ToDo: turn into own exception if needed
    raise ValueError(
        f"Relative placement string '{pos}' not recognised. "
        + "Valid directions are 'after' and 'before'."
    )



[docs]
class MoveIdxToFarError(RuntimeError):
    """Exception for a target position beyond the category list of a block.

    Mainly raised by :func:`move_category` when a category is to be moved to
    a position that does not exist within the corresponding |gemmicifBlock|.
    For example, with 10 categories in the |gemmicifBlock| object, moving a
    category to position 15 fails and should raise this exception.

    Args:
        category (str): Name of the category that could not be moved.
        idx (int): Target position to which the category was to be moved.
    """

    def __init__(self, category, idx):
        message = f"Cannot move '{category}' to position '{idx}', exceeds range."
        super().__init__(message)




[docs]
def move_category(block, cat, idx):
    """Relocate a category inside a |gemmicifBlock|.

    ModelCIF files are designed to be processed by dedicated applications
    rather than to be read or edited by hand. Still, at `ModelArchive`_ we
    sometimes open a ModelCIF file in an editor to look up specific details.
    Having related categories next to each other then avoids jumping back and
    forth through the file, which is what this function is for.

    Category ``cat`` is moved to position ``idx`` of the CIF block ``block``.
    ``idx`` comes in two flavours. As an integer, it is the exact position
    to move ``cat`` to, which is handy for the beginning (``idx=0``) or the
    end (``idx=-1``) of ``block``. Since categories are usually arranged
    relative to related categories, ``idx`` may also be a string of form
    ``[after|before]:<CATEGORY>``. For example, to put category
    ``_ma_qa_metric`` in front of category ``_ma_qa_metric_local``, use
    ``idx="before:_ma_qa_metric_local"`` for ``cat=_ma_qa_metric``.

    Examples:
        >>> from gemmi import cif
        >>> from modelarchive.modelcif import edit
        >>> # get sample CIF data
        >>> cif_data = '''data_test
        ... _ma_qa_metric.id 1
        ... _ma_qa_metric.description test_score
        ... loop_
        ... _ma_qa_metric_local.ordinal_id
        ... _ma_qa_metric_local.metric_value
        ... _ma_qa_metric_local.metric_id
        ... 1 1.0 1
        ... 2 1.5 1
        ... '''
        >>> block = cif.read_string(cif_data).sole_block()
        >>> # move _ma_qa_metric_local to BEFORE _ma_qa_metric
        >>> edit.move_category(
        ...     block,
        ...     "_ma_qa_metric_local",
        ...     "before:_ma_qa_metric",
        ... )
        >>> print(block.as_string())
        data_test
        loop_
        _ma_qa_metric_local.ordinal_id
        _ma_qa_metric_local.metric_value
        _ma_qa_metric_local.metric_id
        1 1.0 1
        2 1.5 1
        <BLANKLINE>
        _ma_qa_metric.id 1
        _ma_qa_metric.description test_score
        <BLANKLINE>
        >>> # move _ma_qa_metric to the front
        >>> edit.move_category(block, "_ma_qa_metric", 0)
        >>> print(block.as_string())
        data_test
        _ma_qa_metric.id 1
        _ma_qa_metric.description test_score
        <BLANKLINE>
        loop_
        _ma_qa_metric_local.ordinal_id
        _ma_qa_metric_local.metric_value
        _ma_qa_metric_local.metric_id
        1 1.0 1
        2 1.5 1
        <BLANKLINE>

    Args:
        block (|gemmicifBlock|): CIF block to operate on.
        cat (str): Name of the CIF category to be moved.
        idx (int|str|None): Position to move ``cat`` to. This can be an
            integer for exact positioning, or a string of form
            ``[after|before]:<CATEGORY>`` for relative positioning. In
            relative positioning, ``<CATEGORY>`` specifies the name of the
            category before or after which ``cat`` will be placed. If
            ``<CATEGORY>`` can not be found, ``cat`` will not be relocated.
            With ``None``, ``cat`` stays where it is.

    Returns:
        None

    Raises:
        NotFoundCategoryError: If ``cat`` can not be found in ``block``.
        MoveIdxToFarError: If the target position is outside ``block``. For
            example, if ``block`` contains 10 categories, trying to move a
            category to position 15 will raise this error.
        ValueError: If a relative placement string is malformed.
    """
    table = access.get_table(block, cat)
    if not table:
        raise NotFoundCategoryError(cat)
    if idx is None:
        # nothing requested, leave the category in place
        return

    target = idx
    if isinstance(target, str):
        # resolve "[after|before]:<CATEGORY>" into a plain index
        target = _get_idx_for_placement(target, table, block)
        if target is None:
            return

    if table.loop is None:
        # named pairs are individual items in the block, move them one by one
        _move_pairs(block, cat, target, table)
        return

    # a loop is a single item in the block
    try:
        block.move_item(block.get_index(table.tags[0]), target)
    except RuntimeError as err:
        if str(err) != "move_item: new_pos out of range":
            raise  # pragma: no cover (fallback for general RuntimeError)
        raise MoveIdxToFarError(cat, target) from None



def _move_pairs(block, cat, idx, table):
    """Move all items of a named-pair category to position ``idx``.

    The tags of ``table`` are moved one after another. For a negative
    ``idx`` they are processed in their original order, otherwise in reversed
    order.

    Raises:
        MoveIdxToFarError: If ``block`` reports the new position as out of
            range.
    """
    # Work on a private copy of the tag list since the item list of the block
    # is modified while looping.
    tags = list(table.tags)
    if idx >= 0:
        tags.reverse()
    for offset, tag in enumerate(tags):
        src = block.get_index(tag)
        # adapt the destination to moving before/ after a category
        dst = idx - offset if idx > src else idx
        try:
            block.move_item(src, dst)
        except RuntimeError as err:
            if str(err) != "move_item: new_pos out of range":
                raise  # pragma: no cover (fallback for general RuntimeError)
            raise MoveIdxToFarError(cat, idx) from None



[docs]
def make_copy_value_in_row(src_item):
    """Create a callback that returns the value of another item in a row.

    The result is meant for functions that take a callback, e.g.
    :func:`add_column`.

    Use it to copy values over within the same row. That is handy when a
    missing column needs to be populated with values, e.g. if author defined
    values are missing but required, copy over the "label" fields.

    Examples:
        >>> from gemmi import cif
        >>> from modelarchive.modelcif import edit
        >>> # Add _atom_site.auth_comp_id from _atom_site.label_comp_id
        >>> # Note: category _atom_site is heavily cropped in this example to
        >>> # keep it concise
        >>> CIF_DATA = '''data_test
        ... #
        ... loop_
        ... _atom_site.group_PDB
        ... _atom_site.type_symbol
        ... _atom_site.label_atom_id
        ... _atom_site.label_comp_id
        ... _atom_site.label_asym_id
        ... _atom_site.auth_seq_id
        ... ATOM C CA MET A 1
        ... ATOM C CA ALA A 2
        ... ATOM C CA THR A 3
        ... ATOM C CA ALA A 4
        ... ATOM C CA ALA A 5
        ... ATOM C CA TYR A 6
        ... '''
        >>> block = cif.read_string(CIF_DATA).sole_block()
        >>> edit.add_column(
        ...     block,
        ...     "_atom_site",
        ...     "auth_comp_id",
        ...     edit.make_copy_value_in_row("label_comp_id"),
        ... )
        >>> print(block.as_string())
        data_test
        loop_
        _atom_site.group_PDB
        _atom_site.type_symbol
        _atom_site.label_atom_id
        _atom_site.label_comp_id
        _atom_site.label_asym_id
        _atom_site.auth_seq_id
        _atom_site.auth_comp_id
        ATOM C CA MET A 1 MET
        ATOM C CA ALA A 2 ALA
        ATOM C CA THR A 3 THR
        ATOM C CA ALA A 4 ALA
        ATOM C CA ALA A 5 ALA
        ATOM C CA TYR A 6 TYR
        <BLANKLINE>

    Args:
        src_item (str): the column name to copy from (source).

    Returns:
        Callable[[:class:`gemmi.cif.Table.Row`], str]: Callback function usable
        as ``callback`` in :func:`add_column`.

    Note:
        This function may be outsourced to a supporting module, if
        :mod:`~modelarchive.modelcif.edit` gets too big.
    """

    def copy_from_source(row):
        # plain lookup of the source item in the row handed in
        value = row[src_item]
        return value

    return copy_from_source




[docs]
def make_res_per_chain_counter(asym_id_item):
    """Create a stateful callback numbering residues per chain.

    The function returned by :func:`make_res_per_chain_counter` can be used
    as ``callback`` in :func:`add_column`.

    While iterating over the rows of a table, the callback hands out
    consecutive residue numbers within each chain, starting at 1. Whenever
    the chain identifier differs from the one of the previous row, counting
    restarts at 1.

    Examples:
        >>> # Add item "ndb_seq_num" to category "_pdbx_nonpoly_scheme"
        >>> # Reminder: "ndb_seq_num" -> column, "_pdbx_nonpoly_scheme" -> table
        >>> from gemmi import cif
        >>> from modelarchive.modelcif import edit
        >>> cif_data = '''data_test
        ... loop_
        ... _pdbx_nonpoly_scheme.asym_id
        ... _pdbx_nonpoly_scheme.auth_seq_num
        ... _pdbx_nonpoly_scheme.entity_id
        ... _pdbx_nonpoly_scheme.mon_id
        ... _pdbx_nonpoly_scheme.pdb_seq_num
        ... C 1 3  ATP 1
        ... D 1 4  HEM 1
        ... E 1 5  HOH 1
        ... E 2 5  HOH 2
        ... '''
        >>> block = cif.read_string(cif_data).sole_block()
        >>> # Using make_res_per_chain_counter() in add_column() will add a
        >>> # column to the loop_ and populate it with values:
        >>> edit.add_column(
        ...     block,
        ...     "_pdbx_nonpoly_scheme",
        ...     "ndb_seq_num",
        ...     edit.make_res_per_chain_counter("asym_id"), # CALLBACK
        ...     pos=5,
        ... )
        >>> print(block.as_string())
        data_test
        loop_
        _pdbx_nonpoly_scheme.asym_id
        _pdbx_nonpoly_scheme.auth_seq_num
        _pdbx_nonpoly_scheme.entity_id
        _pdbx_nonpoly_scheme.mon_id
        _pdbx_nonpoly_scheme.ndb_seq_num
        _pdbx_nonpoly_scheme.pdb_seq_num
        C 1 3 ATP 1 1
        D 1 4 HEM 1 1
        E 1 5 HOH 1 1
        E 2 5 HOH 2 2
        <BLANKLINE>
        >>> # "ndb_seq_num" is inserted as fifth column. The ATP in chain C
        >>> # ("asym_id") gets "ndb_seq_num" 1 and the HEM in chain D also gets
        >>> # "ndb_seq_num" 1. But the HOH, both live in chain E together, get
        >>> # "ndb_seq_num" 1 and 2. So for each chain, counting starts at 1
        >>> # and per compound in a chain, the counter is increased by 1.

    Args:
        asym_id_item (str): Item name hosting the chain name.

    Returns:
        Callable[[:class:`gemmi.cif.Table.Row`], int]: Callback function usable
        as ``callback`` in :func:`add_column`.

    Note:
        This function may be outsourced to a supporting module, if
        :mod:`~modelarchive.modelcif.edit` gets too big.
    """
    # State shared between calls of the callback: the chain seen in the
    # previous row and the number handed out for it.
    state = {"chain": None, "count": 0}

    def count_residue(row):
        chain = row[asym_id_item]
        if state["chain"] != chain:
            # new chain, restart counting
            state["count"] = 1
        else:
            state["count"] += 1
        state["chain"] = chain
        return state["count"]

    return count_residue



def _add_column(cat_itm, pos, table, block):
    """Add the new item ``cat_itm`` with value ``?`` to an existing category.

    Works for both layouts: loop categories get a new column, name-value pair
    categories get a new pair which is then moved next to the existing pairs.

    Args:
        cat_itm (str): Full tag of the new item ("<CATEGORY>.<ITEM>").
        pos (int): 1-based position of the new item within the category,
            ``-1`` to append it.
        table: Table of the category as fetched before adding the item.
        block: CIF block the category lives in.
    """
    loop = table.loop
    if loop is not None:
        # loop category: translate the 1-based position, keep -1 (append)
        col_pos = pos if pos == -1 else pos - 1
        loop.add_columns([cat_itm], value="?", pos=col_pos)
        return

    # name-value pairs: create the pair first, then move it relative to the
    # first item of the category
    block.set_pair(cat_itm, "?")
    offset = len(table.tags) if pos == -1 else pos - 1
    block.move_item(
        block.get_index(cat_itm),
        block.get_index(table.tags[0]) + offset,
    )



[docs]
def add_column(
    block,
    category,
    item,
    callback,
    pos=-1,
    # mod_cat_itms=None,
    raw=False,
):
    # No clue how to reduce no. of arguments, so allow it
    # pylint: disable=too-many-arguments,too-many-positional-arguments
    """Extend a category with a new item and populate it using a callback.

    Thinking of ModelCIF categories as tables, this function adds a new column
    (item) to a table that already exists in ``block``. A ``callback``
    function, to be provided, is executed with each row to compute the value
    for the new column. This avoids having a static list to fetch the values
    from.

    :func:`make_res_per_chain_counter()` is an example of a stateful
    implementation of a working callback.

    The callback has to be of form ``function(row)`` and return the value to be
    set for the ``item`` in the given ``row``. The returned value is converted
    with :class:`str` before it is stored.

    If ``item`` already exists in ``category``, a warning is issued and the
    category is left untouched.

    Examples:
        >>> # Add "ndb_seq_num" to "_pdbx_nonpoly_scheme" including values
        >>> # Reminder: "ndb_seq_num" -> column, "_pdbx_nonpoly_scheme" -> table
        >>> from gemmi import cif
        >>> from modelarchive.modelcif import edit
        >>> cif_data = '''data_test
        ... loop_
        ... _pdbx_nonpoly_scheme.asym_id
        ... _pdbx_nonpoly_scheme.entity_id
        ... _pdbx_nonpoly_scheme.mon_id
        ... _pdbx_nonpoly_scheme.pdb_seq_num
        ... C 1 ATP 1
        ... D 2 HEM 1
        ... E 3 HOH 1
        ... E 3 HOH 2
        ... '''
        >>> block = cif.read_string(cif_data).sole_block()
        >>> edit.add_column(
        ...     block,
        ...     "_pdbx_nonpoly_scheme",
        ...     "ndb_seq_num",
        ...     edit.make_res_per_chain_counter("asym_id"),
        ...     pos=-1,
        ... )
        >>> print(block.as_string())
        data_test
        loop_
        _pdbx_nonpoly_scheme.asym_id
        _pdbx_nonpoly_scheme.entity_id
        _pdbx_nonpoly_scheme.mon_id
        _pdbx_nonpoly_scheme.pdb_seq_num
        _pdbx_nonpoly_scheme.ndb_seq_num
        C 1 ATP 1 1
        D 2 HEM 1 1
        E 3 HOH 1 1
        E 3 HOH 2 2
        <BLANKLINE>
        >>> # "ndb_seq_num" was appended as last column according to pos=-1

    Args:
        block (|gemmicifBlock|):  block holding the categories of the CIF
            document.
        category (str): The CIF category (table) to add the item to.
        item (str): The item (column) to be added.
        callback (Callable[[:class:`gemmi.cif.Table.Row`], Any]): Function to be
            executed to compute values for each row of the new column.
        pos (int): Position to insert the column at. Default is at the end (-1).
            Inserting at the beginning requires ``pos=1``.
        raw (bool): If True, values containing a space are stored as they are
            instead of being quoted.

    Returns:
        None

    Raises:
        NotFoundCategoryError: If ``category`` can not be found in ``block``.
    """
    # fetch original data
    table = access.get_table(block, category)
    if not table:
        raise NotFoundCategoryError(category)
    cat_itm = f"{category}.{item}"
    # init mod_cat_itms if needed
    # mod_cat_itms = _add_or_init_mod_cat_itms(mod_cat_itms, category)
    # check if column exists
    try:
        table.find_column(cat_itm)
    except RuntimeError:
        # If we get an exception, the item does not exist and we add it
        pass
    else:
        # if we do *NOT* get an exception, the item already exists
        _utils.warn_msg(f"'{cat_itm}' already exists, not updated.")
        # if len(mod_cat_itms[category]) == 0:
        #     del mod_cat_itms[category]
        # return mod_cat_itms
        return
    # add column, independent of category being a loop or name-value pairs
    _add_column(cat_itm, pos, table, block)

    # Re-fetch table: easier to work with but is not updated after adding a
    # column. Instead, the last column goes missing. That means if a column is
    # inserted in the middle of the table, the last column from the original
    # table becomes invisible.
    table = access.get_table(block, category)
    # add values
    for row in table:
        val = str(callback(row))
        if " " in val and not raw:
            val = cif.quote(val)
        row[item] = val


    # # Register item for revision annotation
    # if item not in mod_cat_itms[category]:
    #     mod_cat_itms[category].add(item)

    # return mod_cat_itms


def _add_loop(block, cat, itms, mod_cat_itms, raw):
    """Create category ``cat`` as a loop - supporter for `add_category()`.

    ``itms`` is handed over to ``block.set_mmcif_category()`` as is. All item
    names of ``itms`` are recorded as modified in ``mod_cat_itms[cat]``.
    """
    block.set_mmcif_category(cat, itms, raw=raw)
    if itms:
        # register every item name of the new loop
        mod_cat_itms[cat].update(itms.keys())


def _add_or_init_mod_cat_itms(mod_cat_itms, category):
    """Make sure the record of modified items has an entry for ``category``.

    With ``mod_cat_itms`` being ``None``, a new record is created. Otherwise
    the record handed in is extended in place (if needed) and returned.
    """
    if mod_cat_itms is None:
        return {category: set()}
    # existing entries are left as they are
    mod_cat_itms.setdefault(category, set())
    return mod_cat_itms


def _add_pairs(block, cat, itms, mod_cat_itms, raw):
    """Create category ``cat`` as named pairs - supporter for `add_category()`.

    Values given as list are reduced to their first element. All item names
    of ``itms`` are recorded as modified in ``mod_cat_itms[cat]``.
    """
    # payload for the set_pairs() call: exactly one value per item
    payload = {
        name: (val[0] if isinstance(val, list) else val)
        for name, val in itms.items()
    }
    block.set_pairs(f"{cat}.", payload, raw=raw)
    if itms:
        # register every item name of the new pairs
        mod_cat_itms[cat].update(itms.keys())



[docs]
def add_category(
    block,
    category,
    item_data,
    index=None,
    mod_cat_itms=None,
    raw=False,
):
    # No clue how to reduce no. of arguments, so allow it
    # pylint: disable=too-many-positional-arguments,too-many-arguments
    """Introduce a new category to a |gemmicifblock| and populate it.

    Add ``category`` to ``block`` using data from ``item_data``. ``item_data``
    is a dictionary with the CIF item names as keys and values as values to the
    items. On single values, named-pairs will be created, on lists with more
    than one value, a loop will be created. The decision is based on the first
    value in ``item_data``. ``index`` can be used to place the category at a
    certain position. Use an integer for a specific place in the category list
    or a string of form ``[after|before]:<CATEGORY>`` for relative positioning.

    If ``category`` already exists in ``block`` or ``item_data`` is empty, a
    warning is issued and ``block`` is not changed.

    Examples:
        >>> from gemmi import cif
        >>> from modelarchive.modelcif import edit
        >>> # start with an empty CIF document
        >>> cif_data = '''data_test
        ... '''
        >>> block = cif.read_string(cif_data).sole_block()
        >>> # lets add entities
        >>> _ = edit.add_category(
        ...     block,
        ...     "_entity",
        ...     {
        ...         "id": [1, 2, 3],
        ...         "type": ["polymer", "non-polymer", "water"],
        ...     },
        ... )
        >>> print(block.as_string())
        data_test
        loop_
        _entity.id
        _entity.type
        1 polymer
        2 non-polymer
        3 water
        <BLANKLINE>
        >>> # lets add an "_entry" ID before the entities
        >>> _ = edit.add_category(
        ...         block, "_entry", {"id": "1FOO"}, index="before:_entity"
        ...     )
        >>> print(block.as_string())
        data_test
        _entry.id 1FOO
        <BLANKLINE>
        loop_
        _entity.id
        _entity.type
        1 polymer
        2 non-polymer
        3 water
        <BLANKLINE>

    Args:
        block (|gemmicifblock|): CIF data block holding the categories of the
            CIF document.
        category (str): Name of the new category to be created.
        item_data (dict[str, list[Any]|Any]): Attributes and values to be
            added to the new category. Dictionary with item names as keys.
            Values are either a list of values or a single value. If a single
            value is provided (or a list containing only one element), a named
            key-value pair is created instead of a loop.
        index (int|str): Placement of the new category within ``block``. This
            can be an integer for exact positioning, or a string of form
            ``[after|before]:<CATEGORY>`` for relative positioning. In relative
            positioning, ``<CATEGORY>`` specifies the name of the category
            before or after which ``category`` will be placed. With ``None``
            (default), the category is not moved after creation.
        mod_cat_itms (dict[str, set[str]] | None): A record of what has been
            modified. Dictionary of category assigned a set of items changed.
            Items which already have the value of the update, are not recorded.
            This is meant for the revision history, most likely you can ignore
            it.
        raw (bool, optional): If True, do not force quoting strings containing
            whitespace.

    Returns:
        dict[str, set[str]]: A record of what has been modified. To be used
        with a revision history, most likely you can ignore it.

    Raises:
        MoveIdxToFarError: If the target position is outside ``block``. For
            example, if ``block`` contains 10 categories, trying to create a
            category at position 15 will raise this error.
    """
    mod_cat_itms = _add_or_init_mod_cat_itms(mod_cat_itms, category)
    if access.get_table(block, category):
        _utils.warn_msg(
            f"Category '{category}' already exists, will not be added."
        )
        # nothing was modified: do not leave an empty entry in the record
        if not mod_cat_itms[category]:
            del mod_cat_itms[category]
        return mod_cat_itms
    if not item_data:
        _utils.warn_msg(f"No items provided, not adding category '{category}'.")
        if not mod_cat_itms[category]:
            del mod_cat_itms[category]
        return mod_cat_itms

    # figure out if we got pair or loop, judged by the first value
    vals = next(iter(item_data.values()))
    if isinstance(vals, list) and len(vals) != 1:
        _add_loop(block, category, item_data, mod_cat_itms, raw)
    else:
        # single value or list with exactly one element
        _add_pairs(block, category, item_data, mod_cat_itms, raw)
    move_category(block, category, index)

    if not mod_cat_itms[category]:
        del mod_cat_itms[category]  # pragma: no cover (tested when active)
    return mod_cat_itms



def _get_next_ordinal(table, category, item):
    """Return the highest value of the ordinal (row ID) column plus one.

    All values of column ``<category>.<item>`` are converted with ``int()``.
    For a column without values, the result is ``-sys.maxsize``.
    """
    column = table.find_column(category + "." + item)
    highest = max((int(cell) for cell in column), default=-sys.maxsize - 1)
    return highest + 1


def _validate_row_dict(row_dict):
    """Check that the input of add_rows() is valid and return a copy.

    Every value of ``row_dict`` is turned into a list of its own: lists are
    copied, single values are wrapped into a one-element list. This way,
    later in-place changes to the returned data do not leak into the lists
    of the caller.

    Args:
        row_dict (dict): Item names as keys, a list of values or a single
            value per item.

    Returns:
        tuple[dict, int]: The normalised copy of ``row_dict`` and the number
        of rows it describes.

    Raises:
        ValueError: If ``row_dict`` is empty or its lists are not of equal
            length.
    """
    if not row_dict:
        raise ValueError("row_dict is empty, no rows to add.")
    data = {}
    itm_count = None
    for itm, val in row_dict.items():
        column = list(val) if isinstance(val, list) else [val]
        if itm_count is None:
            # the first item defines the number of rows
            itm_count = len(column)
        elif len(column) != itm_count:
            raise ValueError("Lists of row_dict are not of equal length.")
        data[itm] = column

    return data, itm_count


def _ensure_category(
    block, category, itm_count, data, ordinal_item, mod_cat_itms, raw
):
    # Just a helper function removing complexity from add_rows(), allow all args
    # pylint: disable=too-many-positional-arguments,too-many-arguments
    """Create a new category with ordinal, if needed.

    If ``ordinal_item`` is given but not part of ``data``, it is put in front
    of the other items, numbering the rows from 1 to ``itm_count``. ``data``
    itself is not modified. Creating the category is left to
    :func:`add_category`.

    Returns:
        dict[str, set[str]]: The record of modified items as returned by
        :func:`add_category`. Handing that on (instead of the
        ``mod_cat_itms`` argument) keeps the record alive when
        ``mod_cat_itms`` is ``None`` and the record is only created inside
        :func:`add_category`.
    """
    # add ordinal to data
    if ordinal_item is not None and ordinal_item not in data:
        data = {
            ordinal_item: list(range(1, itm_count + 1)),
            **data,
        }
    return add_category(
        block, category, data, mod_cat_itms=mod_cat_itms, raw=raw
    )


def _build_loop_row(i, itm_names, data, ordinal_item, next_ordinal, raw):
    # Just a helper function removing complexity from add_rows(), allow all args
    # pylint: disable=too-many-positional-arguments,too-many-arguments
    """Assemble row number ``i`` of ``data`` to be added with add_rows().

    The row follows the order of ``itm_names``. Items found in ``data`` are
    converted to strings (the converted value is also written back into
    ``data``) and quoted if they contain a space, unless ``raw`` is set. For
    items missing in ``data``, ``ordinal_item`` gets ``next_ordinal`` and
    any other item gets ``.``.

    Returns:
        tuple[list[str], int]: The row and the ordinal to be used next.
    """
    cells = []
    for name in itm_names:
        if name in data:
            text = str(data[name][i])
            data[name][i] = text
            if raw or " " not in text:
                cells.append(text)
            else:
                cells.append(cif.quote(text))
        elif name == ordinal_item:
            # ordinal is not provided by the caller, continue the numbering
            cells.append(str(next_ordinal))
            next_ordinal += 1
        else:
            # item exists in the category but is missing in the input
            cells.append(".")

    return cells, next_ordinal



[docs]
def add_rows(
    block,
    category,
    row_dict,
    ordinal_item="ordinal",
    mod_cat_itms=None,
    raw=False,
):
    # No clue how to reduce no. of arguments, so allow it
    # pylint: disable=too-many-positional-arguments,too-many-arguments
    """Append one or more rows to a ``category`` of ``block``.

    A ModelCIF category can be seen as a table with items as columns. This
    function appends new rows to such a table. A ``category`` missing from
    ``block`` is created on the fly: as a loop if several rows are given, as
    pairs for a single row. If ``category`` already exists as pairs, it is
    converted into a loop before the row(s) are appended.

    New values are handed in via ``row_dict``, a :class:`dict` using item names
    as keys and a :class:`list` of values per item (one entry per row). For a
    single row, plain values may be used in place of lists. Items that exist in
    ``category`` but are missing from ``row_dict`` are filled with ``.`` in the
    new rows. Keys of ``row_dict`` may come in any order, values are arranged
    to match the item order already present in ``category``.

    ``ordinal_item`` names the item holding a unique numerical ID per row. If
    given, the numbering is continued automatically for new rows. In
    `ModelCIF`_, this item is frequently named ``ordinal``, but some categories
    use other names.

    Examples:
        >>> from gemmi import cif
        >>> from modelarchive.modelcif import edit
        >>> # start with an empty CIF document
        >>> cif_data = '''data_test
        ... '''
        >>> block = cif.read_string(cif_data).sole_block()
        >>> # Lets add an entity to create a category in block. ordinal_item
        >>> # is set to None on purpose to show how it works later.
        >>> _ = edit.add_rows(
        ...     block,
        ...     "_entity",
        ...     {"id": 1, "details": "Protein", "type": "polymer"},
        ...     ordinal_item=None,
        ... )
        >>> # see how the _entity category is created as couple of pairs
        >>> print(block.as_string())
        data_test
        _entity.id 1
        _entity.details Protein
        _entity.type polymer
        <BLANKLINE>
        >>> # Add a second row (pairs will turn into a loop). This time, include
        >>> # ordinal_item to let the function take care of incrementing IDs.
        >>> _ = edit.add_rows(
        ...     block,
        ...     "_entity",
        ...     {"details": ["H2O"], "type": ["water"]},
        ...     ordinal_item="id",
        ... )
        >>> # Now _entity is a loop and _entity.id was incremented automatically
        >>> print(block.as_string())
        data_test
        loop_
        _entity.id
        _entity.details
        _entity.type
        1 Protein polymer
        2 H2O water
        <BLANKLINE>
        >>> # As a last example, add multiple new rows at once but skip the
        >>> # 'details' column.
        >>> _ = edit.add_rows(
        ...     block,
        ...     "_entity",
        ...     {"type": ["polymer", "polymer"]},
        ...     ordinal_item="id",
        ... )
        >>> # Now there are two more polymer entities in the loop but since
        >>> # the 'details' information was missing, the function added '.' in
        >>> # those fields.
        >>> print(block.as_string())
        data_test
        loop_
        _entity.id
        _entity.details
        _entity.type
        1 Protein polymer
        2 H2O water
        3 . polymer
        4 . polymer
        <BLANKLINE>

    Args:
        block (|gemmicifblock|): CIF data block holding the categories of the
            CIF document.
        category (str): Name of the category receiving the new row(s). A
            single trailing ``.`` is stripped.
        row_dict (dict[str, list | Any]): Data of the new row(s), keyed by
            item names of the category. Lists are required for multiple rows,
            a single row may use scalars instead. Items of the category that
            are absent from ``row_dict`` get '.' in the new row(s). An empty
            ``row_dict`` only triggers a warning message and
            ``mod_cat_itms`` is returned as passed in.
        ordinal_item (str | None): Item name of the ordinal (in database terms
            a primary key) of the category, if there is one. If provided, the
            latest ordinal is read from the category and incremented for each
            new row, so the ordinal does not need to be part of ``row_dict``.
            Use ``None`` for categories without ordinal or to set the ordinal
            explicitly.
        mod_cat_itms (dict[str, set[str]] | None): A record of what has been
            modified. Dictionary of category assigned a set of items changed.
            Items which already have the value of the update, are not recorded.
            This is meant for the revision history, most likely you can ignore
            it.
        raw (bool, optional): If True, do not force quoting strings containing
            whitespace.

    Returns:
        dict[str, set[str]]: A record of what has been modified. To be used
        with a revision history, most likely you can ignore it.

    Raises:
        ValueError: In case item lists in ``row_dict`` are not of equal length.
    """
    category = category[:-1] if category.endswith(".") else category
    if len(row_dict) == 0:
        _utils.warn_msg(f"No data to be added for '{category}', skipped.")
        return mod_cat_itms
    # normalise values to lists, all of them need to be of the same length
    data, itm_count = _validate_row_dict(row_dict)

    mod_cat_itms = _add_or_init_mod_cat_itms(mod_cat_itms, category)
    # a category not yet in the block is created right away from the data
    table = access.get_table(block, category)
    if not table:
        return _ensure_category(
            block, category, itm_count, data, ordinal_item, mod_cat_itms, raw
        )
    # fetch the next row ID while 'table' is still the table object
    if ordinal_item is None:
        next_ordinal = 1
    else:
        next_ordinal = _get_next_ordinal(table, category, ordinal_item)
    # pairs can only hold a single row, so make sure to work on a loop
    table.ensure_loop()
    loop = table.loop
    # Item names in loop order (tags stripped of the category prefix), used to
    # line up the values of each new row. Every item of the loop is recorded as
    # modified.
    itm_names = []
    for tag in loop.tags:
        itm_name = tag.split(".")[1]
        itm_names.append(itm_name)
        mod_cat_itms[category].add(itm_name)

    # A row is a list of (string) values, ordered like the loop tags.
    for row_idx in range(itm_count):
        new_row, next_ordinal = _build_loop_row(
            row_idx, itm_names, data, ordinal_item, next_ordinal, raw
        )
        loop.add_row(new_row)

    return mod_cat_itms




[docs]
def sort(table_or_block, item, category=None, key=None):
    """Sort a |gemmi.cif.Table|_ or |gemmicifBlock| in-place by the given item.

    Handy after editing a table, to bring its rows back into order by a chosen
    column (e.g. the ordinal). Without ``key``, values which can be converted
    by :class:`int` are compared as integers and come first, all other values
    follow and are compared as strings. A custom ``key`` function, deriving a
    comparison key from each row, covers cases like ``_citation.id``, where
    special values (e.g. ``id=primary``) might need to be placed first.

    Accepts an already loaded |gemmi.cif.Table|_, or a |gemmicifBlock|
    together with ``category``. The latter allows sorting many categories one
    after another in less code.

    Examples:
        >>> from gemmi import cif
        >>> from modelarchive.modelcif import access, edit
        >>> # start with an unsorted _citation loop
        >>> CIF_DATA = '''data_test
        ... loop_
        ... _citation.id
        ... _citation.journal_full
        ... _citation.title
        ... _citation.year
        ... _citation.journal_volume
        ... 3 "The Lord of the Rings" "Return of the King" 1955 3
        ... 1 "The Lord of the Rings" "The Fellowship of the Ring" 1954 2
        ... 2 "The Lord of the Rings" "The Two Towers" 1954 1
        ... primary . "The Hobbit or There and Back Again" 1937 .
        ... '''
        >>> block = cif.read_string(CIF_DATA).sole_block()
        >>> table = access.get_table(block, "_citation")
        >>> # first sort without a key function
        >>> edit.sort(table, "id")
        >>> # This sorts the LOTR books properly, but the 'primary' book is at
        >>> # the bottom
        >>> print(block.as_string())
        data_test
        loop_
        _citation.id
        _citation.journal_full
        _citation.title
        _citation.year
        _citation.journal_volume
        1 "The Lord of the Rings" "The Fellowship of the Ring" 1954 2
        2 "The Lord of the Rings" "The Two Towers" 1954 1
        3 "The Lord of the Rings" "Return of the King" 1955 3
        primary . "The Hobbit or There and Back Again" 1937 .
        <BLANKLINE>
        >>> # sort again (this time by block), with a lambda that puts
        >>> # 'primary' first
        >>> edit.sort(
        ...     block,
        ...     "id",
        ...     category="_citation",
        ...     key=lambda row: (
        ...         (0, "") if row["id"] == "primary" else (1, row["id"])
        ...     ),
        ... )
        >>> print(block.as_string())
        data_test
        loop_
        _citation.id
        _citation.journal_full
        _citation.title
        _citation.year
        _citation.journal_volume
        primary . "The Hobbit or There and Back Again" 1937 .
        1 "The Lord of the Rings" "The Fellowship of the Ring" 1954 2
        2 "The Lord of the Rings" "The Two Towers" 1954 1
        3 "The Lord of the Rings" "Return of the King" 1955 3
        <BLANKLINE>

    Args:
        table_or_block (|gemmi.cif.Table|_ | |gemmicifBlock|):
            Object to be sorted. For a |gemmicifBlock|, the table is loaded
            from it using ``category``.
        item (str): Name of the column (item) in the table to sort by. Only
            used by the default ``key``.
        category (str, optional): Name of the category when sorting a
            |gemmicifBlock|.
        key (callable, optional): Function taking a row and returning a
            sortable value. Defaults to ``row[item]``, compared as integer
            where possible and as string otherwise.

    Returns:
        None

    Raises:
        ValueError: If ``table_or_block`` is a |gemmicifBlock| object but no
            ``category`` was provided.
    """
    table = table_or_block
    if isinstance(table_or_block, cif.Block):
        if category is None:
            raise ValueError(
                "Arg 'category' required for sorting gemmi.cif.Block"
            )
        table = access.get_table(table_or_block, category)

    def _ints_before_strings(row):
        # Tag values with 0 (integer) or 1 (string): integers are only ever
        # compared with integers, strings with strings, and whenever both
        # kinds meet, the string ends up below the integer.
        value = row[item]
        try:
            return (0, int(value))
        except ValueError:
            return (1, value)

    row_key = _ints_before_strings if key is None else key

    # Sorting happens on row indices since gemmi.cif.Row objects can not be
    # used to identify list-items by index().
    row_count = len(table)
    target = sorted(range(row_count), key=lambda pos: row_key(table[pos]))
    # 'layout' mirrors the table: layout[p] is the original index of the row
    # currently sitting at position p.
    layout = list(range(row_count))

    for dest, origin in enumerate(target):
        src = layout.index(origin)
        if src == dest:
            continue
        table.move_row(old_pos=src, new_pos=dest)
        # replay the move on the mirror to keep it in sync with the table
        layout.insert(dest, layout.pop(src))



#  LocalWords:  gemmicifBlock func idx CIF qa str ValueError ModelArchive ndb
#  LocalWords:  MoveIdxToFarError Args num pdbx nonpoly gemmi cif modelarchive
#  LocalWords:  modelcif asym auth mon pdb HOH pos BLANKLINE bool itms msg
#  LocalWords:  NotFoundCategoryError gemmicifblock NotFound iterable