Source code for modelarchive.tools.maxit

"""`MAXIT`_ from `RCSB`_ converts coordinate files in PDB legacy format to CIF
and CIF files to mmCIF. This module also adds functionality to turn a PDB file
into a (minimalist) ModelCIF file. But don't get too excited - none of the
functionality will turn a PDB file into a fully annotated ModelCIF file. It just
makes sure the starting point is of valid CIF syntax. Extra data still need to be
added...

`MAXIT`_ is not bundled with this module. The source code can be downloaded
`here <https://sw-tools.rcsb.org/apps/MAXIT/source.html>`_.
`Installation instructions <https://sw-tools.rcsb.org/apps/MAXIT/README-source>`_
are available, and here is a TL;DR on how to compile on macOS and most Linux
distributions:

.. code-block:: bash

   # cd into the unpacked source directory first
   export RCSBROOT=$(pwd)
   make
   make binary
   # binaries are found in bin/
   # RCSBROOT needs to point at data/ when running maxit
"""

# Copyright (c) 2026, SIB - Swiss Institute of Bioinformatics and
#                     Biozentrum - University of Basel
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from itertools import product, count
from pathlib import Path
import gzip
import os
import subprocess
import tempfile

import gemmi
import ihm
import modelcif.dumper
import modelcif.reader

from modelarchive import _utils

# Evaluated once, when this module is imported: the environment variable is
# looked up here and the literal "maxit" is used when it is not set.
MAXIT_BINARY = os.environ.get("MAXIT_BINARY", "maxit")
"""Path to the maxit binary, defaults to ``maxit`` from ``$PATH``.

Can be overridden by setting the ``MAXIT_BINARY`` environment variable
before import.
"""



[docs]
def run_maxit(infile, outfile, mode, logfile=None):
    """Invoke the MAXIT binary once, exactly as requested.

    No input validation, no automatic choice of mode and no cleanup of
    files is performed here.

    Args:
        infile (~pathlib.Path | str): File handed to MAXIT via ``-input``.
            Either PDB legacy format or CIF.
        outfile (~pathlib.Path | str): File handed to MAXIT via ``-output``.
        mode (str): MAXIT operation mode (``-o``). Use ``"1"`` for PDB to
            CIF, ``"2"`` for CIF to PDB, ``"8"`` for CIF to mmCIF.
        logfile (~pathlib.Path | str, optional): File for MAXIT log messages
            (``-log``). The option is omitted when this is ``None``.

    Returns:
        subprocess.CompletedProcess: Result of the MAXIT run.

    Raises:
        subprocess.CalledProcessError: If MAXIT exits with a non-zero
            status (the process is started with ``check=True``).
    """
    command = [MAXIT_BINARY]
    command += ["-input", os.fspath(infile)]
    command += ["-output", os.fspath(outfile)]
    command += ["-o", mode]
    # The log option is only passed on when the caller asked for a log file.
    if logfile is not None:
        command += ["-log", os.fspath(logfile)]

    return subprocess.run(command, check=True)




[docs]
def run_maxit_log2list(infile, outfile, mode):
    """Run MAXIT and return the log file content as a list.

    A gzip-compressed ``infile`` (recognised by its magic bytes, not by its
    name) is unpacked into a temporary file before MAXIT is started. This
    also covers in-place conversions where a previous step has already
    compressed the file that now serves as input.

    Args:
        infile (~pathlib.Path | str): Input file. Either PDB legacy format or
            CIF (also as gzip).
        outfile (~pathlib.Path | str): Output file. If the filename ends with
            ``.gz`` or ``.gzip``, the output will be compressed.
        mode (str): MAXIT operation mode. Use ``"1"`` for PDB to CIF, ``"2"``
            for CIF to PDB, ``"8"`` for CIF to mmCIF.

    Returns:
        tuple[list[str], int]: A tuple of the log file content as a list
        of strings and the MAXIT exit status.

    Raises:
        Exception: Errors raised by :mod:`gzip` while unpacking a damaged
            compressed ``infile`` are propagated unchanged.
    """

    def _is_gzip(path):
        """Check for the two gzip magic bytes at the start of ``path``."""
        try:
            with open(path, "rb") as fh:
                return fh.read(2) == b"\x1f\x8b"
        except OSError:
            # Missing or unreadable files are not gzip as far as we can
            # tell; MAXIT gets to report the actual problem.
            return False

    extstts = 0
    plain_infile = None
    try:
        if _is_gzip(infile):
            # MAXIT is always handed plain text.
            fdesc, plain_infile = tempfile.mkstemp()
            with os.fdopen(fdesc, "wb") as dst:
                with gzip.open(infile, "rb") as src:
                    dst.write(src.read())
        # Note: `delete_on_close=False` (Python >= 3.12) would be more
        # performant but we support Python >= 3.10.
        with tempfile.NamedTemporaryFile(mode="r") as mxtlog:
            try:
                prc = run_maxit(
                    infile if plain_infile is None else plain_infile,
                    outfile,
                    mode,
                    logfile=mxtlog.name,
                )
                extstts = prc.returncode
            except subprocess.CalledProcessError as exc:
                # A failed run is reported via the exit status, not raised.
                extstts = exc.returncode
            log = [line.strip() for line in mxtlog]
    finally:
        if plain_infile is not None:
            os.unlink(plain_infile)

    # MAXIT writes plain text; compress afterwards if the name asks for it.
    # Data that is already gzip (e.g. the untouched input of a failed
    # in-place run) is left alone instead of being compressed twice.
    if (
        Path(outfile).suffix.lower() in (".gz", ".gzip")
        and Path(outfile).exists()
        and not _is_gzip(outfile)
    ):
        with open(outfile, "rb") as fh:
            content = fh.read()
        with gzip.open(outfile, "wb") as fh:
            fh.write(content)

    return (log, extstts)



def _format2format(infile, outfile, mode):
    """Wrapper for convenience converters."""
    if "RCSBROOT" not in os.environ:
        raise RuntimeError("RCSBROOT environment variable is not set.")
    log, extstts = run_maxit_log2list(infile, outfile, mode)
    if extstts != 0:
        return log

    return []



[docs]
def pdb2cif(infile, outfile):
    """Run MAXIT in mode ``"1"`` to turn a PDB legacy format file into CIF.

    Log messages are only handed back when the conversion failed.

    Args:
        infile (~pathlib.Path | str): Input file in PDB legacy format (also as
            gzip).
        outfile (~pathlib.Path | str): Output CIF file. If the filename
            ends with ``.gz`` or ``.gzip``, the output will be compressed.

    Returns:
        list[str]: MAXIT log messages on failure, empty list on success.

    Raises:
        RuntimeError: If ``RCSBROOT`` environment variable is not set.
    """
    return _format2format(infile, outfile, mode="1")




[docs]
def cif2mmcif(infile, outfile):
    """Run MAXIT in mode ``"8"`` to turn a CIF file into mmCIF.

    Log messages are only handed back when the conversion failed.

    Args:
        infile (~pathlib.Path | str): Input CIF file (also as gzip).
        outfile (~pathlib.Path | str): Output mmCIF file. If the filename ends
            with ``.gz`` or ``.gzip``, the output will be compressed.

    Returns:
        list[str]: MAXIT log messages on failure, empty list on success.

    Raises:
        RuntimeError: If ``RCSBROOT`` environment variable is not set.
    """
    return _format2format(infile, outfile, mode="8")




[docs]
def pdb2mmcif(infile, outfile):
    """Convert a PDB legacy format file to mmCIF in two MAXIT passes.

    The first pass writes CIF to ``outfile``; the second pass reads that
    file again and rewrites it as mmCIF in place. The second pass is skipped
    when the first one fails. Log messages are only handed back on failure.

    Args:
        infile (~pathlib.Path | str): Input file in PDB legacy format (also as
            gzip).
        outfile (~pathlib.Path | str): Output mmCIF file. If the filename ends
            with ``.gz`` or ``.gzip``, the output will be compressed.

    Returns:
        list[str]: MAXIT log messages on failure, empty list on success.
            On failure, the first element indicates which conversion step
            failed.

    Raises:
        RuntimeError: If ``RCSBROOT`` environment variable is not set.
    """
    # (converter, its input, headline put in front of its log on failure)
    passes = (
        (pdb2cif, infile, "PDB to CIF conversion failed"),
        (cif2mmcif, outfile, "CIF to mmCIF conversion failed"),
    )
    for convert, source, headline in passes:
        log = convert(source, outfile)
        if log:
            log.insert(0, headline)
            return log
    return log




[docs]
def fixing_pdb2mmcif(pdb_as_string, outfile):
    """Convert a PDB legacy format string to mmCIF, fixing known issues.

    Adds missing chain names if necessary before conversion. Only returns
    log messages upon failure.

    The input is parsed with gemmi. If any model contains a chain with an
    empty name, every subchain of such a chain is moved into a new chain
    carrying a so far unused name, the structure is written to ``outfile``
    by gemmi and then run through :func:`cif2mmcif` in place. Otherwise
    ``pdb_as_string`` is written to ``outfile`` unchanged and converted in
    place by :func:`pdb2mmcif`.

    Args:
        pdb_as_string (str): Input file content in PDB legacy format.
        outfile (~pathlib.Path | str): Output mmCIF file. If the filename
            ends with ``.gz`` or ``.gzip``, the output will be compressed.

    Returns:
        list[str]: MAXIT log messages on failure, empty list on success.

    Raises:
        RuntimeError: If ``RCSBROOT`` environment variable is not set.
        UnicodeEncodeError: If no fixing was needed and ``pdb_as_string``
            contains non-ASCII characters (it is written with
            ``encoding="ascii"``).
    """

    def _get_cname(used_chain_names):
        """Return an unused chain name and record it as used.

        Candidates are tried shortest first (1 character, then 2, ...), in
        the order upper case letters, digits, lower case letters. The chosen
        name is added to ``used_chain_names`` before it is returned.
        """
        chars = (
            "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
            "0123456789"
            "abcdefghijklmnopqrstuvwxyz"
        )
        # `count(1)` never ends, so a free name is always found eventually.
        for length in count(1):
            for name in ("".join(p) for p in product(chars, repeat=length)):
                if name not in used_chain_names:
                    used_chain_names.add(name)
                    return name
        raise RuntimeError("unreachable")  # pragma: no cover

    # fix PDB input
    entry = gemmi.read_pdb_string(pdb_as_string, split_chain_on_ter=True)
    entry.setup_entities()
    entry.assign_label_seq_id()
    # Set as soon as at least one model had unnamed chains replaced.
    fixed = False
    for mdl in entry:
        # Chain names are collected per model, so the same new name may be
        # handed out again in another model.
        used_chain_names = set()
        no_name_chains = 0
        for chn in mdl:
            if len(chn.name) == 0:
                no_name_chains += 1
            used_chain_names.add(chn.name)
        if no_name_chains > 0:
            # Build the replacement chains first; the model is only modified
            # after the loop over its chains is done.
            nchns = []
            for chn in mdl:
                if len(chn.name) == 0:
                    for schn in chn.subchains():
                        nchn = gemmi.Chain(_get_cname(used_chain_names))
                        nchn.append_residues(schn)
                        nchns.append(nchn)
            for chn in nchns:
                mdl.add_chain(chn)
            # `remove_chain()` removes ALL occurrences of chain name
            mdl.remove_chain("")
            fixed = True
    # input underwent fixing, needs to be written as new input file
    if fixed:
        # Refresh entity and subchain bookkeeping for the rearranged chains
        # and merge duplicate entities before writing.
        entry.add_entity_types()
        entry.assign_subchains(force=True)
        entry.ensure_entities()
        entry.deduplicate_entities()
        mog = gemmi.MmcifOutputGroups(all=True)
        # gemmi writes to `outfile` first; MAXIT then rewrites it in place.
        entry.make_mmcif_document(mog).write_file(os.fspath(outfile))
        return cif2mmcif(outfile, outfile)

    # convert to mmCIF
    # Nothing to fix: park the untouched PDB text in `outfile` and let
    # MAXIT convert that file in place.
    with open(outfile, "w", encoding="ascii") as ofh:
        ofh.write(pdb_as_string)
    return pdb2mmcif(outfile, outfile)



def _get_not_modeled_residues(model):
    """Yield NotModeledResidueRange objects for all residue ranges in the
    Model that are not referenced by Atom objects"""
    # The following code is adapted from python-modelcif
    # Copyright (c) 2018-2025 IHM Working Group
    #
    # Permission is hereby granted, free of charge, to any person obtaining a
    # copy of this software and associated documentation files (the "Software"),
    # to deal in the Software without restriction, including without limitation
    # the rights to use, copy, modify, merge, publish, distribute, sublicense,
    # and/or sell copies of the Software, and to permit persons to whom the
    # Software is furnished to do so, subject to the following conditions:
    #
    # The above copyright notice and this permission notice shall be included in
    # all copies or substantial portions of the Software.
    #
    # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    # DEALINGS IN THE SOFTWARE.
    # Function needs to access protected members to do its job, allow Pylint
    # pylint: disable=protected-access
    for assem in model.assembly:
        asym = assem.asym if hasattr(assem, "asym") else assem
        if not asym.entity.is_polymeric():
            continue
        # Make a set of all residue indices of this asym "handled"
        # by being modeled with Atom objects
        handled_residues = set()
        for atom in model._atoms:
            if atom.asym_unit is asym:
                handled_residues.add(atom.seq_id)
        # Convert set to a list of residue ranges
        handled_residues = ihm.util._make_range_from_list(
            sorted(handled_residues)
        )
        # Return not-modeled for each non-handled range
        for r in ihm.util._invert_ranges(
            handled_residues,
            end=assem.seq_id_range[1],
            start=assem.seq_id_range[0],
        ):
            yield modelcif.model.NotModeledResidueRange(asym, r[0], r[1])


def _sanitize_modelcif(system):
    """Sanitize the mmCIF file to be used by the model archive and add modelcif
    categories
    """
    # The following code is adapted from python-modelcif
    # Copyright (c) 2018-2025 IHM Working Group
    #
    # Permission is hereby granted, free of charge, to any person obtaining a
    # copy of this software and associated documentation files (the "Software"),
    # to deal in the Software without restriction, including without limitation
    # the rights to use, copy, modify, merge, publish, distribute, sublicense,
    # and/or sell copies of the Software, and to permit persons to whom the
    # Software is furnished to do so, subject to the following conditions:
    #
    # The above copyright notice and this permission notice shall be included in
    # all copies or substantial portions of the Software.
    #
    # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    # DEALINGS IN THE SOFTWARE.
    if not system.title:
        system.title = "Auto-generated system"
    if not system.protocols:
        default_protocol = modelcif.protocol.Protocol()
        step = modelcif.protocol.ModelingStep(
            name="modeling", input_data=None, output_data=None
        )
        default_protocol.steps.append(step)
        system.protocols.append(default_protocol)

    for model_group in system.model_groups:
        for model in model_group:
            # Entity description is also used by python-modelcif for
            # ma_data.name, which is mandatory, so it cannot be unknown/?
            for asym in model.assembly:
                if asym.entity.description is ihm.unknown:
                    asym.entity.description = "target"

            model.not_modeled_residue_ranges.extend(
                _get_not_modeled_residues(model)
            )

    return system


# Replacement for an internal class of python-modelcif, allow protected member
# access in Pylint
# pylint: disable=protected-access
class _MAQAMetricDumper(modelcif.dumper._QAMetricDumper):
    """Write all ma_qa_metric records, even those missing actual values."""

    def dump_metric_types(self, system, writer):  # pragma: no cover
        """Emit one ``_ma_qa_metric`` row per entry of ``system._qa_by_id``.

        ``system._qa_by_id`` takes the place of the original
        ``self._metric_classes_by_id``.
        """
        # Excluded from unit testing because copy of original code with
        # system._qa_by_id replacing the original self._metric_classes_by_id to
        # get QA from associated files.
        columns = [
            "id",
            "name",
            "description",
            "type",
            "mode",
            "type_other_details",
            "software_group_id",
        ]
        with writer.loop("_ma_qa_metric", columns) as table:
            for metric_id, metric in system._qa_by_id.items():
                # The dictionary key serves as id: metrics without values
                # have not been instantiated, yet, so they have no ._id
                table.write(
                    id=metric_id,
                    name=metric.name,
                    description=metric.description,
                    type=metric.type,
                    mode=metric.mode,
                    type_other_details=metric.other_details,
                    software_group_id=(
                        metric.software._group_id if metric.software else None
                    ),
                )


# pylint: enable=protected-access


class _MAVariant(modelcif.dumper.ModelCIFVariant):
    """Wrapper class for writing ModelCIF files.

    We need a writer that preserves ma_qa_metric entries that have the actual
    scores in an accompanying file.
    """

    def __init__(self):
        """Exchange dumpers

        The QA metric dumper is swapped in a copy of the dumper list that
        belongs to this instance only. Assigning into the inherited list
        directly would also alter the parent variant for every other user
        in the process (NOTE(review): this assumes ``_dumpers`` is the
        class-level list of python-modelcif - the copy is harmless
        otherwise).
        """
        super().__init__()
        dumpers = list(self._dumpers)
        ma_qa_metric_idx = None
        for i, dumper in enumerate(dumpers):
            if dumper is _MAQAMetricDumper:
                # Replacement is already in place, nothing left to do.
                return
            if dumper is modelcif.dumper._QAMetricDumper:
                ma_qa_metric_idx = i
        if ma_qa_metric_idx is None:  # pragma: no cover
            # Defensive fallback if modelcif.dumper._QAMetricDumper is not
            # found in _dumpers, e.g. after a python-modelcif API change.
            _utils.warn_msg("modelcif.dumper._QAMetricDumper not found")
            return
        dumpers[ma_qa_metric_idx] = _MAQAMetricDumper
        # The instance attribute shadows the inherited list.
        self._dumpers = dumpers



[docs]
def cif2modelcif(infile, outfile):
    """Convert a CIF file into a minimalist ModelCIF file.

    Sanitizes the input and adds mandatory ModelCIF categories. The input
    file is expected to be in mmCIF format as produced by MAXIT.

    The input is read and closed before the output is opened, so ``infile``
    and ``outfile`` may name the same file (in-place conversion).

    Args:
        infile (~pathlib.Path | str): Input mmCIF/ ModelCIF file (also as gzip).
        outfile (~pathlib.Path | str): Output ModelCIF file. If the filename
            ends with ``.gz`` or ``.gzip``, the output will be compressed.

    Returns:
        ~pathlib.Path | str: Path to the output file.

    Raises:
        RuntimeError: If reading or writing the CIF file fails.
    """
    opener_in = _utils.get_opener(infile)
    opener_out = _utils.get_opener(outfile)
    try:
        with opener_in(infile, "rt", encoding="utf8") as fh:
            systems = [_sanitize_modelcif(s) for s in modelcif.reader.read(fh)]
        # Opening for writing truncates the file, so this must not happen
        # before the input has been read: both names may be the same path.
        with opener_out(outfile, "wt", encoding="ascii") as fhout:
            modelcif.dumper.write(fhout, systems, variant=_MAVariant)
    except Exception as e:
        # Callers only deal with RuntimeError, the cause stays chained.
        raise RuntimeError(
            f"Error sanitizing mmCIF file with mmcif: {str(e)}"
        ) from e

    return outfile




[docs]
def pdb2modelcif(infile, outfile):
    """Turn a PDB legacy format file into a minimalist ModelCIF file.

    The file content is first converted to mmCIF by
    :func:`fixing_pdb2mmcif`, which repairs known issues; the result in
    ``outfile`` is then rewritten in place by :func:`cif2modelcif`. Messages
    are only handed back when one of the two steps failed.

    Args:
        infile (~pathlib.Path | str): Input file in PDB legacy format (also as
            gzip).
        outfile (~pathlib.Path | str): Output ModelCIF file. If the filename
            ends with ``.gz`` or ``.gzip``, the output will be compressed.

    Returns:
        list[str]: Error log on failure, empty list on success.

    Raises:
        RuntimeError: If ``RCSBROOT`` environment variable is not set.
    """
    open_input = _utils.get_opener(infile)
    with open_input(infile, "rt", encoding="ascii") as handle:
        pdb_text = handle.read()

    failure = fixing_pdb2mmcif(pdb_text, outfile)
    if failure:
        return failure

    try:
        cif2modelcif(outfile, outfile)
    except RuntimeError as err:
        # The ModelCIF step reports problems as a one-element log.
        return [str(err)]
    return []




[docs]
def coordfile2modelcif(infile, outfile):
    """Create a minimalist ModelCIF file from a PDB or CIF coordinate file.

    The converter is picked by file extension: :func:`pdb2modelcif` for
    ``.pdb``, :func:`cif2modelcif` for ``.cif`` and ``.mmcif``. A trailing
    ``.gz`` is skipped when looking at the extension.

    Args:
        infile (~pathlib.Path | str): Input file in PDB or CIF format.
            Supported extensions: ``.pdb``, ``.cif``, ``.mmcif``, and their
            ``.gz`` variants.
        outfile (~pathlib.Path | str): Output ModelCIF file. If the filename
            ends with ``.gz`` or ``.gzip``, the output will be compressed.

    Returns:
        list[str]: Error log on failure, empty list on success.

    Raises:
        RuntimeError: If ``RCSBROOT`` environment variable is not set.
        ValueError: If the file extension is not supported.
    """
    source = Path(infile)
    extension = source.suffix.lower()
    if extension == ".gz":
        # Look at the extension hidden behind the compression suffix.
        extension = Path(source.stem).suffix.lower()

    if extension not in (".pdb", ".cif", ".mmcif"):
        raise ValueError(f"Unsupported file extension: {source.suffix}")

    if extension == ".pdb":
        return pdb2modelcif(source, outfile)

    try:
        cif2modelcif(source, outfile)
    except RuntimeError as err:
        return [str(err)]
    return []




[docs]
def main():
    """Entry point for the ``ma-maxit`` command line tool.

    Parses the command line, runs the requested conversion, prints the
    collected log (to stdout for exit status 0, to stderr otherwise) and
    terminates the process with that status.
    """
    # For main functions we allow bad imports
    # pylint: disable=import-outside-toplevel
    import argparse
    import sys

    def _parse_command_line():
        """Parse the options and make sure the input file exists."""
        parser = argparse.ArgumentParser(
            description="Run RCSB MAXIT from Python "
            "(https://sw-tools.rcsb.org/apps/MAXIT/index.html)",
        )
        # All three options are mandatory strings.
        mandatory = {"required": True, "type": str}
        parser.add_argument(
            "--input",
            help="Input/ source file",
            metavar="<INPUTFILE>",
            **mandatory,
        )
        parser.add_argument(
            "--output",
            help="Output/ destination file",
            metavar="<OUTPUTFILE>",
            **mandatory,
        )
        parser.add_argument(
            "--mode",
            "-o",
            dest="mode",
            help="Mode, 1: PDB to CIF, 2: CIF to PDB, 8: CIF to mmCIF, "
            "ma: PDB to mmCIF with some fixes",
            metavar="<NUM>",
            **mandatory,
        )
        args = parser.parse_args()
        if Path(args.input).is_file():
            return args

        print(f"No file '{args.input}' found.", file=sys.stderr)
        sys.exit(1)

    def _main():
        """Carry out the conversion and exit with its status."""
        opts = _parse_command_line()
        if opts.mode == "ma":
            # "ma" mode works on the file content and fixes it if needed
            open_input = _utils.get_opener(opts.input)
            with open_input(opts.input, "rt", encoding="ascii") as lfh:
                pdb_text = lfh.read()
            log = fixing_pdb2mmcif(pdb_text, opts.output)
            # A log is only handed back on failure.
            status = 1 if len(log) > 0 else 0
        else:
            # Every other mode is passed to MAXIT unchanged
            log, status = run_maxit_log2list(
                opts.input, opts.output, opts.mode
            )

        stream = sys.stdout if status == 0 else sys.stderr
        for line in log:
            print(line, file=stream)
        sys.exit(status)

    _main()