Source code for modelarchive.tools.maxit

"""`MAXIT`_ from `RCSB`_ converts coordinate files in PDB legacy format to CIF
and CIF files to mmCIF. This module also adds functionality to turn a PDB file
into a (minimalist) ModelCIF file. But don't get too excited - none of the
functionality will turn a PDB file into a fully annotated ModelCIF file. It just
makes sure the starting point is of valid CIF syntax. Extra data still need to be
added...

`MAXIT`_ is not bundled with this module. The source code can be downloaded
`here <https://sw-tools.rcsb.org/apps/MAXIT/source.html>`_.
`Installation instructions <https://sw-tools.rcsb.org/apps/MAXIT/README-source>`_
are available, and here is a TL;DR on how to compile it on macOS and most
Linux distributions:

.. code-block:: bash

   # cd into the unpacked source directory first
   export RCSBROOT=$(pwd)
   make
   make binary
   # binaries are found in bin/
   # RCSBROOT needs to point at data/ when running maxit
"""

# Copyright (c) 2026, SIB - Swiss Institute of Bioinformatics and
#                     Biozentrum - University of Basel
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

from itertools import product, count
from pathlib import Path
import gzip
import os
import subprocess
import tempfile

import gemmi
import ihm
import modelcif.dumper
import modelcif.reader

from modelarchive import _utils

MAXIT_BINARY = os.getenv("MAXIT_BINARY", "maxit")
"""Location of the maxit executable, ``maxit`` from ``$PATH`` by default.

Set the ``MAXIT_BINARY`` environment variable before importing this module
to point at a different binary.
"""


def run_maxit(infile, outfile, mode, logfile=None):
    """Invoke the MAXIT binary as-is: no checks, no mode selection, no cleanup.

    Args:
        infile (~pathlib.Path | str): Input file. Either PDB legacy format or
            CIF.
        outfile (~pathlib.Path | str): Output file.
        mode (str): MAXIT operation mode. Use ``"1"`` for PDB to CIF, ``"2"``
            for CIF to PDB, ``"8"`` for CIF to mmCIF.
        logfile (~pathlib.Path | str, optional): File for MAXIT log messages.
            The ``-log`` option is only passed when this is not ``None``.

    Returns:
        subprocess.CompletedProcess: Result of the MAXIT run.

    Raises:
        subprocess.CalledProcessError: If MAXIT exits with a non-zero status
            (the process is run with ``check=True``).
    """
    # Assemble the command line option by option, in the order MAXIT is
    # called: input, output, operation mode and (optionally) the log file.
    cmd = [MAXIT_BINARY]
    cmd += ["-input", os.fspath(infile)]
    cmd += ["-output", os.fspath(outfile)]
    cmd += ["-o", mode]
    if logfile is not None:
        cmd += ["-log", os.fspath(logfile)]

    return subprocess.run(cmd, check=True)
def run_maxit_log2list(infile, outfile, mode):
    """Run MAXIT and return the log file content as a list.

    MAXIT writes its log into a temporary file which is read back after the
    run, one stripped line per list element. A non-zero exit status of MAXIT
    is not raised but reported via the returned status.

    Args:
        infile (~pathlib.Path | str): Input file. Either PDB legacy format or
            CIF (also as gzip).
        outfile (~pathlib.Path | str): Output file. If the filename ends with
            ``.gz`` or ``.gzip``, the output will be compressed.
        mode (str): MAXIT operation mode. Use ``"1"`` for PDB to CIF, ``"2"``
            for CIF to PDB, ``"8"`` for CIF to mmCIF.

    Returns:
        tuple[list[str], int]: A tuple of the log file content as a list of
        strings and the MAXIT exit status.
    """
    extstts = 0
    # Note: `delete_on_close=False` (Python >= 3.12) would be more performant
    # but we support Python >= 3.10.
    with tempfile.NamedTemporaryFile(mode="r") as mxtlog:
        try:
            prc = run_maxit(infile, outfile, mode, logfile=mxtlog.name)
            extstts = prc.returncode
        except subprocess.CalledProcessError as exc:
            # A failed run is reported through the exit status, the log
            # still gets collected below.
            extstts = exc.returncode
        log = [line.strip() for line in mxtlog]

    outpath = Path(outfile)
    if outpath.suffix.lower() in (".gz", ".gzip") and outpath.exists():
        content = outpath.read_bytes()
        # Only compress data that is not gzip already (gzip streams start
        # with the magic bytes 0x1f 0x8b). Without this guard, a compressed
        # file left untouched by a failed MAXIT run would be compressed a
        # second time and become unreadable.
        if not content.startswith(b"\x1f\x8b"):
            with gzip.open(outpath, "wb") as fh:
                fh.write(content)

    return (log, extstts)
def _format2format(infile, outfile, mode):
    """Shared backend of the convenience converters.

    Makes sure ``RCSBROOT`` is set, runs MAXIT in the requested mode and
    returns the MAXIT log only if the run failed.

    Args:
        infile (~pathlib.Path | str): Input file.
        outfile (~pathlib.Path | str): Output file.
        mode (str): MAXIT operation mode.

    Returns:
        list[str]: MAXIT log messages on failure, empty list on success.

    Raises:
        RuntimeError: If ``RCSBROOT`` environment variable is not set.
    """
    if os.getenv("RCSBROOT") is None:
        raise RuntimeError("RCSBROOT environment variable is not set.")

    messages, status = run_maxit_log2list(infile, outfile, mode)
    # The log is only of interest to callers when MAXIT did not succeed.
    return messages if status != 0 else []
def pdb2cif(infile, outfile):
    """Turn a PDB legacy format file into a CIF file with MAXIT.

    Log messages are only returned if the conversion fails.

    Args:
        infile (~pathlib.Path | str): Input file in PDB legacy format (also as
            gzip).
        outfile (~pathlib.Path | str): Output CIF file. If the filename ends
            with ``.gz`` or ``.gzip``, the output will be compressed.

    Returns:
        list[str]: MAXIT log messages on failure, empty list on success.

    Raises:
        RuntimeError: If ``RCSBROOT`` environment variable is not set.
    """
    maxit_mode = "1"  # MAXIT operation mode: PDB to CIF
    return _format2format(infile, outfile, maxit_mode)
def cif2mmcif(infile, outfile):
    """Turn a CIF file into an mmCIF file with MAXIT.

    Log messages are only returned if the conversion fails.

    Args:
        infile (~pathlib.Path | str): Input CIF file (also as gzip).
        outfile (~pathlib.Path | str): Output mmCIF file. If the filename ends
            with ``.gz`` or ``.gzip``, the output will be compressed.

    Returns:
        list[str]: MAXIT log messages on failure, empty list on success.

    Raises:
        RuntimeError: If ``RCSBROOT`` environment variable is not set.
    """
    maxit_mode = "8"  # MAXIT operation mode: CIF to mmCIF
    return _format2format(infile, outfile, maxit_mode)
def pdb2mmcif(infile, outfile):
    """Turn a PDB legacy format file into an mmCIF file with MAXIT.

    Two MAXIT runs are chained: PDB to CIF first, then the intermediate CIF
    (written to ``outfile``) is converted to mmCIF in place. Log messages are
    only returned if a step fails.

    Args:
        infile (~pathlib.Path | str): Input file in PDB legacy format (also as
            gzip).
        outfile (~pathlib.Path | str): Output mmCIF file. If the filename ends
            with ``.gz`` or ``.gzip``, the output will be compressed.

    Returns:
        list[str]: MAXIT log messages on failure, empty list on success. On
        failure, the first element indicates which conversion step failed.

    Raises:
        RuntimeError: If ``RCSBROOT`` environment variable is not set.
    """
    # (converter, source file, headline put in front of the log on failure)
    steps = (
        (pdb2cif, infile, "PDB to CIF conversion failed"),
        (cif2mmcif, outfile, "CIF to mmCIF conversion failed"),
    )
    log = []
    for convert, source, headline in steps:
        log = convert(source, outfile)
        if log:
            # Stop at the first failing step and tell the caller which one.
            log.insert(0, headline)
            break
    return log
def fixing_pdb2mmcif(pdb_as_string, outfile):
    """Convert PDB legacy format content to mmCIF, repairing known issues.

    Chains without a name get a fresh, unused chain name (one per subchain)
    before conversion. If such a repair happened, the structure is written as
    CIF by gemmi and then converted to mmCIF by MAXIT. Otherwise the
    unmodified input is written to ``outfile`` and converted via
    :func:`pdb2mmcif`. Log messages are only returned if conversion fails.

    Args:
        pdb_as_string (str): Input file content in PDB legacy format.
        outfile (~pathlib.Path | str): Output mmCIF file. If the filename ends
            with ``.gz`` or ``.gzip``, the output will be compressed.

    Returns:
        list[str]: MAXIT log messages on failure, empty list on success.

    Raises:
        RuntimeError: If ``RCSBROOT`` environment variable is not set.
    """

    def _get_cname(used_chain_names):
        """Return the first unused chain name and register it as used.

        Candidates are tried shortest first, built from upper case letters,
        digits and lower case letters (in that order).
        """
        alphabet = (
            "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
            "0123456789"
            "abcdefghijklmnopqrstuvwxyz"
        )
        for length in count(1):
            for letters in product(alphabet, repeat=length):
                candidate = "".join(letters)
                if candidate not in used_chain_names:
                    used_chain_names.add(candidate)
                    return candidate
        raise RuntimeError("unreachable")  # pragma: no cover

    # Parse the PDB input
    entry = gemmi.read_pdb_string(pdb_as_string, split_chain_on_ter=True)
    entry.setup_entities()
    entry.assign_label_seq_id()

    fixed = False
    for mdl in entry:
        # Chain names are tracked per model
        used_chain_names = {chn.name for chn in mdl}
        if "" not in used_chain_names:
            # every chain of this model has a name, nothing to repair
            continue

        # Build a named replacement chain for every subchain of the unnamed
        # chains first, the model is only modified after iterating it.
        replacements = []
        for chn in mdl:
            if chn.name:
                continue
            for schn in chn.subchains():
                nchn = gemmi.Chain(_get_cname(used_chain_names))
                nchn.append_residues(schn)
                replacements.append(nchn)
        for nchn in replacements:
            mdl.add_chain(nchn)
        # `remove_chain()` removes ALL occurrences of chain name
        mdl.remove_chain("")
        fixed = True

    if not fixed:
        # Input is fine as it is, hand it to MAXIT unmodified
        with open(outfile, "w", encoding="ascii") as ofh:
            ofh.write(pdb_as_string)
        return pdb2mmcif(outfile, outfile)

    # Input underwent fixing and needs to be written as new input file. Redo
    # the entity setup for the renamed chains, including deduplication.
    entry.add_entity_types()
    entry.assign_subchains(force=True)
    entry.ensure_entities()
    entry.deduplicate_entities()
    mog = gemmi.MmcifOutputGroups(all=True)
    entry.make_mmcif_document(mog).write_file(os.fspath(outfile))
    return cif2mmcif(outfile, outfile)
def _get_not_modeled_residues(model):
    """Yield NotModeledResidueRange objects for all residue ranges in the
    Model that are not referenced by Atom objects.

    Only members of ``model.assembly`` with a polymeric entity are
    considered. For each of them, the sequence IDs found in the atoms of the
    model are collected and every gap inside the member's ``seq_id_range`` is
    reported.

    Args:
        model: Model providing ``assembly`` and ``_atoms``.

    Yields:
        modelcif.model.NotModeledResidueRange: One object per residue range
        without atoms.
    """
    # The following code is adapted from python-modelcif
    # Copyright (c) 2018-2025 IHM Working Group
    #
    # Permission is hereby granted, free of charge, to any person obtaining a
    # copy of this software and associated documentation files (the "Software"),
    # to deal in the Software without restriction, including without limitation
    # the rights to use, copy, modify, merge, publish, distribute, sublicense,
    # and/or sell copies of the Software, and to permit persons to whom the
    # Software is furnished to do so, subject to the following conditions:
    #
    # The above copyright notice and this permission notice shall be included in
    # all copies or substantial portions of the Software.
    #
    # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    # DEALINGS IN THE SOFTWARE.

    # Function needs to access protected members to do its job, allow Pylint
    # pylint: disable=protected-access
    for assem in model.assembly:
        # Assembly members either carry the asym unit in an `asym` attribute
        # or are used as the asym unit themselves.
        asym = assem.asym if hasattr(assem, "asym") else assem
        if not asym.entity.is_polymeric():
            continue
        # Make a set of all residue indices of this asym "handled"
        # by being modeled with Atom objects
        handled_residues = set()
        for atom in model._atoms:
            if atom.asym_unit is asym:
                handled_residues.add(atom.seq_id)
        # Convert set to a list of residue ranges
        handled_residues = ihm.util._make_range_from_list(
            sorted(handled_residues)
        )
        # Return not-modeled for each non-handled range
        for r in ihm.util._invert_ranges(
            handled_residues,
            end=assem.seq_id_range[1],
            start=assem.seq_id_range[0],
        ):
            yield modelcif.model.NotModeledResidueRange(asym, r[0], r[1])


def _sanitize_modelcif(system):
    """Sanitize the mmCIF file to be used by the model archive and add
    modelcif categories.

    Fills in defaults on ``system`` in place: a title if there is none, a
    protocol with a single "modeling" step if there are no protocols, the
    entity description "target" where it is unknown, and the not-modeled
    residue ranges of every model.

    Args:
        system: System as read by ``modelcif.reader.read()``.

    Returns:
        The same ``system`` object, after modification.
    """
    # The following code is adapted from python-modelcif
    # Copyright (c) 2018-2025 IHM Working Group
    #
    # Permission is hereby granted, free of charge, to any person obtaining a
    # copy of this software and associated documentation files (the "Software"),
    # to deal in the Software without restriction, including without limitation
    # the rights to use, copy, modify, merge, publish, distribute, sublicense,
    # and/or sell copies of the Software, and to permit persons to whom the
    # Software is furnished to do so, subject to the following conditions:
    #
    # The above copyright notice and this permission notice shall be included in
    # all copies or substantial portions of the Software.
    #
    # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    # DEALINGS IN THE SOFTWARE.
    if not system.title:
        system.title = "Auto-generated system"
    if not system.protocols:
        # No protocol given: add a minimal one with a single modeling step
        default_protocol = modelcif.protocol.Protocol()
        step = modelcif.protocol.ModelingStep(
            name="modeling", input_data=None, output_data=None
        )
        default_protocol.steps.append(step)
        system.protocols.append(default_protocol)
    for model_group in system.model_groups:
        for model in model_group:
            # Entity description is also used by python-modelcif for
            # ma_data.name, which is mandatory, so it cannot be unknown/?
            for asym in model.assembly:
                if asym.entity.description is ihm.unknown:
                    asym.entity.description = "target"
            model.not_modeled_residue_ranges.extend(
                _get_not_modeled_residues(model)
            )
    return system


# Replacement for an internal class of python-modelcif, allow protected member
# access in Pylint
# pylint: disable=protected-access
class _MAQAMetricDumper(modelcif.dumper._QAMetricDumper):
    """Get all ma_qa_metric records, even those missing actual values."""

    def dump_metric_types(self, system, writer):  # pragma: no cover
        """Use system._qa_by_id instead of self._metric_classes_by_id.

        Writes one ``_ma_qa_metric`` row per entry of ``system._qa_by_id``.
        """
        # Excluded from unit testing because copy of original code with
        # system._qa_by_id replacing the original self._metric_classes_by_id to
        # get QA from associated files.
        with writer.loop(
            "_ma_qa_metric",
            [
                "id",
                "name",
                "description",
                "type",
                "mode",
                "type_other_details",
                "software_group_id",
            ],
        ) as lp:
            for i, m in system._qa_by_id.items():
                # use i for the id as missing qa_metrics have not been
                # instantiated, yet. So they don't have a ._id
                lp.write(
                    id=i,
                    name=m.name,
                    description=m.description,
                    type=m.type,
                    mode=m.mode,
                    type_other_details=m.other_details,
                    software_group_id=(
                        m.software._group_id if m.software else None
                    ),
                )


# pylint: enable=protected-access
class _MAVariant(modelcif.dumper.ModelCIFVariant):
    """Wrapper class for writing ModelCIF files.

    We need a writer that preserves ma_qa_metric entries that have the actual
    scores in an accompanying file.
    """

    def __init__(self):
        """Exchange dumpers: put _MAQAMetricDumper in place of
        modelcif.dumper._QAMetricDumper in the list of dumpers."""
        super().__init__()
        ma_qa_metric_idx = None
        for i, dumper in enumerate(self._dumpers):
            if dumper == modelcif.dumper._QAMetricDumper:
                ma_qa_metric_idx = i
            elif dumper == _MAQAMetricDumper:
                # Replacement dumper is in the list already, nothing to do.
                # NOTE(review): presumably happens on repeated instantiation
                # because `_dumpers` is a list shared on class level - confirm
                # against python-modelcif.
                return
        if ma_qa_metric_idx is not None:
            # NOTE(review): if `_dumpers` is a class-level list, this
            # assignment also affects other users of that list - confirm.
            self._dumpers[ma_qa_metric_idx] = _MAQAMetricDumper
        else:  # pragma: no cover
            # Defensive fallback if modelcif.dumper._QAMetricDumper is not
            # found in _dumpers, e.g. after a python-modelcif API change.
            _utils.warn_msg("modelcif.dumper._QAMetricDumper not found")
def cif2modelcif(infile, outfile):
    """Convert a CIF file into a minimalist ModelCIF file.

    Sanitizes the input and adds mandatory ModelCIF categories. The input file
    is expected to be in mmCIF format as produced by MAXIT.

    The input is read and sanitized completely before the output file is
    opened, so ``infile`` and ``outfile`` may refer to the same file.

    Args:
        infile (~pathlib.Path | str): Input mmCIF/ ModelCIF file (also as
            gzip).
        outfile (~pathlib.Path | str): Output ModelCIF file. If the filename
            ends with ``.gz`` or ``.gzip``, the output will be compressed.

    Returns:
        ~pathlib.Path | str: Path to the output file.

    Raises:
        RuntimeError: If reading or writing the CIF file fails.
    """
    opener_in = _utils.get_opener(infile)
    opener_out = _utils.get_opener(outfile)
    try:
        # Read everything BEFORE opening the output: opening a file in "wt"
        # mode truncates it, which would wipe the input of an in-place
        # conversion (infile == outfile) before it has been parsed.
        with opener_in(infile, "rt", encoding="utf8") as fh:
            systems = [
                _sanitize_modelcif(s) for s in modelcif.reader.read(fh)
            ]
        with opener_out(outfile, "wt", encoding="ascii") as fhout:
            modelcif.dumper.write(fhout, systems, variant=_MAVariant)
        return outfile
    except Exception as e:
        # Callers only deal with RuntimeError, keep the original exception
        # as cause.
        raise RuntimeError(
            f"Error sanitizing mmCIF file with mmcif: {str(e)}"
        ) from e
def pdb2modelcif(infile, outfile):
    """Turn a PDB legacy format file into a minimalist ModelCIF file.

    The PDB content is first converted to mmCIF (with fixes for known issues
    applied), then the mmCIF file is turned into ModelCIF in place. Messages
    are only returned if one of the two steps fails.

    Args:
        infile (~pathlib.Path | str): Input file in PDB legacy format (also as
            gzip).
        outfile (~pathlib.Path | str): Output ModelCIF file. If the filename
            ends with ``.gz`` or ``.gzip``, the output will be compressed.

    Returns:
        list[str]: Error log on failure, empty list on success.

    Raises:
        RuntimeError: If ``RCSBROOT`` environment variable is not set.
    """
    open_input = _utils.get_opener(infile)
    with open_input(infile, "rt", encoding="ascii") as fh:
        pdb_text = fh.read()
        maxit_log = fixing_pdb2mmcif(pdb_text, outfile)
    if maxit_log:
        # PDB to mmCIF failed, no point in going on
        return maxit_log

    try:
        cif2modelcif(outfile, outfile)
    except RuntimeError as exc:
        # Report the failure like a MAXIT log, as list of messages
        return [str(exc)]
    return []
def coordfile2modelcif(infile, outfile):
    """Turn a macromolecular structure file into a minimalist ModelCIF file.

    The file extension decides whether :func:`pdb2modelcif` or
    :func:`cif2modelcif` does the job. For ``.gz`` files, the extension in
    front of ``.gz`` is used.

    Args:
        infile (~pathlib.Path | str): Input file in PDB or CIF format.
            Supported extensions: ``.pdb``, ``.cif``, ``.mmcif``, and their
            ``.gz`` variants.
        outfile (~pathlib.Path | str): Output ModelCIF file. If the filename
            ends with ``.gz`` or ``.gzip``, the output will be compressed.

    Returns:
        list[str]: Error log on failure, empty list on success.

    Raises:
        RuntimeError: If ``RCSBROOT`` environment variable is not set.
        ValueError: If the file extension is not supported.
    """
    infile = Path(infile)
    extension = infile.suffix.lower()
    if extension == ".gz":
        # Compressed file, the format is given by the extension before ".gz"
        extension = Path(infile.stem).suffix.lower()

    if extension == ".pdb":
        return pdb2modelcif(infile, outfile)

    if extension not in (".cif", ".mmcif"):
        raise ValueError(f"Unsupported file extension: {infile.suffix}")

    try:
        cif2modelcif(infile, outfile)
    except RuntimeError as exc:
        # Report the failure as list of messages, like the PDB branch does
        return [str(exc)]
    return []
def main():
    """Entry point for the ``ma-maxit`` command line tool.

    Parses the command line, runs MAXIT in the requested mode (or the fixing
    PDB to mmCIF conversion for mode ``ma``), prints the log and exits with
    the resulting status. The log goes to stdout on success and to stderr on
    failure. Exits with status 1 if the input file does not exist.
    """
    # For main functions we allow bad imports
    # pylint: disable=import-outside-toplevel
    import argparse
    import sys

    # Command line interface
    parser = argparse.ArgumentParser(
        description=(
            "Run RCSB MAXIT from Python "
            "(https://sw-tools.rcsb.org/apps/MAXIT/index.html)"
        ),
    )
    parser.add_argument(
        "--input",
        help="Input/ source file",
        metavar="<INPUTFILE>",
        required=True,
        type=str,
    )
    parser.add_argument(
        "--output",
        help="Output/ destination file",
        metavar="<OUTPUTFILE>",
        required=True,
        type=str,
    )
    parser.add_argument(
        "--mode",
        "-o",
        dest="mode",
        help=(
            "Mode, 1: PDB to CIF, 2: CIF to PDB, 8: CIF to mmCIF, "
            "ma: PDB to mmCIF with some fixes"
        ),
        metavar="<NUM>",
        required=True,
        type=str,
    )
    opts = parser.parse_args()

    if not Path(opts.input).is_file():
        print(f"No file '{opts.input}' found.", file=sys.stderr)
        sys.exit(1)

    # Run the conversion
    if opts.mode == "ma":
        opener = _utils.get_opener(opts.input)
        with opener(opts.input, "rt", encoding="ascii") as lfh:
            pdblines = lfh.read()
        log = fixing_pdb2mmcif(pdblines, opts.output)
        # The fixing converter only returns a log if something went wrong
        extstts = 1 if log else 0
    else:
        log, extstts = run_maxit_log2list(opts.input, opts.output, opts.mode)

    # Report: log goes to stdout on success, to stderr otherwise
    ostream = sys.stdout if extstts == 0 else sys.stderr
    for line in log:
        print(line, file=ostream)
    sys.exit(extstts)