"""`MAXIT`_ from `RCSB`_ converts coordinate files in PDB legacy format to CIF
and CIF files to mmCIF. This module also adds functionality to turn a PDB file
into a (minimalist) ModelCIF file. But don't get too excited - none of the
functionality will turn a PDB file into a fully annotated ModelCIF file. It just
makes sure the starting point is of valid CIF syntax. Extra data still need to be
added...
`MAXIT`_ is not bundled with this module. The source code can be downloaded
`here <https://sw-tools.rcsb.org/apps/MAXIT/source.html>`_.
`Installation instructions <https://sw-tools.rcsb.org/apps/MAXIT/README-source>`_
are available, and here is a TL;DR on how to compile on macOS and most Linux
distributions:

.. code-block:: bash

    # cd into the unpacked source directory first
    export RCSBROOT=$(pwd)
    make
    make binary
    # binaries are found in bin/
    # RCSBROOT needs to point at data/ when running maxit
"""
# Copyright (c) 2026, SIB - Swiss Institute of Bioinformatics and
# Biozentrum - University of Basel
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
from itertools import product, count
from pathlib import Path
import gzip
import os
import subprocess
import tempfile
import gemmi
import ihm
import modelcif.dumper
import modelcif.reader
from modelarchive import _utils
# Resolved once, when this module is imported: the ``MAXIT_BINARY``
# environment variable wins, otherwise the bare command name ``maxit`` is used.
MAXIT_BINARY = os.environ.get("MAXIT_BINARY", "maxit")
"""Path to the maxit binary, defaults to ``maxit`` from ``$PATH``.
Can be overridden by setting the ``MAXIT_BINARY`` environment variable
before import.
"""
[docs]
def run_maxit(infile, outfile, mode, logfile=None):
    """Invoke the MAXIT executable once, exactly as requested.

    The arguments are not validated, the operation mode is not chosen
    automatically and nothing is cleaned up afterwards.

    Args:
        infile (~pathlib.Path | str): File handed to MAXIT via ``-input``.
            Either PDB legacy format or CIF.
        outfile (~pathlib.Path | str): File handed to MAXIT via ``-output``.
        mode (str): MAXIT operation mode. ``"1"`` converts PDB to CIF,
            ``"2"`` CIF to PDB and ``"8"`` CIF to mmCIF.
        logfile (~pathlib.Path | str, optional): If given, handed to MAXIT
            via ``-log`` to collect its log messages.

    Returns:
        subprocess.CompletedProcess: Result of the MAXIT run.

    Raises:
        subprocess.CalledProcessError: If MAXIT exits with a non-zero status
            (the process is run with ``check=True``).
    """
    argv = [
        MAXIT_BINARY,
        "-input",
        os.fspath(infile),
        "-output",
        os.fspath(outfile),
        "-o",
        mode,
    ]
    # The log option is only passed on when a log file was requested.
    argv += [] if logfile is None else ["-log", os.fspath(logfile)]
    return subprocess.run(argv, check=True)
[docs]
def run_maxit_log2list(infile, outfile, mode):
    """Run MAXIT and hand back its log messages together with the exit status.

    MAXIT writes its log into a temporary file which is read back line by
    line. A non-zero exit status is reported through the return value, not
    as an exception.

    Args:
        infile (~pathlib.Path | str): Input file. Either PDB legacy format or
            CIF (also as gzip).
        outfile (~pathlib.Path | str): Output file. If the filename ends with
            ``.gz`` or ``.gzip``, the output will be compressed.
        mode (str): MAXIT operation mode. Use ``"1"`` for PDB to CIF, ``"2"``
            for CIF to PDB, ``"8"`` for CIF to mmCIF.

    Returns:
        tuple[list[str], int]: The whitespace-stripped lines of the log file
        and the MAXIT exit status.
    """
    exit_status = 0
    # Note: `delete_on_close=False` (Python >= 3.12) is not used here because
    # Python >= 3.10 is supported.
    with tempfile.NamedTemporaryFile(mode="r") as log_fh:
        try:
            finished = run_maxit(infile, outfile, mode, logfile=log_fh.name)
            exit_status = finished.returncode
        except subprocess.CalledProcessError as err:
            exit_status = err.returncode
        messages = [raw.strip() for raw in log_fh]

    # MAXIT wrote plain text; compress it in place if the name asks for it.
    target = Path(outfile)
    if target.suffix.lower() in (".gz", ".gzip") and target.exists():
        plain = target.read_bytes()
        with gzip.open(outfile, "wb") as gz_fh:
            gz_fh.write(plain)

    return (messages, exit_status)
def _format2format(infile, outfile, mode):
    """Shared implementation behind the convenience converters.

    Args:
        infile (~pathlib.Path | str): Input file for MAXIT.
        outfile (~pathlib.Path | str): Output file for MAXIT.
        mode (str): MAXIT operation mode.

    Returns:
        list[str]: MAXIT log messages if the exit status is non-zero, an
        empty list otherwise.

    Raises:
        RuntimeError: If ``RCSBROOT`` environment variable is not set.
    """
    if os.environ.get("RCSBROOT") is None:
        raise RuntimeError("RCSBROOT environment variable is not set.")

    messages, exit_status = run_maxit_log2list(infile, outfile, mode)
    # The log is only of interest to the caller when MAXIT failed.
    return messages if exit_status != 0 else []
[docs]
def pdb2cif(infile, outfile):
    """Turn a PDB legacy format file into CIF by running MAXIT in mode 1.

    Log messages are only handed back if the conversion failed.

    Args:
        infile (~pathlib.Path | str): Input file in PDB legacy format (also as
            gzip).
        outfile (~pathlib.Path | str): Output CIF file. If the filename
            ends with ``.gz`` or ``.gzip``, the output will be compressed.

    Returns:
        list[str]: MAXIT log messages on failure, empty list on success.

    Raises:
        RuntimeError: If ``RCSBROOT`` environment variable is not set.
    """
    return _format2format(infile, outfile, mode="1")
[docs]
def cif2mmcif(infile, outfile):
    """Turn a CIF file into mmCIF by running MAXIT in mode 8.

    Log messages are only handed back if the conversion failed.

    Args:
        infile (~pathlib.Path | str): Input CIF file (also as gzip).
        outfile (~pathlib.Path | str): Output mmCIF file. If the filename ends
            with ``.gz`` or ``.gzip``, the output will be compressed.

    Returns:
        list[str]: MAXIT log messages on failure, empty list on success.

    Raises:
        RuntimeError: If ``RCSBROOT`` environment variable is not set.
    """
    return _format2format(infile, outfile, mode="8")
[docs]
def pdb2mmcif(infile, outfile):
    """Turn a PDB legacy format file into mmCIF with two MAXIT runs.

    The first run converts PDB to CIF, the second one converts that
    intermediate result (stored at ``outfile``) to mmCIF. Log messages are
    only handed back if a step failed.

    Args:
        infile (~pathlib.Path | str): Input file in PDB legacy format (also as
            gzip).
        outfile (~pathlib.Path | str): Output mmCIF file. If the filename ends
            with ``.gz`` or ``.gzip``, the output will be compressed.

    Returns:
        list[str]: MAXIT log messages on failure, empty list on success.
        On failure, the first element indicates which conversion step
        failed.

    Raises:
        RuntimeError: If ``RCSBROOT`` environment variable is not set.
    """
    # (converter, its input, headline put in front of the log on failure)
    steps = (
        (pdb2cif, infile, "PDB to CIF conversion failed"),
        (cif2mmcif, outfile, "CIF to mmCIF conversion failed"),
    )
    for convert, source, headline in steps:
        messages = convert(source, outfile)
        if messages:
            messages.insert(0, headline)
            return messages
    # Both steps succeeded; this is the empty log of the last step.
    return messages
[docs]
def fixing_pdb2mmcif(pdb_as_string, outfile):
    """Convert PDB legacy format text to mmCIF, repairing known issues first.

    Chains without a name get a fresh, unused name (one new chain per
    subchain) before the conversion. Log messages are only handed back if
    the conversion failed.

    Args:
        pdb_as_string (str): Input file content in PDB legacy format.
        outfile (~pathlib.Path | str): Output mmCIF file. If the filename
            ends with ``.gz`` or ``.gzip``, the output will be compressed.

    Returns:
        list[str]: MAXIT log messages on failure, empty list on success.

    Raises:
        RuntimeError: If ``RCSBROOT`` environment variable is not set.
    """

    def _next_free_name(taken):
        """Return the first chain name not in ``taken`` and reserve it."""
        alphabet = (
            "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
            "0123456789"
            "abcdefghijklmnopqrstuvwxyz"
        )
        # Candidates are tried shortest first: A, B, ..., z, AA, AB, ...
        for width in count(1):
            for letters in product(alphabet, repeat=width):
                candidate = "".join(letters)
                if candidate not in taken:
                    taken.add(candidate)
                    return candidate
        raise RuntimeError("unreachable")  # pragma: no cover

    # fix PDB input
    structure = gemmi.read_pdb_string(pdb_as_string, split_chain_on_ter=True)
    structure.setup_entities()
    structure.assign_label_seq_id()

    renamed_any = False
    for model in structure:
        taken = {chain.name for chain in model}
        if all(chain.name for chain in model):
            # every chain of this model already carries a name
            continue

        replacements = []
        for chain in model:
            if chain.name:
                continue
            for subchain in chain.subchains():
                fresh = gemmi.Chain(_next_free_name(taken))
                fresh.append_residues(subchain)
                replacements.append(fresh)
        for fresh in replacements:
            model.add_chain(fresh)
        # `remove_chain()` removes ALL occurrences of chain name
        model.remove_chain("")
        renamed_any = True

    if renamed_any:
        # The input underwent fixing, so the repaired structure (with
        # deduplicated entities) is written out as the new MAXIT input.
        structure.add_entity_types()
        structure.assign_subchains(force=True)
        structure.ensure_entities()
        structure.deduplicate_entities()
        groups = gemmi.MmcifOutputGroups(all=True)
        structure.make_mmcif_document(groups).write_file(os.fspath(outfile))
        return cif2mmcif(outfile, outfile)

    # Nothing to fix: store the PDB text as-is and convert it to mmCIF.
    with open(outfile, "w", encoding="ascii") as pdb_fh:
        pdb_fh.write(pdb_as_string)
    return pdb2mmcif(outfile, outfile)
def _get_not_modeled_residues(model):
    """Yield a NotModeledResidueRange for every stretch of residues of a
    polymeric assembly member that no Atom object of the Model refers to."""
    # The following code is adapted from python-modelcif
    # Copyright (c) 2018-2025 IHM Working Group
    #
    # Permission is hereby granted, free of charge, to any person obtaining a
    # copy of this software and associated documentation files (the "Software"),
    # to deal in the Software without restriction, including without limitation
    # the rights to use, copy, modify, merge, publish, distribute, sublicense,
    # and/or sell copies of the Software, and to permit persons to whom the
    # Software is furnished to do so, subject to the following conditions:
    #
    # The above copyright notice and this permission notice shall be included in
    # all copies or substantial portions of the Software.
    #
    # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    # DEALINGS IN THE SOFTWARE.
    # Function needs to access protected members to do its job, allow Pylint
    # pylint: disable=protected-access
    for member in model.assembly:
        # An assembly member either wraps an asym unit (``.asym``) or is one.
        asym = member.asym if hasattr(member, "asym") else member
        if not asym.entity.is_polymeric():
            continue

        # Residue indices of this asym that are covered by Atom objects
        modeled_ids = {
            atom.seq_id for atom in model._atoms if atom.asym_unit is asym
        }
        modeled_ranges = ihm.util._make_range_from_list(sorted(modeled_ids))

        # Whatever lies within the member's range but outside the modeled
        # ranges is reported as not modeled.
        gaps = ihm.util._invert_ranges(
            modeled_ranges,
            end=member.seq_id_range[1],
            start=member.seq_id_range[0],
        )
        for gap in gaps:
            yield modelcif.model.NotModeledResidueRange(asym, gap[0], gap[1])
def _sanitize_modelcif(system):
    """Fill in what a System lacks to be written as ModelCIF.

    A missing title and a missing modeling protocol get defaults, unknown
    entity descriptions are set to ``"target"`` and the not-modeled residue
    ranges of every model are added. The System is modified in place and
    returned.
    """
    # The following code is adapted from python-modelcif
    # Copyright (c) 2018-2025 IHM Working Group
    #
    # Permission is hereby granted, free of charge, to any person obtaining a
    # copy of this software and associated documentation files (the "Software"),
    # to deal in the Software without restriction, including without limitation
    # the rights to use, copy, modify, merge, publish, distribute, sublicense,
    # and/or sell copies of the Software, and to permit persons to whom the
    # Software is furnished to do so, subject to the following conditions:
    #
    # The above copyright notice and this permission notice shall be included in
    # all copies or substantial portions of the Software.
    #
    # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
    # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
    # DEALINGS IN THE SOFTWARE.
    if not system.title:
        system.title = "Auto-generated system"

    if not system.protocols:
        # Provide a protocol consisting of a single generic modeling step.
        protocol = modelcif.protocol.Protocol()
        protocol.steps.append(
            modelcif.protocol.ModelingStep(
                name="modeling", input_data=None, output_data=None
            )
        )
        system.protocols.append(protocol)

    every_model = (
        model for group in system.model_groups for model in group
    )
    for model in every_model:
        # Entity description is also used by python-modelcif for
        # ma_data.name, which is mandatory, so it cannot be unknown/?
        for asym in model.assembly:
            if asym.entity.description is ihm.unknown:
                asym.entity.description = "target"
        model.not_modeled_residue_ranges.extend(
            _get_not_modeled_residues(model)
        )

    return system
# Replacement for an internal class of python-modelcif, allow protected member
# access in Pylint
# pylint: disable=protected-access
class _MAQAMetricDumper(modelcif.dumper._QAMetricDumper):
    """Get all ma_qa_metric records, even those missing actual values."""

    def dump_metric_types(self, system, writer):  # pragma: no cover
        """Write the ``_ma_qa_metric`` loop from ``system._qa_by_id``.

        ``system._qa_by_id`` is used instead of
        ``self._metric_classes_by_id``.
        """
        # Excluded from unit testing because copy of original code with
        # system._qa_by_id replacing the original self._metric_classes_by_id to
        # get QA from associated files.
        columns = [
            "id",
            "name",
            "description",
            "type",
            "mode",
            "type_other_details",
            "software_group_id",
        ]
        with writer.loop("_ma_qa_metric", columns) as table:
            for metric_id, metric in system._qa_by_id.items():
                software = metric.software
                group_id = software._group_id if software else None
                # The dictionary key serves as id: missing qa_metrics have
                # not been instantiated, yet. So they don't have a ._id
                table.write(
                    id=metric_id,
                    name=metric.name,
                    description=metric.description,
                    type=metric.type,
                    mode=metric.mode,
                    type_other_details=metric.other_details,
                    software_group_id=group_id,
                )
# pylint: enable=protected-access
class _MAVariant(modelcif.dumper.ModelCIFVariant):
    """Wrapper class for writing ModelCIF files.

    We need a writer that preserves ma_qa_metric entries that have the actual
    scores in an accompanying file.
    """

    def __init__(self):
        """Exchange the QA metric dumper for :class:`_MAQAMetricDumper`.

        The exchange is done on a copy of the dumper list owned by this
        instance, so the parent variant keeps its original dumpers.
        """
        super().__init__()
        # ``_dumpers`` is inherited and may be a list shared with the parent
        # class. Replacing an entry in place would then silently change
        # modelcif.dumper.ModelCIFVariant for everything else in the process,
        # hence the private copy.
        self._dumpers = list(self._dumpers)

        ma_qa_metric_idx = None
        for i, dumper in enumerate(self._dumpers):
            if dumper == modelcif.dumper._QAMetricDumper:
                ma_qa_metric_idx = i
            elif dumper == _MAQAMetricDumper:
                # Replacement is already in place, nothing left to do.
                return

        if ma_qa_metric_idx is not None:
            self._dumpers[ma_qa_metric_idx] = _MAQAMetricDumper
        else:  # pragma: no cover
            # Defensive fallback if modelcif.dumper._QAMetricDumper is not
            # found in _dumpers, e.g. after a python-modelcif API change.
            _utils.warn_msg("modelcif.dumper._QAMetricDumper not found")
[docs]
def cif2modelcif(infile, outfile):
    """Convert a CIF file into a minimalist ModelCIF file.

    Sanitizes the input and adds mandatory ModelCIF categories. The input
    file is expected to be in mmCIF format as produced by MAXIT. The input is
    read completely before the output is opened, so ``infile`` and
    ``outfile`` may be the same file.

    Args:
        infile (~pathlib.Path | str): Input mmCIF/ ModelCIF file (also as gzip).
        outfile (~pathlib.Path | str): Output ModelCIF file. If the filename
            ends with ``.gz`` or ``.gzip``, the output will be compressed.

    Returns:
        ~pathlib.Path | str: Path to the output file.

    Raises:
        RuntimeError: If reading or writing the CIF file fails.
    """
    opener_in = _utils.get_opener(infile)
    opener_out = _utils.get_opener(outfile)
    try:
        # Parse and sanitize everything first. Opening the output for writing
        # truncates it, and callers such as pdb2modelcif() convert in place
        # (infile == outfile); with both files open at once the input would
        # already be empty by the time it is read.
        with opener_in(infile, "rt", encoding="utf8") as fh:
            systems = [
                _sanitize_modelcif(s) for s in modelcif.reader.read(fh)
            ]
        with opener_out(outfile, "wt", encoding="ascii") as fhout:
            modelcif.dumper.write(fhout, systems, variant=_MAVariant)
    except Exception as e:
        # Any failure is reported as RuntimeError, the type callers catch.
        raise RuntimeError(
            f"Error sanitizing mmCIF file with mmcif: {str(e)}"
        ) from e
    return outfile
[docs]
def pdb2modelcif(infile, outfile):
    """Turn a PDB legacy format file into a minimalist ModelCIF file.

    The PDB content is first converted to mmCIF (fixing known issues on the
    way), then the mmCIF file at ``outfile`` is rewritten as ModelCIF. Log
    messages are only handed back if a step failed.

    Args:
        infile (~pathlib.Path | str): Input file in PDB legacy format (also as
            gzip).
        outfile (~pathlib.Path | str): Output ModelCIF file. If the filename
            ends with ``.gz`` or ``.gzip``, the output will be compressed.

    Returns:
        list[str]: Error log on failure, empty list on success.

    Raises:
        RuntimeError: If ``RCSBROOT`` environment variable is not set.
    """
    open_input = _utils.get_opener(infile)
    with open_input(infile, "rt", encoding="ascii") as pdb_fh:
        messages = fixing_pdb2mmcif(pdb_fh.read(), outfile)
    if messages:
        return messages

    # The mmCIF file now sits at ``outfile`` and is upgraded in place.
    try:
        cif2modelcif(outfile, outfile)
    except RuntimeError as err:
        return [str(err)]
    else:
        return []
[docs]
def coordfile2modelcif(infile, outfile):
    """Turn a macromolecular structure file into a minimalist ModelCIF file.

    The file extension decides whether :func:`pdb2modelcif` or
    :func:`cif2modelcif` does the work. A trailing ``.gz`` is skipped when
    looking at the extension.

    Args:
        infile (~pathlib.Path | str): Input file in PDB or CIF format.
            Supported extensions: ``.pdb``, ``.cif``, ``.mmcif``, and their
            ``.gz`` variants.
        outfile (~pathlib.Path | str): Output ModelCIF file. If the filename
            ends with ``.gz`` or ``.gzip``, the output will be compressed.

    Returns:
        list[str]: Error log on failure, empty list on success.

    Raises:
        RuntimeError: If ``RCSBROOT`` environment variable is not set.
        ValueError: If the file extension is not supported.
    """
    infile = Path(infile)
    kind = infile.suffix.lower()
    if kind == ".gz":
        # Look at the extension in front of the compression suffix.
        kind = Path(infile.stem).suffix.lower()

    if kind == ".pdb":
        return pdb2modelcif(infile, outfile)
    if kind not in (".cif", ".mmcif"):
        raise ValueError(f"Unsupported file extension: {infile.suffix}")

    try:
        cif2modelcif(infile, outfile)
    except RuntimeError as err:
        return [str(err)]
    return []
[docs]
def main():
    """Entry point for the ``ma-maxit`` command line tool.

    Parses the command line, runs the requested conversion, prints the
    collected log messages and exits with the resulting status.
    """
    # For main functions we allow bad imports
    # pylint: disable=import-outside-toplevel
    import argparse
    import sys

    def _read_options():
        """Parse the command line; exit with status 1 if the input file
        does not exist."""
        parser = argparse.ArgumentParser(
            description="Run RCSB MAXIT from Python "
            + "(https://sw-tools.rcsb.org/apps/MAXIT/index.html)",
        )
        # The two file options only differ in flag, help text and metavar.
        file_options = (
            ("--input", "Input/ source file", "<INPUTFILE>"),
            ("--output", "Output/ destination file", "<OUTPUTFILE>"),
        )
        for flag, text, placeholder in file_options:
            parser.add_argument(
                flag,
                help=text,
                metavar=placeholder,
                required=True,
                type=str,
            )
        parser.add_argument(
            "--mode",
            "-o",
            dest="mode",
            help="Mode, 1: PDB to CIF, 2: CIF to PDB, 8: CIF to mmCIF, "
            + "ma: PDB to mmCIF with some fixes",
            metavar="<NUM>",
            required=True,
            type=str,
        )
        options = parser.parse_args()
        if not Path(options.input).is_file():
            print(f"No file '{options.input}' found.", file=sys.stderr)
            sys.exit(1)
        return options

    options = _read_options()
    if options.mode == "ma":
        # PDB to mmCIF including fixes; a non-empty log means failure.
        open_input = _utils.get_opener(options.input)
        with open_input(options.input, "rt", encoding="ascii") as pdb_fh:
            pdb_text = pdb_fh.read()
        messages = fixing_pdb2mmcif(pdb_text, options.output)
        exit_status = 1 if messages else 0
    else:
        # Plain MAXIT run, its exit status is passed through.
        messages, exit_status = run_maxit_log2list(
            options.input, options.output, options.mode
        )

    # Log lines of a failed run belong on stderr.
    stream = sys.stderr if exit_status else sys.stdout
    for line in messages:
        print(line, file=stream)
    sys.exit(exit_status)