deprecate Molecule.sequence, implement getSequence, make default key chain instead of segid

stefdoerr · stefdoerr · commit 35b2000b4419 · 2025-11-25T13:20:25.000+02:00
diff --git a/moleculekit/align.py b/moleculekit/align.py
@@ -136,10 +136,10 @@ def molTMalign(
     if refsel.sum() == 0:
         raise RuntimeError("No atoms in `refsel`")
 
-    seqx = mol.sequence(noseg=True, sel=molsel, _logger=False)["protein"].encode(
+    seqx = mol.getSequence(dict_key=None, sel=molsel, _logger=False)["protein"].encode(
         "UTF-8"
     )
-    seqy = ref.sequence(noseg=True, sel=refsel, _logger=False)["protein"].encode(
+    seqy = ref.getSequence(dict_key=None, sel=refsel, _logger=False)["protein"].encode(
         "UTF-8"
     )
 
diff --git a/moleculekit/molecule.py b/moleculekit/molecule.py
@@ -1987,17 +1987,25 @@ def empty(self, numAtoms, numFrames=0):
         self._emptyTraj(numAtoms, numFrames)
         return self
 
-    def sequence(
-        self, oneletter=True, noseg=False, return_idx=False, sel="all", _logger=True
+    def getSequence(
+        self,
+        one_letter=True,
+        dict_key="chain",
+        return_idx=False,
+        sel="all",
+        _logger=True,
     ):
         """Return the aminoacid sequence of the Molecule.
 
         Parameters
         ----------
-        oneletter : bool
+        one_letter : bool
             Whether to return one-letter or three-letter AA codes. There should be only one atom per residue.
-        noseg : bool
-            Ignore segments and return the whole sequence as single string.
+        dict_key : str | None
+            If None, the function will return a dictionary with keys "protein" and "nucleic" (if they exist)
+            and the concatenated sequence as the value.
+            If "chain" or "segid" is passed, the function will return a dictionary with the sequence of each
+            chain or segment.
         return_idx : bool
             If True, the function also returns the indexes of the atoms of each residue in the sequence
         sel : str
@@ -2006,35 +2014,50 @@ def sequence(
         Returns
         -------
         sequence : str
-            The primary sequence as a dictionary segid - string (if oneletter is True) or segid - list of
-            strings (otherwise).
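+            The primary sequence as a dictionary mapping each key (see `dict_key`) to a string (if one_letter is True) or a list of strings (otherwise).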
 
         Examples
         --------
-        >>> mol=tryp.copy()
-        >>> mol.sequence()
-        {'0': 'IVGGYTCGANTVPYQVSLNSGYHFCGGSLINSQWVVSAAHCYKSGIQVRLGEDNINVVEGNEQFISASKSIVHPSYNSNTLNNDIMLIKLKSAASLNSRVASISLPTSCASAGTQCLISGWGNTKSSGTSYPDVLKCLKAPILSDSSCKSAYPGQITSNMFCAGYLEGGKDSCQGDSGGPVVCSGKLQGIVSWGSGCAQKNKPGVYTKVCNYVSWIKQTIASN'}
-        >>> sh2 = Molecule("1LKK")
-        >>> pYseq = sh2.sequence(oneletter=False)
-        >>> pYseq['1']
+        >>> mol = Molecule("3PTB")
+        >>> mol.getSequence()
+        {'A': 'IVGGYTCGANTVPYQVSLNSGYHFCGGSLINSQWVVSAAHCYKSGIQVRLGEDNINVVEGNEQFISASKSIVHPSYNSNTLNNDIMLIKLKSAASLNSRVASISLPTSCASAGTQCLISGWGNTKSSGTSYPDVLKCLKAPILSDSSCKSAYPGQITSNMFCAGYLEGGKDSCQGDSGGPVVCSGKLQGIVSWGSGCAQKNKPGVYTKVCNYVSWIKQTIASN'}
+        >>> mol.getSequence(sel="resid 16 to 50")
+        {'A': 'IVGGYTCGANTVPYQVSLNSGYHFCGGSLINSQ'}
+        >>> mol = Molecule("1LKK")
+        >>> seq = mol.getSequence(one_letter=False, dict_key="chain")
+        >>> seq.keys()
+        dict_keys(['A', 'B'])
+        >>> seq['B']
         ['PTR', 'GLU', 'GLU', 'ILE']
-        >>> pYseq = sh2.sequence(oneletter=True)
-        >>> pYseq['1']
+        >>> seq = mol.getSequence(one_letter=True, dict_key="chain")
+        >>> seq['B']
         'XEEI'
-
+        >>> seq = mol.getSequence(one_letter=True, dict_key="segid")
+        >>> seq.keys()
+        dict_keys(['1', '2'])
+        >>> seq, idx = mol.getSequence(return_idx=True)
+        >>> idx['B'][-1] # The atom indexes of the last residue in chain B
+        array([1718, 1719, 1720, 1721, 1722, 1723, 1724, 1725, 1726, 1727, 1728,
+               1729, 1730, 1731, 1732, 1733, 1734, 1735, 1736, 1737])
         """
         from moleculekit.util import sequenceID
 
+        if dict_key is not None and dict_key not in ["segid", "chain"]:
+            raise ValueError(
+                f"Invalid dictionary key: {dict_key}. Allowed values are: segid, chain"
+            )
+
         prot = self.atomselect("protein")
         nucl = self.atomselect("nucleic")
         selb = self.atomselect(sel)
 
         increm = sequenceID((self.resid, self.insertion, self.chain))
-        segs = np.unique(self.segid[(prot | nucl) & selb])
         segSequences = {}
         seqAtoms = {}
-        if noseg:
+        if dict_key is None:
             segs = ["protein", "nucleic"]
+        else:
+            segs = np.unique(getattr(self, dict_key)[(prot | nucl) & selb])
 
         # Iterate over segments
         for seg in segs:
@@ -2045,26 +2068,42 @@ def sequence(
             elif seg == "nucleic":
                 segatoms = nucl & selb
             else:
-                segatoms = (prot | nucl) & (self.segid == seg) & selb
+                segatoms = (prot | nucl) & (getattr(self, dict_key) == seg) & selb
 
             seq, res_atm = _atoms_to_sequence(
                 self,
                 segatoms,
-                oneletter=oneletter,
+                oneletter=one_letter,
                 incremseg=increm[segatoms],
                 _logger=_logger,
             )
             segSequences[seg] = seq
             seqAtoms[seg] = res_atm
 
         # Join single letters into strings
-        if oneletter:
+        if one_letter:
             segSequences = {k: "".join(segSequences[k]) for k in segSequences}
 
         if return_idx:
             return segSequences, seqAtoms
         return segSequences
 
+    def sequence(
+        self, oneletter=True, noseg=False, return_idx=False, sel="all", _logger=True
+    ):
+        """DEPRECATED: Use getSequence instead. Forwards all arguments and keeps the old segid (or protein/nucleic) keys."""
+        logger.warning(
+            "Molecule.sequence() method is deprecated. Please use the new Molecule.getSequence() method instead. "
+            "Take care that the new method returns by default a dictionary with the chain IDs as keys instead of segment IDs."
+        )
+        return self.getSequence(
+            one_letter=oneletter,
+            dict_key=None if noseg else "segid",  # legacy behaviour: segid keys unless noseg was requested
+            return_idx=return_idx,
+            sel=sel,
+            _logger=_logger,
+        )
+
     def dropFrames(self, drop=None, keep=None):
         """Removes trajectory frames from the Molecule
 
@@ -3268,6 +3307,27 @@ def mol_equal(
     return True
 
 
+def _get_residue_indices(mol):
+    """
+    Get the atom indices of every residue in a Molecule object.
+
+    Parameters
+    ----------
+    mol : Molecule
+        The Molecule object to get the residue indices from.
+
+    Returns
+    -------
+    residue_indices : list
+        A list of arrays, each containing the indices of the atoms in a residue.
+    """
+    from moleculekit.util import sequenceID
+
+    # Keep the per-atom IDs: comparing against np.unique(ids) would yield one hit per residue, not its atoms
+    residue_ids = sequenceID((mol.resid, mol.insertion, mol.chain))
+    return [np.where(residue_ids == uqresid)[0] for uqresid in np.unique(residue_ids)]
+
+
 def _detectCollisions(coords1, coords2, gap, remove_idx):
     from moleculekit.distance_utils import get_collisions
 
diff --git a/moleculekit/opm.py b/moleculekit/opm.py
@@ -47,14 +47,14 @@ def generate_opm_sequences(opm_pdbs, outjson):
                     continue
                 sequences[name] = {}
                 if molp.numAtoms:
-                    seq = molp.sequence()
+                    seq = molp.getSequence()
                     for k in list(seq.keys()):
                         if len(seq[k]) < 5 or all([ss == "X" for ss in seq[k]]):
                             del seq[k]
                     if len(seq):
                         sequences[name]["protein"] = seq
                 if moln.numAtoms:
-                    seq = moln.sequence()
+                    seq = moln.getSequence()
                     for k in list(seq.keys()):
                         if len(seq[k]) < 5 or all([ss == "X" for ss in seq[k]]):
                             del seq[k]
@@ -211,8 +211,8 @@ def align_to_opm(mol, molsel="all", maxalignments=3, opmid=None, macrotype="prot
         # Throw away all other sequences
         sequences = {opmid.lower(): sequences[opmid.lower()]}
 
-    seqmol, molidx = mol.sequence(
-        noseg=True, return_idx=True, sel=molsel, _logger=False
+    seqmol, molidx = mol.getSequence(
+        dict_key=None, return_idx=True, sel=molsel, _logger=False
     )
     seqmol = seqmol[macrotype]
     molidx = molidx[macrotype]
@@ -228,7 +228,7 @@ def align_to_opm(mol, molsel="all", maxalignments=3, opmid=None, macrotype="prot
         )
         ref, thickness = get_opm_pdb(pdbid, validateElements=False)
 
-        seqref, refidx = ref.sequence(noseg=True, return_idx=True, _logger=False)
+        seqref, refidx = ref.getSequence(dict_key=None, return_idx=True, _logger=False)
         seqref = seqref[macrotype]
         refidx = refidx[macrotype]
 
diff --git a/moleculekit/readers.py b/moleculekit/readers.py
@@ -803,12 +803,22 @@ def MAEread(fname, frame=None, topoloc=None):
 
 
 def _getLocalPDB(fname):
-    if "LOCAL_PDB_REPO" in os.environ and os.path.isfile(
-        os.path.join(os.environ["LOCAL_PDB_REPO"], fname)
-    ):
-        filepath = os.path.join(os.environ["LOCAL_PDB_REPO"], fname)
-        logger.info(f"Using local copy for {fname}: {filepath}")
-        return filepath
+    """Return the path of `fname` inside $LOCAL_PDB_REPO, or None if no matching file exists."""
+    repo = os.environ.get("LOCAL_PDB_REPO")
+    if repo is not None:
+        # An exact file name match takes precedence over the extension lookup
+        candidates = [fname]
+        if len(fname) == 4:
+            # 4-character names (presumably PDB IDs) are lower-cased and tried with each extension in order
+            extensions = (".cif", ".bcif.gz", ".pdb")
+            candidates += [fname.lower() + ext for ext in extensions]
+        for candidate in candidates:
+            filepath = os.path.join(repo, candidate)
+            if os.path.isfile(filepath):
+                # Log every hit so the user always knows a local copy was used
+                logger.info(f"Using local copy for {fname}: {filepath}")
+                return filepath
+    # LOCAL_PDB_REPO is unset or holds no matching file
     return None
 
 
diff --git a/moleculekit/tools/modelling.py b/moleculekit/tools/modelling.py
@@ -118,7 +118,7 @@ def model_gaps(
         mol_seg = mol.copy(sel=f"segid {segid}")
         mol_seg.write(pdbfile)
 
-        molseq = mol.sequence()[segid]
+        molseq = mol.getSequence(dict_key="segid")[segid]
 
         # -11 is gap creation penalty. -1 is gap extension penalty. Taken from https://www.arabidopsis.org/Blast/BLASToptions.jsp BLASTP options
         alignments = pairwise2.align.globalds(sequence, molseq, blosum62, -11.0, -1.0)
diff --git a/moleculekit/tools/sequencestructuralalignment.py b/moleculekit/tools/sequencestructuralalignment.py
@@ -19,7 +19,9 @@ def _get_sequence(mol: Molecule, sel):
             "Your selection contains both protein and nucleic residues. You need to clarify which selection to align."
         )
     molseg = "protein" if any(protein_mask) else "nucleic"
-    seqmol, seqidx = mol.sequence(noseg=True, return_idx=True, sel=sel, _logger=False)
+    seqmol, seqidx = mol.getSequence(
+        dict_key=None, return_idx=True, sel=sel, _logger=False
+    )
     seqidx = seqidx[molseg]
     seqmol = seqmol[molseg]
     segment_type = molseg
diff --git a/moleculekit/writers.py b/moleculekit/writers.py
@@ -1314,7 +1314,7 @@ def __init__(self, mol):
             protein = mol.atomselect("protein")
             nucleic = mol.atomselect("nucleic")
             water = mol.atomselect("water")
-            sequences = mol.sequence()
+            sequences = mol.getSequence()
             insertions = []
             self.group_id_list = []
             chain_count = 0
diff --git a/pyproject.toml b/pyproject.toml
@@ -49,11 +49,12 @@ requires = [
 ]
 build-backend = "setuptools.build_meta"
 
+[dependency-groups]
+dev = ["ipython>=8.18.1", "pytest>=8.4.2"]
+
 [tool.pytest.ini_options]
-python_files = "*.py"
-python_classes = "_Test"
 python_functions = "_test*"
-norecursedirs = "test-data"
+testpaths = ["tests"]
 
 
 [tool.cibuildwheel]
diff --git a/tests/test_molecule.py b/tests/test_molecule.py
@@ -233,15 +233,52 @@ def _test_reorderAtoms():
 
 
 def _test_sequence():
-    seq, seqatms = MOL3PTB.sequence(return_idx=True)
+    seq, seqatms = MOL3PTB.getSequence(return_idx=True)
     refseq = "IVGGYTCGANTVPYQVSLNSGYHFCGGSLINSQWVVSAAHCYKSGIQVRLGEDNINVVEGNEQFISASKSIVHPSYNSNTLNNDIMLIKLKSAASLNSRVASISLPTSCASAGTQCLISGWGNTKSSGTSYPDVLKCLKAPILSDSSCKSAYPGQITSNMFCAGYLEGGKDSCQGDSGGPVVCSGKLQGIVSWGSGCAQKNKPGVYTKVCNYVSWIKQTIASN"
-    assert seq["1"] == refseq
+    assert seq["A"] == refseq
 
     # Ensure that the returned indexes only belong to a single residue
-    for indexes in seqatms["1"]:
+    for indexes in seqatms["A"]:
         assert len(np.unique(MOL3PTB.resname[indexes])) == 1
         assert len(np.unique(MOL3PTB.resid[indexes])) == 1
 
+    seq = MOL3PTB.getSequence(sel="resid 16 to 50")  # only residues matched by `sel` are returned
+    assert seq == {"A": "IVGGYTCGANTVPYQVSLNSGYHFCGGSLINSQ"}
+
+    mol = Molecule("1lkk")
+    seq, idx = mol.getSequence(return_idx=True)  # default dict_key="chain": keys are chain IDs
+    assert seq == {
+        "A": "LEPEPWFFKNLSRKDAERQLLAPGNTHGSFLIRESESTAGSFSLSVRDFDQNQGEVVKHYKIRNLDNGGFYISPRITFPGLHELVRHYTNASDGLCTRLSRPCQT",
+        "B": "XEEI",
+    }
+
+    refidx = np.array(
+        [
+            1688,
+            1689,
+            1690,
+            1691,
+            1692,
+            1693,
+            1694,
+            1695,
+            1696,
+            1697,
+            1698,
+            1699,
+            1700,
+            1701,
+            1702,
+        ]
+    )
+    assert np.array_equal(idx["B"][1], refidx)  # atom indexes of the second residue of chain B
+    seq = mol.getSequence(dict_key="segid")  # same sequences, keyed by segment ID instead of chain
+    assert seq == {
+        "1": "LEPEPWFFKNLSRKDAERQLLAPGNTHGSFLIRESESTAGSFSLSVRDFDQNQGEVVKHYKIRNLDNGGFYISPRITFPGLHELVRHYTNASDGLCTRLSRPCQT",
+        "2": "XEEI",
+    }
+    assert mol.getSequence(one_letter=False)["B"] == ["PTR", "GLU", "GLU", "ILE"]  # three-letter names come back as a list
+
 
 def _test_appendFrames():
     trajmol = TRAJMOL.copy()

Original file line number	Diff line number	Diff line change
`@@ -136,10 +136,10 @@ def molTMalign(`
`136`	`136`	`if refsel.sum() == 0:`
`137`	`137`	raise RuntimeError("No atoms in `refsel`")
`138`	`138`
`139`		`- seqx = mol.sequence(noseg=True, sel=molsel, _logger=False)["protein"].encode(`
	`139`	`+ seqx = mol.getSequence(dict_key=None, sel=molsel, _logger=False)["protein"].encode(`
`140`	`140`	`"UTF-8"`
`141`	`141`	`)`
`142`		`- seqy = ref.sequence(noseg=True, sel=refsel, _logger=False)["protein"].encode(`
	`142`	`+ seqy = ref.getSequence(dict_key=None, sel=refsel, _logger=False)["protein"].encode(`
`143`	`143`	`"UTF-8"`
`144`	`144`	`)`
`145`	`145`