Skip to content

Commit 35b2000

Browse files
committed
deprecate Molecule.sequence, implement getSequence, make default key chain instead of segid
1 parent 6c67c60 commit 35b2000

File tree

9 files changed

+153
-43
lines changed

9 files changed

+153
-43
lines changed

moleculekit/align.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -136,10 +136,10 @@ def molTMalign(
136136
if refsel.sum() == 0:
137137
raise RuntimeError("No atoms in `refsel`")
138138

139-
seqx = mol.sequence(noseg=True, sel=molsel, _logger=False)["protein"].encode(
139+
seqx = mol.getSequence(dict_key=None, sel=molsel, _logger=False)["protein"].encode(
140140
"UTF-8"
141141
)
142-
seqy = ref.sequence(noseg=True, sel=refsel, _logger=False)["protein"].encode(
142+
seqy = ref.getSequence(dict_key=None, sel=refsel, _logger=False)["protein"].encode(
143143
"UTF-8"
144144
)
145145

moleculekit/molecule.py

Lines changed: 81 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1987,17 +1987,25 @@ def empty(self, numAtoms, numFrames=0):
19871987
self._emptyTraj(numAtoms, numFrames)
19881988
return self
19891989

1990-
def sequence(
1991-
self, oneletter=True, noseg=False, return_idx=False, sel="all", _logger=True
1990+
def getSequence(
1991+
self,
1992+
one_letter=True,
1993+
dict_key="chain",
1994+
return_idx=False,
1995+
sel="all",
1996+
_logger=True,
19921997
):
19931998
"""Return the aminoacid sequence of the Molecule.
19941999
19952000
Parameters
19962001
----------
1997-
oneletter : bool
2002+
one_letter : bool
19982003
Whether to return one-letter or three-letter AA codes. There should be only one atom per residue.
1999-
noseg : bool
2000-
Ignore segments and return the whole sequence as single string.
2004+
dict_key : str | None
2005+
If None, the function will return a dictionary with keys "protein" and "nucleic" (if they exist)
2006+
and the concatenated sequence as the value.
2007+
If "chain" or "segid" is passed, the function will return a dictionary with the sequence of each
2008+
chain or segment.
20012009
return_idx : bool
20022010
If True, the function also returns the indexes of the atoms of each residue in the sequence
20032011
sel : str
@@ -2006,35 +2014,50 @@ def sequence(
20062014
Returns
20072015
-------
20082016
sequence : str
2009-
The primary sequence as a dictionary segid - string (if oneletter is True) or segid - list of
2010-
strings (otherwise).
2017+
The primary sequence as a dictionary.
20112018
20122019
Examples
20132020
--------
2014-
>>> mol=tryp.copy()
2015-
>>> mol.sequence()
2016-
{'0': 'IVGGYTCGANTVPYQVSLNSGYHFCGGSLINSQWVVSAAHCYKSGIQVRLGEDNINVVEGNEQFISASKSIVHPSYNSNTLNNDIMLIKLKSAASLNSRVASISLPTSCASAGTQCLISGWGNTKSSGTSYPDVLKCLKAPILSDSSCKSAYPGQITSNMFCAGYLEGGKDSCQGDSGGPVVCSGKLQGIVSWGSGCAQKNKPGVYTKVCNYVSWIKQTIASN'}
2017-
>>> sh2 = Molecule("1LKK")
2018-
>>> pYseq = sh2.sequence(oneletter=False)
2019-
>>> pYseq['1']
2021+
>>> mol = Molecule("3PTB")
2022+
>>> mol.getSequence()
2023+
{'A': 'IVGGYTCGANTVPYQVSLNSGYHFCGGSLINSQWVVSAAHCYKSGIQVRLGEDNINVVEGNEQFISASKSIVHPSYNSNTLNNDIMLIKLKSAASLNSRVASISLPTSCASAGTQCLISGWGNTKSSGTSYPDVLKCLKAPILSDSSCKSAYPGQITSNMFCAGYLEGGKDSCQGDSGGPVVCSGKLQGIVSWGSGCAQKNKPGVYTKVCNYVSWIKQTIASN'}
2024+
>>> mol.getSequence(sel="resid 16 to 50")
2025+
{'A': 'IVGGYTCGANTVPYQVSLNSGYHFCGGSLINSQ'}
2026+
>>> mol = Molecule("1LKK")
2027+
>>> seq = mol.getSequence(one_letter=False, dict_key="chain")
2028+
>>> seq.keys()
2029+
dict_keys(['A', 'B'])
2030+
>>> seq['B']
20202031
['PTR', 'GLU', 'GLU', 'ILE']
2021-
>>> pYseq = sh2.sequence(oneletter=True)
2022-
>>> pYseq['1']
2032+
>>> seq = mol.getSequence(one_letter=True, dict_key="chain")
2033+
>>> seq['B']
20232034
'XEEI'
2024-
2035+
>>> seq = mol.getSequence(one_letter=True, dict_key="segid")
2036+
>>> seq.keys()
2037+
dict_keys(['1', '2'])
2038+
>>> seq, idx = mol.getSequence(return_idx=True)
2039+
>>> idx['B'][-1] # The atom indexes of the last residue in chain B
2040+
array([1718, 1719, 1720, 1721, 1722, 1723, 1724, 1725, 1726, 1727, 1728,
2041+
1729, 1730, 1731, 1732, 1733, 1734, 1735, 1736, 1737])
20252042
"""
20262043
from moleculekit.util import sequenceID
20272044

2045+
if dict_key is not None and dict_key not in ["segid", "chain"]:
2046+
raise ValueError(
2047+
f"Invalid dictionary key: {dict_key}. Allowed values are: segid, chain"
2048+
)
2049+
20282050
prot = self.atomselect("protein")
20292051
nucl = self.atomselect("nucleic")
20302052
selb = self.atomselect(sel)
20312053

20322054
increm = sequenceID((self.resid, self.insertion, self.chain))
2033-
segs = np.unique(self.segid[(prot | nucl) & selb])
20342055
segSequences = {}
20352056
seqAtoms = {}
2036-
if noseg:
2057+
if dict_key is None:
20372058
segs = ["protein", "nucleic"]
2059+
else:
2060+
segs = np.unique(getattr(self, dict_key)[(prot | nucl) & selb])
20382061

20392062
# Iterate over segments
20402063
for seg in segs:
@@ -2045,26 +2068,42 @@ def sequence(
20452068
elif seg == "nucleic":
20462069
segatoms = nucl & selb
20472070
else:
2048-
segatoms = (prot | nucl) & (self.segid == seg) & selb
2071+
segatoms = (prot | nucl) & (getattr(self, dict_key) == seg) & selb
20492072

20502073
seq, res_atm = _atoms_to_sequence(
20512074
self,
20522075
segatoms,
2053-
oneletter=oneletter,
2076+
oneletter=one_letter,
20542077
incremseg=increm[segatoms],
20552078
_logger=_logger,
20562079
)
20572080
segSequences[seg] = seq
20582081
seqAtoms[seg] = res_atm
20592082

20602083
# Join single letters into strings
2061-
if oneletter:
2084+
if one_letter:
20622085
segSequences = {k: "".join(segSequences[k]) for k in segSequences}
20632086

20642087
if return_idx:
20652088
return segSequences, seqAtoms
20662089
return segSequences
20672090

2091+
def sequence(
2092+
self, oneletter=True, noseg=False, return_idx=False, sel="all", _logger=True
2093+
):
2094+
"""DEPRECATED: Use getSequence instead."""
2095+
logger.warning(
2096+
"Molecule.sequence() method is deprecated. Please use the new Molecule.getSequence() method instead."
2097+
"Take care that the new method returns by default a dictionary with the chain IDs as keys instead of segment IDs."
2098+
)
2099+
return self.getSequence(
2100+
one_letter=oneletter,
2101+
dict_key=None if noseg else "segid",
2102+
return_idx=return_idx,
2103+
sel=sel,
2104+
_logger=_logger,
2105+
)
2106+
20682107
def dropFrames(self, drop=None, keep=None):
20692108
"""Removes trajectory frames from the Molecule
20702109
@@ -3268,6 +3307,27 @@ def mol_equal(
32683307
return True
32693308

32703309

3310+
def _get_residue_indices(mol):
3311+
"""
3312+
Get the indices of all residues in a Molecule object.
3313+
3314+
Parameters
3315+
----------
3316+
mol : Molecule
3317+
The Molecule object to get the residue indices from.
3318+
3319+
Returns
3320+
-------
3321+
residue_indices : list
3322+
A list of arrays, each containing the indices of the atoms in a residue.
3323+
"""
3324+
from moleculekit.util import sequenceID
3325+
3326+
unique_residues = np.unique(sequenceID((mol.resid, mol.insertion, mol.chain)))
3327+
3328+
return [np.where(unique_residues == uqresid)[0] for uqresid in set(unique_residues)]
3329+
3330+
32713331
def _detectCollisions(coords1, coords2, gap, remove_idx):
32723332
from moleculekit.distance_utils import get_collisions
32733333

moleculekit/opm.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -47,14 +47,14 @@ def generate_opm_sequences(opm_pdbs, outjson):
4747
continue
4848
sequences[name] = {}
4949
if molp.numAtoms:
50-
seq = molp.sequence()
50+
seq = molp.getSequence()
5151
for k in list(seq.keys()):
5252
if len(seq[k]) < 5 or all([ss == "X" for ss in seq[k]]):
5353
del seq[k]
5454
if len(seq):
5555
sequences[name]["protein"] = seq
5656
if moln.numAtoms:
57-
seq = moln.sequence()
57+
seq = moln.getSequence()
5858
for k in list(seq.keys()):
5959
if len(seq[k]) < 5 or all([ss == "X" for ss in seq[k]]):
6060
del seq[k]
@@ -211,8 +211,8 @@ def align_to_opm(mol, molsel="all", maxalignments=3, opmid=None, macrotype="prot
211211
# Throw away all other sequences
212212
sequences = {opmid.lower(): sequences[opmid.lower()]}
213213

214-
seqmol, molidx = mol.sequence(
215-
noseg=True, return_idx=True, sel=molsel, _logger=False
214+
seqmol, molidx = mol.getSequence(
215+
dict_key=None, return_idx=True, sel=molsel, _logger=False
216216
)
217217
seqmol = seqmol[macrotype]
218218
molidx = molidx[macrotype]
@@ -228,7 +228,7 @@ def align_to_opm(mol, molsel="all", maxalignments=3, opmid=None, macrotype="prot
228228
)
229229
ref, thickness = get_opm_pdb(pdbid, validateElements=False)
230230

231-
seqref, refidx = ref.sequence(noseg=True, return_idx=True, _logger=False)
231+
seqref, refidx = ref.getSequence(dict_key=None, return_idx=True, _logger=False)
232232
seqref = seqref[macrotype]
233233
refidx = refidx[macrotype]
234234

moleculekit/readers.py

Lines changed: 16 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -803,12 +803,22 @@ def MAEread(fname, frame=None, topoloc=None):
803803

804804

805805
def _getLocalPDB(fname):
806-
if "LOCAL_PDB_REPO" in os.environ and os.path.isfile(
807-
os.path.join(os.environ["LOCAL_PDB_REPO"], fname)
808-
):
809-
filepath = os.path.join(os.environ["LOCAL_PDB_REPO"], fname)
810-
logger.info(f"Using local copy for {fname}: {filepath}")
811-
return filepath
806+
if os.environ.get("LOCAL_PDB_REPO") is not None:
807+
if os.path.isfile(os.path.join(os.environ["LOCAL_PDB_REPO"], fname)):
808+
filepath = os.path.join(os.environ["LOCAL_PDB_REPO"], fname)
809+
logger.info(f"Using local copy for {fname}: {filepath}")
810+
return filepath
811+
elif len(fname) == 4:
812+
fname = fname.lower()
813+
filename = os.path.join(os.environ["LOCAL_PDB_REPO"], fname + ".cif")
814+
if os.path.isfile(filename):
815+
return filename
816+
filename = os.path.join(os.environ["LOCAL_PDB_REPO"], fname + ".bcif.gz")
817+
if os.path.isfile(filename):
818+
return filename
819+
filename = os.path.join(os.environ["LOCAL_PDB_REPO"], fname + ".pdb")
820+
if os.path.isfile(filename):
821+
return filename
812822
return None
813823

814824

moleculekit/tools/modelling.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -118,7 +118,7 @@ def model_gaps(
118118
mol_seg = mol.copy(sel=f"segid {segid}")
119119
mol_seg.write(pdbfile)
120120

121-
molseq = mol.sequence()[segid]
121+
molseq = mol.getSequence(dict_key="segid")[segid]
122122

123123
# -11 is gap creation penalty. -1 is gap extension penalty. Taken from https://www.arabidopsis.org/Blast/BLASToptions.jsp BLASTP options
124124
alignments = pairwise2.align.globalds(sequence, molseq, blosum62, -11.0, -1.0)

moleculekit/tools/sequencestructuralalignment.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,9 @@ def _get_sequence(mol: Molecule, sel):
1919
"Your selection contains both protein and nucleic residues. You need to clarify which selection to align."
2020
)
2121
molseg = "protein" if any(protein_mask) else "nucleic"
22-
seqmol, seqidx = mol.sequence(noseg=True, return_idx=True, sel=sel, _logger=False)
22+
seqmol, seqidx = mol.getSequence(
23+
dict_key=None, return_idx=True, sel=sel, _logger=False
24+
)
2325
seqidx = seqidx[molseg]
2426
seqmol = seqmol[molseg]
2527
segment_type = molseg

moleculekit/writers.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1314,7 +1314,7 @@ def __init__(self, mol):
13141314
protein = mol.atomselect("protein")
13151315
nucleic = mol.atomselect("nucleic")
13161316
water = mol.atomselect("water")
1317-
sequences = mol.sequence()
1317+
sequences = mol.getSequence()
13181318
insertions = []
13191319
self.group_id_list = []
13201320
chain_count = 0

pyproject.toml

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -49,11 +49,12 @@ requires = [
4949
]
5050
build-backend = "setuptools.build_meta"
5151

52+
[dependency-groups]
53+
dev = ["ipython>=8.18.1", "pytest>=8.4.2"]
54+
5255
[tool.pytest.ini_options]
53-
python_files = "*.py"
54-
python_classes = "_Test"
5556
python_functions = "_test*"
56-
norecursedirs = "test-data"
57+
testpaths = ["tests"]
5758

5859

5960
[tool.cibuildwheel]

tests/test_molecule.py

Lines changed: 40 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -233,15 +233,52 @@ def _test_reorderAtoms():
233233

234234

235235
def _test_sequence():
236-
seq, seqatms = MOL3PTB.sequence(return_idx=True)
236+
seq, seqatms = MOL3PTB.getSequence(return_idx=True)
237237
refseq = "IVGGYTCGANTVPYQVSLNSGYHFCGGSLINSQWVVSAAHCYKSGIQVRLGEDNINVVEGNEQFISASKSIVHPSYNSNTLNNDIMLIKLKSAASLNSRVASISLPTSCASAGTQCLISGWGNTKSSGTSYPDVLKCLKAPILSDSSCKSAYPGQITSNMFCAGYLEGGKDSCQGDSGGPVVCSGKLQGIVSWGSGCAQKNKPGVYTKVCNYVSWIKQTIASN"
238-
assert seq["1"] == refseq
238+
assert seq["A"] == refseq
239239

240240
# Ensure that the returned indexes only belong to a single residue
241-
for indexes in seqatms["1"]:
241+
for indexes in seqatms["A"]:
242242
assert len(np.unique(MOL3PTB.resname[indexes])) == 1
243243
assert len(np.unique(MOL3PTB.resid[indexes])) == 1
244244

245+
seq = MOL3PTB.getSequence(sel="resid 16 to 50")
246+
assert seq == {"A": "IVGGYTCGANTVPYQVSLNSGYHFCGGSLINSQ"}
247+
248+
mol = Molecule("1lkk")
249+
seq, idx = mol.getSequence(return_idx=True)
250+
assert seq == {
251+
"A": "LEPEPWFFKNLSRKDAERQLLAPGNTHGSFLIRESESTAGSFSLSVRDFDQNQGEVVKHYKIRNLDNGGFYISPRITFPGLHELVRHYTNASDGLCTRLSRPCQT",
252+
"B": "XEEI",
253+
}
254+
255+
refidx = np.array(
256+
[
257+
1688,
258+
1689,
259+
1690,
260+
1691,
261+
1692,
262+
1693,
263+
1694,
264+
1695,
265+
1696,
266+
1697,
267+
1698,
268+
1699,
269+
1700,
270+
1701,
271+
1702,
272+
]
273+
)
274+
assert np.array_equal(idx["B"][1], refidx)
275+
seq = mol.getSequence(dict_key="segid")
276+
assert seq == {
277+
"1": "LEPEPWFFKNLSRKDAERQLLAPGNTHGSFLIRESESTAGSFSLSVRDFDQNQGEVVKHYKIRNLDNGGFYISPRITFPGLHELVRHYTNASDGLCTRLSRPCQT",
278+
"2": "XEEI",
279+
}
280+
assert mol.getSequence(one_letter=False)["B"] == ["PTR", "GLU", "GLU", "ILE"]
281+
245282

246283
def _test_appendFrames():
247284
trajmol = TRAJMOL.copy()

0 commit comments

Comments
 (0)