@@ -1987,17 +1987,25 @@ def empty(self, numAtoms, numFrames=0):
19871987 self ._emptyTraj (numAtoms , numFrames )
19881988 return self
19891989
1990- def sequence (
1991- self , oneletter = True , noseg = False , return_idx = False , sel = "all" , _logger = True
1990+ def getSequence (
1991+ self ,
1992+ one_letter = True ,
1993+ dict_key = "chain" ,
1994+ return_idx = False ,
1995+ sel = "all" ,
1996+ _logger = True ,
19921997 ):
19931998 """Return the aminoacid sequence of the Molecule.
19941999
19952000 Parameters
19962001 ----------
1997- oneletter : bool
2002+ one_letter : bool
19982003 Whether to return one-letter or three-letter AA codes. There should be only one atom per residue.
1999- noseg : bool
2000- Ignore segments and return the whole sequence as single string.
2004+ dict_key : str | None
2005+ If None, the function will return a dictionary with keys "protein" and "nucleic" (if they exist)
2006+ and the concatenated sequence as the value.
2007+ If "chain" or "segid" is passed, the function will return a dictionary with the sequence of each
2008+ chain or segment.
20012009 return_idx : bool
20022010 If True, the function also returns the indexes of the atoms of each residue in the sequence
20032011 sel : str
@@ -2006,35 +2014,50 @@ def sequence(
20062014 Returns
20072015 -------
20082016 sequence : str
2009- The primary sequence as a dictionary segid - string (if oneletter is True) or segid - list of
2010- strings (otherwise).
2017+ The primary sequence as a dictionary.
20112018
20122019 Examples
20132020 --------
2014- >>> mol=tryp.copy()
2015- >>> mol.sequence()
2016- {'0': 'IVGGYTCGANTVPYQVSLNSGYHFCGGSLINSQWVVSAAHCYKSGIQVRLGEDNINVVEGNEQFISASKSIVHPSYNSNTLNNDIMLIKLKSAASLNSRVASISLPTSCASAGTQCLISGWGNTKSSGTSYPDVLKCLKAPILSDSSCKSAYPGQITSNMFCAGYLEGGKDSCQGDSGGPVVCSGKLQGIVSWGSGCAQKNKPGVYTKVCNYVSWIKQTIASN'}
2017- >>> sh2 = Molecule("1LKK")
2018- >>> pYseq = sh2.sequence(oneletter=False)
2019- >>> pYseq['1']
2021+ >>> mol = Molecule("3PTB")
2022+ >>> mol.getSequence()
2023+ {'A': 'IVGGYTCGANTVPYQVSLNSGYHFCGGSLINSQWVVSAAHCYKSGIQVRLGEDNINVVEGNEQFISASKSIVHPSYNSNTLNNDIMLIKLKSAASLNSRVASISLPTSCASAGTQCLISGWGNTKSSGTSYPDVLKCLKAPILSDSSCKSAYPGQITSNMFCAGYLEGGKDSCQGDSGGPVVCSGKLQGIVSWGSGCAQKNKPGVYTKVCNYVSWIKQTIASN'}
2024+ >>> mol.getSequence(sel="resid 16 to 50")
2025+ {'A': 'IVGGYTCGANTVPYQVSLNSGYHFCGGSLINSQ'}
2026+ >>> mol = Molecule("1LKK")
2027+ >>> seq = mol.getSequence(one_letter=False, dict_key="chain")
2028+ >>> seq.keys()
2029+ dict_keys(['A', 'B'])
2030+ >>> seq['B']
20202031 ['PTR', 'GLU', 'GLU', 'ILE']
2021- >>> pYseq = sh2.sequence(oneletter =True)
2022- >>> pYseq['1 ']
2032+ >>> seq = mol.getSequence(one_letter =True, dict_key="chain" )
2033+ >>> seq['B ']
20232034 'XEEI'
2024-
2035+ >>> seq = mol.getSequence(one_letter=True, dict_key="segid")
2036+ >>> seq.keys()
2037+ dict_keys(['1', '2'])
2038+ >>> seq, idx = mol.getSequence(return_idx=True)
2039+ >>> idx['B'][-1] # The atom indexes of the last residue in chain B
2040+ array([1718, 1719, 1720, 1721, 1722, 1723, 1724, 1725, 1726, 1727, 1728,
2041+ 1729, 1730, 1731, 1732, 1733, 1734, 1735, 1736, 1737])
20252042 """
20262043 from moleculekit .util import sequenceID
20272044
2045+ if dict_key is not None and dict_key not in ["segid" , "chain" ]:
2046+ raise ValueError (
2047+ f"Invalid dictionary key: { dict_key } . Allowed values are: segid, chain"
2048+ )
2049+
20282050 prot = self .atomselect ("protein" )
20292051 nucl = self .atomselect ("nucleic" )
20302052 selb = self .atomselect (sel )
20312053
20322054 increm = sequenceID ((self .resid , self .insertion , self .chain ))
2033- segs = np .unique (self .segid [(prot | nucl ) & selb ])
20342055 segSequences = {}
20352056 seqAtoms = {}
2036- if noseg :
2057+ if dict_key is None :
20372058 segs = ["protein" , "nucleic" ]
2059+ else :
2060+ segs = np .unique (getattr (self , dict_key )[(prot | nucl ) & selb ])
20382061
20392062 # Iterate over segments
20402063 for seg in segs :
@@ -2045,26 +2068,42 @@ def sequence(
20452068 elif seg == "nucleic" :
20462069 segatoms = nucl & selb
20472070 else :
2048- segatoms = (prot | nucl ) & (self . segid == seg ) & selb
2071+ segatoms = (prot | nucl ) & (getattr ( self , dict_key ) == seg ) & selb
20492072
20502073 seq , res_atm = _atoms_to_sequence (
20512074 self ,
20522075 segatoms ,
2053- oneletter = oneletter ,
2076+ oneletter = one_letter ,
20542077 incremseg = increm [segatoms ],
20552078 _logger = _logger ,
20562079 )
20572080 segSequences [seg ] = seq
20582081 seqAtoms [seg ] = res_atm
20592082
20602083 # Join single letters into strings
2061- if oneletter :
2084+ if one_letter :
20622085 segSequences = {k : "" .join (segSequences [k ]) for k in segSequences }
20632086
20642087 if return_idx :
20652088 return segSequences , seqAtoms
20662089 return segSequences
20672090
2091+ def sequence (
2092+ self , oneletter = True , noseg = False , return_idx = False , sel = "all" , _logger = True
2093+ ):
2094+ """DEPRECATED: Use getSequence instead."""
2095+ logger .warning (
2096+ "Molecule.sequence() method is deprecated. Please use the new Molecule.getSequence() method instead."
2097+ "Take care that the new method returns by default a dictionary with the chain IDs as keys instead of segment IDs."
2098+ )
2099+ return self .getSequence (
2100+ one_letter = oneletter ,
2101+ dict_key = None if noseg else "segid" ,
2102+ return_idx = return_idx ,
2103+ sel = sel ,
2104+ _logger = _logger ,
2105+ )
2106+
20682107 def dropFrames (self , drop = None , keep = None ):
20692108 """Removes trajectory frames from the Molecule
20702109
@@ -3268,6 +3307,27 @@ def mol_equal(
32683307 return True
32693308
32703309
def _get_residue_indices(mol):
    """
    Get the atom indices of all residues in a Molecule object.

    Atoms are grouped by the label which sequenceID computes from the
    (resid, insertion, chain) fields of the molecule.

    Parameters
    ----------
    mol : Molecule
        The Molecule object to get the residue indices from.

    Returns
    -------
    residue_indices : list
        A list of arrays, each containing the indices of the atoms in a residue.
        Residues are ordered by ascending sequenceID label and the atom indices
        inside each array are ascending. An empty list is returned if there are no atoms.
    """
    from moleculekit.util import sequenceID

    # NOTE(review): sequenceID is expected to return one integer label per atom,
    # with atoms of the same residue sharing a label - confirm against moleculekit.util
    residue_ids = np.asarray(sequenceID((mol.resid, mol.insertion, mol.chain)))
    if residue_ids.size == 0:
        return []

    # The labels must be compared per atom. Searching the de-duplicated labels
    # would only ever find a single position per residue instead of its atoms.
    # A stable sort keeps the atom indices of each residue in ascending order.
    atom_order = np.argsort(residue_ids, kind="stable")
    _, atoms_per_residue = np.unique(residue_ids, return_counts=True)

    # Cut the sorted atom indices at the end of every residue except the last one
    return np.split(atom_order, np.cumsum(atoms_per_residue)[:-1])
3329+
3330+
32713331def _detectCollisions (coords1 , coords2 , gap , remove_idx ):
32723332 from moleculekit .distance_utils import get_collisions
32733333
0 commit comments