2929import ihm .util
3030import os
3131import argparse
32-
33-
34- def add_ihm_info (s , fix_histidines ):
32+ import collections
33+ import operator
34+
35+
36+ # All canonical atom names for each standard residue type, as per CCD.
37+ # This is generated using the util/get_ccd_atoms.py script.
38+ KNOWN_ATOM_NAMES = {
39+ 'A' : {"C4'" , "C2'" , 'C2' , "C1'" , 'N7' , 'H62' , 'OP2' , 'N3' , 'C5' , 'P' ,
40+ "H5''" , 'H2' , "C5'" , 'H61' , "H3'" , 'C4' , 'N1' , 'H8' , "H1'" , 'C8' ,
41+ 'N9' , 'HOP3' , 'OP1' , "O4'" , "H2'" , "HO2'" , 'OP3' , "O3'" , 'N6' ,
42+ 'HOP2' , "O5'" , "O2'" , "HO3'" , "H5'" , "C3'" , 'C6' , "H4'" },
43+ 'ALA' : {'H2' , 'HB1' , 'HB3' , 'HB2' , 'N' , 'HXT' , 'O' , 'CB' , 'C' , 'HA' , 'CA' ,
44+ 'H' , 'OXT' },
45+ 'ARG' : {'HB2' , 'CG' , 'NE' , 'H' , 'H2' , 'HH22' , 'N' , 'HG2' , 'CA' , 'NH2' ,
46+ 'HH11' , 'HG3' , 'HH21' , 'CZ' , 'HB3' , 'HXT' , 'O' , 'C' , 'HD3' , 'HH12' ,
47+ 'CB' , 'NH1' , 'CD' , 'HA' , 'HD2' , 'HE' , 'OXT' },
48+ 'ASN' : {'H2' , 'HB3' , 'HD22' , 'HB2' , 'N' , 'CG' , 'O' , 'CB' , 'ND2' , 'HXT' ,
49+ 'C' , 'HA' , 'HD21' , 'CA' , 'OD1' , 'H' , 'OXT' },
50+ 'ASP' : {'H2' , 'HB3' , 'HB2' , 'N' , 'CG' , 'O' , 'CB' , 'HXT' , 'C' , 'HA' , 'OD2' ,
51+ 'CA' , 'OD1' , 'HD2' , 'H' , 'OXT' },
52+ 'C' : {"C4'" , "C2'" , 'C2' , 'O2' , 'H42' , 'H5' , "C1'" , 'OP2' , 'N3' , 'C5' ,
53+ 'P' , "H5''" , 'H41' , 'H6' , "C5'" , "H3'" , 'C4' , 'N1' , 'N4' , "H1'" ,
54+ 'HOP3' , 'OP1' , "O4'" , "H2'" , "HO2'" , 'OP3' , "O3'" , 'HOP2' , "O5'" ,
55+ "O2'" , "HO3'" , "H5'" , "C3'" , 'C6' , "H4'" },
56+ 'CYS' : {'H2' , 'HB3' , 'HB2' , 'N' , 'SG' , 'O' , 'CB' , 'HXT' , 'C' , 'HA' , 'HG' ,
57+ 'CA' , 'H' , 'OXT' },
58+ 'DA' : {"C4'" , "C2'" , 'C2' , "C1'" , 'N7' , 'H62' , 'OP2' , 'N3' , 'C5' , 'P' ,
59+ "H5''" , 'H2' , "C5'" , 'H61' , "H3'" , 'C4' , 'N1' , 'H8' , "H1'" , 'C8' ,
60+ 'N9' , 'HOP3' , 'OP1' , "O4'" , "H2'" , 'OP3' , "O3'" , 'N6' , 'HOP2' ,
61+ "O5'" , "H2''" , "HO3'" , "H5'" , "C3'" , 'C6' , "H4'" },
62+ 'DC' : {"C4'" , "C2'" , 'C2' , 'O2' , 'H42' , 'H5' , "C1'" , 'OP2' , 'N3' , 'C5' ,
63+ 'P' , "H5''" , 'H41' , 'H6' , "C5'" , "H3'" , 'C4' , 'N1' , 'N4' , "H1'" ,
64+ 'HOP3' , 'OP1' , "O4'" , "H2'" , 'OP3' , "O3'" , 'HOP2' , "O5'" , "H2''" ,
65+ "HO3'" , "H5'" , "C3'" , 'C6' , "H4'" },
66+ 'DG' : {"C4'" , "C2'" , 'C2' , "C1'" , 'N7' , 'OP2' , 'N3' , 'C5' , 'P' , "H5''" ,
67+ "C5'" , 'O6' , 'H1' , "H3'" , 'C4' , 'N1' , 'H8' , "H1'" , 'C8' , 'N9' ,
68+ 'HOP3' , 'OP1' , "O4'" , "H2'" , 'OP3' , "O3'" , 'HOP2' , "O5'" , "H2''" ,
69+ 'H21' , 'H22' , "HO3'" , "H5'" , "C3'" , 'N2' , 'C6' , "H4'" },
70+ 'DT' : {"C4'" , "C2'" , 'C2' , 'O2' , 'O4' , "C1'" , 'OP2' , 'N3' , 'C5' , 'P' ,
71+ "H5''" , 'H6' , "C5'" , "H3'" , 'C4' , 'N1' , 'C7' , "H1'" , 'H73' , 'HOP3' ,
72+ 'H3' , 'OP1' , "O4'" , "H2'" , 'OP3' , "O3'" , 'HOP2' , "O5'" , "H2''" ,
73+ 'H71' , "HO3'" , "H5'" , "C3'" , 'H72' , 'C6' , "H4'" },
74+ 'G' : {"C4'" , "C2'" , 'C2' , "C1'" , 'N7' , 'OP2' , 'N3' , 'C5' , 'P' , "H5''" ,
75+ "C5'" , 'O6' , 'H1' , "H3'" , 'C4' , 'N1' , 'H8' , "H1'" , 'C8' , 'N9' ,
76+ 'HOP3' , 'OP1' , "O4'" , "H2'" , "HO2'" , 'OP3' , "O3'" , 'HOP2' , "O5'" ,
77+ "O2'" , 'H21' , 'H22' , "HO3'" , "H5'" , "C3'" , 'N2' , 'C6' , "H4'" },
78+ 'GLN' : {'HB2' , 'CG' , 'H' , 'H2' , 'N' , 'HG2' , 'HE22' , 'CA' , 'HG3' , 'HE21' ,
79+ 'HB3' , 'HXT' , 'O' , 'NE2' , 'C' , 'OE1' , 'CB' , 'CD' , 'HA' , 'OXT' },
80+ 'GLU' : {'HB2' , 'CG' , 'H' , 'H2' , 'N' , 'HG2' , 'CA' , 'HG3' , 'HB3' , 'HXT' ,
81+ 'O' , 'HE2' , 'C' , 'OE2' , 'OE1' , 'CB' , 'CD' , 'HA' , 'OXT' },
82+ 'GLY' : {'HA3' , 'HXT' , 'CA' , 'O' , 'HA2' , 'H' , 'N' , 'C' , 'H2' , 'OXT' },
83+ 'HIS' : {'HB2' , 'CG' , 'CE1' , 'HE1' , 'H' , 'ND1' , 'H2' , 'N' , 'CA' , 'HD1' ,
84+ 'HB3' , 'HXT' , 'O' , 'HE2' , 'NE2' , 'C' , 'CD2' , 'CB' , 'HA' , 'HD2' ,
85+ 'OXT' },
86+ 'ILE' : {'HD11' , 'CG1' , 'H' , 'HD12' , 'H2' , 'N' , 'CA' , 'HD13' , 'HG13' ,
87+ 'HXT' , 'O' , 'HB' , 'C' , 'CD1' , 'HG23' , 'HG22' , 'HG21' , 'HG12' ,
88+ 'CB' , 'CG2' , 'HA' , 'OXT' },
89+ 'LEU' : {'HD11' , 'HB2' , 'HD22' , 'CG' , 'HD21' , 'H' , 'HD12' , 'H2' , 'N' ,
90+ 'HD23' , 'CA' , 'HD13' , 'HB3' , 'HXT' , 'O' , 'C' , 'CD2' , 'CD1' , 'CB' ,
91+ 'HA' , 'HG' , 'OXT' },
92+ 'LYS' : {'HB2' , 'CG' , 'CE' , 'H' , 'H2' , 'N' , 'HG2' , 'HE3' , 'CA' , 'HG3' ,
93+ 'HB3' , 'HXT' , 'O' , 'HE2' , 'HZ1' , 'HZ3' , 'C' , 'HD3' , 'CB' , 'CD' ,
94+ 'HA' , 'HZ2' , 'HD2' , 'NZ' , 'OXT' },
95+ 'MET' : {'HB2' , 'CG' , 'HE1' , 'CE' , 'H' , 'H2' , 'N' , 'HG2' , 'HE3' , 'CA' ,
96+ 'HG3' , 'SD' , 'HB3' , 'HXT' , 'O' , 'HE2' , 'C' , 'CB' , 'HA' , 'OXT' },
97+ 'PHE' : {'HB2' , 'CG' , 'CE1' , 'HE1' , 'H' , 'H2' , 'N' , 'HZ' , 'CA' , 'HD1' ,
98+ 'CZ' , 'HB3' , 'HXT' , 'O' , 'HE2' , 'C' , 'CD2' , 'CD1' , 'CB' , 'CE2' ,
99+ 'HA' , 'HD2' , 'OXT' },
100+ 'PRO' : {'HB3' , 'HB2' , 'N' , 'CG' , 'O' , 'CB' , 'HG2' , 'HXT' , 'CD' , 'C' , 'HA' ,
101+ 'CA' , 'HD2' , 'H' , 'HG3' , 'HD3' , 'OXT' },
102+ 'SER' : {'H2' , 'HB3' , 'HB2' , 'N' , 'HXT' , 'O' , 'CB' , 'C' , 'HA' , 'HG' , 'CA' ,
103+ 'H' , 'OG' , 'OXT' },
104+ 'THR' : {'H2' , 'HXT' , 'N' , 'HG23' , 'O' , 'CB' , 'CG2' , 'OG1' , 'HB' , 'C' ,
105+ 'HA' , 'CA' , 'HG22' , 'H' , 'HG1' , 'HG21' , 'OXT' },
106+ 'TRP' : {'HB2' , 'CG' , 'CE3' , 'CZ3' , 'HE1' , 'H' , 'H2' , 'N' , 'HE3' , 'CA' ,
107+ 'CZ2' , 'HD1' , 'HB3' , 'HXT' , 'O' , 'HZ3' , 'C' , 'CD2' , 'CD1' , 'NE1' ,
108+ 'CB' , 'HH2' , 'CE2' , 'HA' , 'CH2' , 'HZ2' , 'OXT' },
109+ 'U' : {"C4'" , "C2'" , 'C2' , 'O2' , 'H5' , 'O4' , "C1'" , 'OP2' , 'N3' , 'C5' , 'P' ,
110+ "H5''" , 'H6' , "C5'" , "H3'" , 'C4' , 'N1' , "H1'" , 'HOP3' , 'H3' , 'OP1' ,
111+ "O4'" , "H2'" , "HO2'" , 'OP3' , "O3'" , 'HOP2' , "O5'" , "O2'" , "HO3'" ,
112+ "H5'" , "C3'" , 'C6' , "H4'" },
113+ 'VAL' : {'CG1' , 'H' , 'H2' , 'N' , 'CA' , 'HG13' , 'HXT' , 'O' , 'HB' , 'C' ,
114+ 'HG23' , 'HG22' , 'HG21' , 'HG12' , 'CB' , 'CG2' , 'HA' , 'OXT' , 'HG11' }
115+ }
116+
117+
118+ def add_ihm_info (s , fix_histidines , check_atom_names ):
35119 # Non-standard histidine names (protonation states)
36120 histidines = frozenset (('HIP' , 'HID' , 'HIE' ))
37121
@@ -63,6 +147,8 @@ def add_ihm_info(s, fix_histidines):
63147 _get_not_modeled_residues (model ))
64148 if fix_histidines :
65149 _fix_histidine_het_atoms (model , histidines )
150+ if check_atom_names != 'no' :
151+ _check_atom_names (model )
66152 if fix_histidines :
67153 _fix_histidine_chem_comps (s , histidines )
68154 return s
@@ -79,6 +165,34 @@ def _fix_histidine_het_atoms(model, histidines):
79165 atom .het = False
80166
81167
def _get_non_canon(seen_atom_names):
    """Yield (residue type, atom names) for each residue type that uses
       atom names not listed for it in KNOWN_ATOM_NAMES.

       `seen_atom_names` maps a residue type to the set of atom names
       encountered for that type. Residue types that are not keys of
       KNOWN_ATOM_NAMES are skipped, as are types whose seen names are
       all canonical."""
    for restyp, seen in seen_atom_names.items():
        # todo: if restyp not known, query Ligand Expo and parse the
        # resulting mmCIF
        if restyp not in KNOWN_ATOM_NAMES:
            continue
        unexpected = seen - KNOWN_ATOM_NAMES[restyp]
        if unexpected:
            yield restyp, unexpected
177+
178+
def _check_atom_names(model):
    """Verify that atoms in `model` only use standard atom names for
       known residue types.

       Raises ValueError, listing the offending atom names for each
       residue type (sorted by residue type), if any are found."""
    names_by_restyp = collections.defaultdict(set)
    for atom in model._atoms:
        # Atoms with no seq_id are looked up at the first sequence position
        index = 0 if atom.seq_id is None else atom.seq_id - 1
        comp = atom.asym_unit.sequence[index]
        names_by_restyp[comp.id].add(atom.atom_id)

    offenders = list(_get_non_canon(names_by_restyp))
    offenders.sort(key=operator.itemgetter(0))
    if not offenders:
        return

    report = "; ".join("%s: %r" % (restyp, sorted(atoms))
                       for (restyp, atoms) in offenders)
    raise ValueError(
        "Non-canonical atom names found in the following residues: "
        + report)
194+
195+
82196def _fix_histidine_chem_comps (s , histidines ):
83197 """Change any non-standard histidine chemical components to normal HIS"""
84198 his = ihm .LPeptideAlphabet ()['H' ]
@@ -130,15 +244,15 @@ def _get_not_modeled_residues(model):
130244 yield ihm .model .NotModeledResidueRange (asym , r [0 ], r [1 ])
131245
132246
def add_ihm_info_one_system(fname, fix_histidines, check_atom_names='no'):
    """Read mmCIF file `fname`, which must contain a single System, and
       return it with any missing IHM data added.

       :param fname: Path of the mmCIF file to read.
       :param fix_histidines: Passed through to add_ihm_info().
       :param check_atom_names: Passed through to add_ihm_info(). The
              default, 'no', disables the atom name check, so callers
              that still use the older two-argument form keep working.
       :raises ValueError: if the file does not contain exactly one
              data block.
    """
    with open(fname) as fh:
        systems = ihm.reader.read(fh)
    if len(systems) != 1:
        raise ValueError("mmCIF file %s must contain exactly 1 data block "
                         "(%d found)" % (fname, len(systems)))
    return add_ihm_info(systems[0], fix_histidines, check_atom_names)
142256
143257
144258def combine (s , other_s ):
@@ -256,6 +370,11 @@ def get_args():
256370 p .add_argument ("--histidines" , action = 'store_true' , dest = "fix_histidines" ,
257371 help = "Convert any non-standard histidine names (HIP, HID, "
258372 "HIE, for different protonation states) to HIS" )
373+ p .add_argument ('--check_atom_names' , choices = ['no' , 'standard' ],
374+ dest = "check_atom_names" , default = 'no' ,
375+ help = "If 'standard', check for non-canonical atom names "
376+ "in standard amino acid and nucleic acid chemical "
377+ "components" )
259378 return p .parse_args ()
260379
261380
@@ -267,9 +386,11 @@ def main():
267386 raise ValueError ("Input and output are the same file" )
268387
269388 if args .add :
270- s = add_ihm_info_one_system (args .input , args .fix_histidines )
389+ s = add_ihm_info_one_system (args .input , args .fix_histidines ,
390+ args .check_atom_names )
271391 for other in args .add :
272- other_s = add_ihm_info_one_system (other , args .fix_histidines )
392+ other_s = add_ihm_info_one_system (other , args .fix_histidines ,
393+ args .check_atom_names )
273394 combine (s , other_s )
274395 with open (args .output , 'w' ) as fhout :
275396 ihm .dumper .write (
@@ -279,7 +400,8 @@ def main():
279400 with open (args .input ) as fh :
280401 with open (args .output , 'w' ) as fhout :
281402 ihm .dumper .write (
282- fhout , [add_ihm_info (s , args .fix_histidines )
403+ fhout , [add_ihm_info (s , args .fix_histidines ,
404+ args .check_atom_names )
283405 for s in ihm .reader .read (fh )],
284406 variant = ihm .dumper .IgnoreVariant (['_audit_conform' ]))
285407
0 commit comments