2727import ihm .model
2828import ihm .protocol
2929import ihm .util
30+ import ihm .format
31+ import urllib .request
3032import os
3133import argparse
3234import collections
3335import operator
36+ import warnings
3437
3538
3639# All canonical atom names for each standard residue type, as per CCD.
@@ -148,7 +151,7 @@ def add_ihm_info(s, fix_histidines, check_atom_names):
148151 if fix_histidines :
149152 _fix_histidine_het_atoms (model , histidines )
150153 if check_atom_names != 'no' :
151- _check_atom_names (model )
154+ _check_atom_names (model , check_atom_names == 'all' )
152155 if fix_histidines :
153156 _fix_histidine_chem_comps (s , histidines )
154157 return s
@@ -165,26 +168,53 @@ def _fix_histidine_het_atoms(model, histidines):
165168 atom .het = False
166169
167170
168- def _get_non_canon (seen_atom_names ):
171+ class _ChemCompAtomHandler :
172+ not_in_file = omitted = unknown = None
173+
174+ def __init__ (self ):
175+ super ().__init__ ()
176+ self .atoms = collections .defaultdict (set )
177+
178+ def __call__ (self , comp_id , atom_id ):
179+ self .atoms [comp_id ].add (atom_id )
180+
181+
def _get_non_std_restyp(restyp):
    """Return CCD info for the given residue type.

    The component's mmCIF file is downloaded from Ligand Expo and the
    atoms listed in its ``_chem_comp_atom`` category are collected.

    :param str restyp: ID of the chemical component to look up.
    :return: a dict-like mapping of component ID to the set of atom names
             read from the downloaded file. It is empty if the URL could
             not be opened, in which case a warning is issued rather
             than an exception raised.
    """
    # Import the exception explicitly rather than relying on
    # urllib.request having pulled in urllib.error as a side effect.
    from urllib.error import URLError

    url_pattern = 'http://ligand-expo.rcsb.org/reports/%s/%s/%s.cif'
    # Files are grouped into directories named for the first character
    # of the component ID
    url = url_pattern % (restyp[:1], restyp, restyp)
    cca = _ChemCompAtomHandler()
    try:
        with urllib.request.urlopen(url) as fh:
            c = ihm.format.CifReader(
                fh, category_handler={'_chem_comp_atom': cca})
            c.read_file()
    except URLError as exc:
        # Network problems and HTTP errors (HTTPError is a URLError
        # subclass) are reported but not fatal; the caller then simply
        # gets no atom names for this component.
        warnings.warn(
            "Component %s could not be found in CCD: %s" % (restyp, exc))
    return cca.atoms
196+
197+
def _get_non_canon(seen_atom_names, check_all):
    """Get all non-canonical atom names for each residue type.

    :param dict seen_atom_names: mapping of residue type to the set of
           atom names used for it.
    :param bool check_all: if True, residue types that are not yet in
           KNOWN_ATOM_NAMES are looked up with _get_non_std_restyp() and
           whatever is found is added to KNOWN_ATOM_NAMES.
    :return: a generator of (residue type, set of non-canonical atom
             names) pairs, one for each known residue type that uses at
             least one name not in KNOWN_ATOM_NAMES.
    """
    for restyp, atoms in seen_atom_names.items():
        if check_all and restyp not in KNOWN_ATOM_NAMES:
            # The lookup returns nothing on failure, so restyp may still
            # be unknown after this; it is then skipped below.
            KNOWN_ATOM_NAMES.update(_get_non_std_restyp(restyp))
        if restyp in KNOWN_ATOM_NAMES:
            non_canon_atoms = atoms - KNOWN_ATOM_NAMES[restyp]
            if non_canon_atoms:
                yield restyp, non_canon_atoms
177207
178208
179- def _check_atom_names (model ):
209+ def _check_atom_names (model , check_all ):
180210 """Check that only standard atom names are used for known
181211 residue types"""
182212 seen_atom_names = collections .defaultdict (set )
183213 for atom in model ._atoms :
184214 seq_id = 1 if atom .seq_id is None else atom .seq_id
185215 comp = atom .asym_unit .sequence [seq_id - 1 ]
186216 seen_atom_names [comp .id ].add (atom .atom_id )
187- non_canon = sorted (_get_non_canon (seen_atom_names ),
217+ non_canon = sorted (_get_non_canon (seen_atom_names , check_all ),
188218 key = operator .itemgetter (0 ))
189219 if non_canon :
190220 raise ValueError (
@@ -370,11 +400,12 @@ def get_args():
370400 p .add_argument ("--histidines" , action = 'store_true' , dest = "fix_histidines" ,
371401 help = "Convert any non-standard histidine names (HIP, HID, "
372402 "HIE, for different protonation states) to HIS" )
373- p .add_argument ('--check_atom_names' , choices = ['no' , 'standard' ],
403+ p .add_argument ('--check_atom_names' , choices = ['no' , 'standard' , 'all' ],
374404 dest = "check_atom_names" , default = 'no' ,
375405 help = "If 'standard', check for non-canonical atom names "
376406 "in standard amino acid and nucleic acid chemical "
377- "components" )
407+ "components; if 'all', also check non-standard "
408+ "residue types by querying CCD (needs network access)" )
378409 return p .parse_args ()
379410
380411
0 commit comments