Skip to content

Commit 4874f6a

Browse files
committed
Add option to query CCD for non-standard residues
1 parent 19f192a commit 4874f6a

File tree

6 files changed

+110
-17
lines changed

6 files changed

+110
-17
lines changed

ihm/util/make_mmcif.py

Lines changed: 39 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -27,10 +27,13 @@
2727
import ihm.model
2828
import ihm.protocol
2929
import ihm.util
30+
import ihm.format
31+
import urllib.request
3032
import os
3133
import argparse
3234
import collections
3335
import operator
36+
import warnings
3437

3538

3639
# All canonical atom names for each standard residue type, as per CCD.
@@ -148,7 +151,7 @@ def add_ihm_info(s, fix_histidines, check_atom_names):
148151
if fix_histidines:
149152
_fix_histidine_het_atoms(model, histidines)
150153
if check_atom_names != 'no':
151-
_check_atom_names(model)
154+
_check_atom_names(model, check_atom_names == 'all')
152155
if fix_histidines:
153156
_fix_histidine_chem_comps(s, histidines)
154157
return s
@@ -165,26 +168,53 @@ def _fix_histidine_het_atoms(model, histidines):
165168
atom.het = False
166169

167170

168-
def _get_non_canon(seen_atom_names):
171+
class _ChemCompAtomHandler:
172+
not_in_file = omitted = unknown = None
173+
174+
def __init__(self):
175+
super().__init__()
176+
self.atoms = collections.defaultdict(set)
177+
178+
def __call__(self, comp_id, atom_id):
179+
self.atoms[comp_id].add(atom_id)
180+
181+
182+
def _get_non_std_restyp(restyp):
    """Return CCD info for the given residue type.

    The component's mmCIF file is fetched from Ligand Expo and its
    ``_chem_comp_atom`` category is read.

    :param str restyp: ID of the chemical component to look up.
    :return: mapping of component ID to the set of atom names read from
             the fetched file; empty if nothing could be read.

    A failed lookup is reported with :func:`warnings.warn` rather than
    raised, so the caller can carry on with the remaining components.
    """
    # Imported explicitly: the handler below needs URLError, which is
    # otherwise only reachable as a side effect of importing urllib.request.
    from urllib.error import URLError

    url_pattern = 'http://ligand-expo.rcsb.org/reports/%s/%s/%s.cif'
    # The first path component is the first character of the ID
    url = url_pattern % (restyp[:1], restyp, restyp)
    cca = _ChemCompAtomHandler()
    try:
        with urllib.request.urlopen(url) as fh:
            c = ihm.format.CifReader(fh,
                                     category_handler={'_chem_comp_atom': cca})
            c.read_file()
    except URLError as exc:
        warnings.warn(
            "Component %s could not be found in CCD: %s" % (restyp, exc))
    return cca.atoms
196+
197+
198+
def _get_non_canon(seen_atom_names, check_all):
    """Yield (residue type, unexpected atom names) for each residue type
       that uses atom names outside its known set.

       If `check_all` is true, residue types missing from KNOWN_ATOM_NAMES
       are first looked up with _get_non_std_restyp and whatever is found
       is added to KNOWN_ATOM_NAMES. Residue types that are still unknown
       afterwards are not checked."""
    for comp_id, seen in seen_atom_names.items():
        if check_all and comp_id not in KNOWN_ATOM_NAMES:
            KNOWN_ATOM_NAMES.update(_get_non_std_restyp(comp_id))
        if comp_id not in KNOWN_ATOM_NAMES:
            # Nothing to compare against
            continue
        unexpected = seen - KNOWN_ATOM_NAMES[comp_id]
        if unexpected:
            yield comp_id, unexpected
177207

178208

179-
def _check_atom_names(model):
209+
def _check_atom_names(model, check_all):
180210
"""Check that only standard atom names are used for known
181211
residue types"""
182212
seen_atom_names = collections.defaultdict(set)
183213
for atom in model._atoms:
184214
seq_id = 1 if atom.seq_id is None else atom.seq_id
185215
comp = atom.asym_unit.sequence[seq_id - 1]
186216
seen_atom_names[comp.id].add(atom.atom_id)
187-
non_canon = sorted(_get_non_canon(seen_atom_names),
217+
non_canon = sorted(_get_non_canon(seen_atom_names, check_all),
188218
key=operator.itemgetter(0))
189219
if non_canon:
190220
raise ValueError(
@@ -370,11 +400,12 @@ def get_args():
370400
p.add_argument("--histidines", action='store_true', dest="fix_histidines",
371401
help="Convert any non-standard histidine names (HIP, HID, "
372402
"HIE, for different protonation states) to HIS")
373-
p.add_argument('--check_atom_names', choices=['no', 'standard'],
403+
p.add_argument('--check_atom_names', choices=['no', 'standard', 'all'],
374404
dest="check_atom_names", default='no',
375405
help="If 'standard', check for non-canonical atom names "
376406
"in standard amino acid and nucleic acid chemical "
377-
"components")
407+
"components; if 'all', also check non-standard "
408+
"residue types by querying CCD (needs network access)")
378409
return p.parse_args()
379410

380411

test/input/non_canon_atom.cif

Lines changed: 24 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,20 @@ _exptl.method 'model, MODELLER Version 9.24 2020/08/21 11:54:31'
55
_modeller.version 9.24
66
#
77
loop_
8+
_entity.id
9+
_entity.type
10+
1 polymer
11+
2 non-polymer
12+
3 non-polymer
13+
#
14+
loop_
815
_struct_asym.id
916
_struct_asym.entity_id
1017
_struct_asym.details
1118
A 1 ?
19+
B 2 ?
20+
C 2 ?
21+
D 3 ?
1222
#
1323
loop_
1424
_entity_poly_seq.entity_id
@@ -17,7 +27,13 @@ _entity_poly_seq.mon_id
1727
1 1 VAL
1828
1 2 GLY
1929
1 3 GLN
20-
1 4 MSE
30+
#
31+
loop_
32+
_pdbx_entity_nonpoly.entity_id
33+
_pdbx_entity_nonpoly.name
34+
_pdbx_entity_nonpoly.comp_id
35+
2 ? ZN
36+
3 ? invalid-comp-name
2137
#
2238
loop_
2339
_atom_site.group_PDB
@@ -38,8 +54,10 @@ _atom_site.B_iso_or_equiv
3854
_atom_site.label_entity_id
3955
_atom_site.id
4056
_atom_site.pdbx_PDB_model_num
41-
ATOM C bad1 . VAL A A 1 2 ? 114.370 27.980 -26.088 1.000 143.490 1 1 1
42-
ATOM C bad2 . VAL A A 1 2 ? 114.370 27.980 -26.088 1.000 143.490 1 2 1
43-
ATOM C CA . GLY A A 2 3 ? 111.506 26.368 -28.075 1.000 137.530 1 3 1
44-
ATOM C bad3 . GLN A A 3 4 ? 113.468 23.113 -28.639 1.000 128.420 1 4 1
45-
ATOM C ig1 . MSE A A 4 5 ? 113.808 21.534 -32.168 1.000 117.620 1 5 1
57+
ATOM C bad1 . VAL A A 1 2 ? 114.370 27.980 -26.088 1.000 143.490 1 1 1
58+
ATOM C bad2 . VAL A A 1 2 ? 114.370 27.980 -26.088 1.000 143.490 1 2 1
59+
ATOM C CA . GLY A A 2 3 ? 111.506 26.368 -28.075 1.000 137.530 1 3 1
60+
ATOM C bad3 . GLN A A 3 4 ? 113.468 23.113 -28.639 1.000 128.420 1 4 1
61+
HETATM ZN ZN . ZN B A . 5 ? 113.808 21.534 -32.168 1.000 117.620 2 5 1
62+
HETATM ZN bad4 . ZN C A . 6 ? 113.808 21.534 -32.168 1.000 117.620 2 6 1
63+
HETATM ZN ZN . invalid-comp-name D A . 7 ? 113.808 21.534 -32.168 3.000 117.620 3 7 1
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# noop
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
class URLError(OSError):
    """Mock of urllib.error.URLError.

    Derives from OSError, like the real class, so that code catching
    OSError behaves the same with the mock as with the standard library.
    """


class HTTPError(URLError):
    """Mock of urllib.error.HTTPError; constructible from a single message."""
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
from io import BytesIO
import urllib.error

# Canned mmCIF content listing the single atom of the ZN component
zinc_atoms = b'_chem_comp_atom.comp_id ZN\n_chem_comp_atom.atom_id ZN\n'


def urlopen(url, *args, **kwargs):
    """Stand-in for urllib.request.urlopen that never touches the network.

    Any URL containing 'invalid' raises HTTPError; every other URL yields
    the canned zinc component data as a binary file-like object. The
    optional arguments of the real urlopen (data, timeout, context) are
    accepted and ignored so that callers passing them still work.
    """
    if 'invalid' in url:
        raise urllib.error.HTTPError("404")
    return BytesIO(zinc_atoms)

test/test_make_mmcif.py

Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -257,8 +257,8 @@ def test_histidine(self):
257257
[False, False, False, False, True])
258258
os.unlink('output.cif')
259259

260-
def test_check_non_canon_atom(self):
261-
"""Test check for non-canonical atom names"""
260+
def test_check_non_canon_atom_standard(self):
261+
"""Test check for non-canonical atom names, standard restypes"""
262262
incif = utils.get_input_file_name(TOPDIR, 'non_canon_atom.cif')
263263

264264
# Should work fine without check
@@ -273,10 +273,37 @@ def test_check_non_canon_atom(self):
273273
out, err = r.communicate()
274274
self.assertEqual(r.returncode, 1)
275275
# Non-canonical atoms in standard residues should be reported
276-
# Non-standard residues (MSE) are not checked
276+
# Non-standard residues (ZN, ...) are not checked
277277
self.assertIn("Non-canonical atom names found in the following "
278278
"residues: GLN: ['bad3']; VAL: ['bad1', 'bad2']",
279279
err)
280+
os.unlink('output.cif')
281+
282+
def test_check_non_canon_atom_all(self):
283+
"""Test check for non-canonical atom names, all restypes"""
284+
incif = utils.get_input_file_name(TOPDIR, 'non_canon_atom.cif')
285+
286+
# Use mock urllib so we don't hit the network during this test
287+
env = os.environ.copy()
288+
mockdir = os.path.join(TOPDIR, 'test', 'mock', 'non_canon_atom')
289+
env['PYTHONPATH'] = mockdir + os.pathsep + env['PYTHONPATH']
290+
291+
r = subprocess.Popen([sys.executable, MAKE_MMCIF,
292+
"--check_atom_names=all", incif],
293+
stdout=subprocess.PIPE,
294+
stderr=subprocess.PIPE,
295+
universal_newlines=True, env=env)
296+
out, err = r.communicate()
297+
self.assertEqual(r.returncode, 1)
298+
# Non-canonical atoms in standard residues should be reported
299+
# Non-standard residue (ZN) should also be checked
300+
self.assertIn("Non-canonical atom names found in the following "
301+
"residues: GLN: ['bad3']; VAL: ['bad1', 'bad2']; "
302+
"ZN: ['bad4']", err)
303+
# Residues not in CCD should give a warning
304+
self.assertIn("Component invalid-comp-name could not be found in CCD",
305+
err)
306+
os.unlink('output.cif')
280307

281308

282309
if __name__ == '__main__':

0 commit comments

Comments
 (0)