Skip to content

Commit 92d238c

Browse files
committed
Fill in missing chem_comp data in make_mmcif
Add a new --fix_chem_comp flag to make_mmcif which, if specified, will fill in any missing information in the chem_comp table (type, name, formula) by querying CCD. Closes #179.
1 parent 6a91027 commit 92d238c

File tree

4 files changed

+112
-13
lines changed

4 files changed

+112
-13
lines changed

ihm/util/make_mmcif.py

Lines changed: 59 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
import collections
3535
import operator
3636
import warnings
37+
import inspect
3738

3839

3940
# All canonical atom names for each standard residue type, as per CCD.
@@ -118,7 +119,7 @@
118119
}
119120

120121

121-
def add_ihm_info(s, fix_histidines, check_atom_names):
122+
def add_ihm_info(s, fix_histidines, check_atom_names, fix_chem_comp):
122123
# Non-standard histidine names (protonation states)
123124
histidines = frozenset(('HIP', 'HID', 'HIE'))
124125

@@ -155,6 +156,8 @@ def add_ihm_info(s, fix_histidines, check_atom_names):
155156
if fix_histidines:
156157
_fix_histidine_chem_comps(s, histidines)
157158
_fix_empty_assemblies(s)
159+
if fix_chem_comp:
160+
_fix_incomplete_chem_comps(s)
158161
return s
159162

160163

@@ -180,20 +183,25 @@ def __call__(self, comp_id, atom_id):
180183
self.atoms[comp_id].add(atom_id)
181184

182185

183-
def _get_non_std_restyp(restyp):
184-
"""Return CCD info for the given residue type"""
186+
def _read_ccd(restyp, category_handler):
    """Download the CCD entry for `restyp` and run it through a CIF reader.

    `category_handler` is handed unchanged to ihm.format.CifReader, so the
    outcome of the read is whatever the supplied handler object(s) record.
    Nothing is returned. If the download raises urllib.error.URLError,
    a warning is issued instead of propagating the exception.
    """
    server = 'https://files.rcsb.org'
    path_template = '/pub/pdb/refdata/chem_comp/%s/%s/%s.cif'
    # The directory component of the URL is the final character of the
    # residue type, followed by the residue type itself
    url = server + path_template % (restyp[-1], restyp, restyp)
    try:
        with urllib.request.urlopen(url) as fh:
            reader = ihm.format.CifReader(
                fh, category_handler=category_handler)
            reader.read_file()
    except urllib.error.URLError as exc:
        warnings.warn(
            "Component %s could not be found in CCD: %s" % (restyp, exc))
199+
200+
201+
def _get_non_std_restyp(restyp):
    """Look up the atoms of residue type `restyp` in CCD.

    The entry's _chem_comp_atom category is read into a fresh
    _ChemCompAtomHandler, and that handler's `atoms` attribute is returned.
    """
    handler = _ChemCompAtomHandler()
    _read_ccd(restyp, category_handler={'_chem_comp_atom': handler})
    return handler.atoms
198206

199207

@@ -290,15 +298,48 @@ def _fix_empty_assemblies(s):
290298
s.complete_assembly.description = asmb.description
291299

292300

293-
def add_ihm_info_one_system(fname, fix_histidines, check_atom_names):
301+
class _ChemCompHandler:
302+
"""Read the _chem_comp table from a CCD entry"""
303+
not_in_file = omitted = unknown = None
304+
305+
def __call__(self, name, type, formula):
306+
self.name, self.type, self.formula = name, type, formula
307+
308+
309+
def _fix_incomplete_chem_comps(s):
    """Use CCD to complete every under-specified ChemComp in `s`.

    A ChemComp in s._orphan_chem_comps is treated as incomplete if its
    type is 'other' or if either its name or formula is None; each such
    component is passed to _fix_chem_comp().
    """
    # Build a lookup from lowercased type string to ChemComp subclass.
    # WaterChemComp is left out so that its type resolves to a different
    # class (per the original note: nonpolymer -> NonPolyChemComp).
    typmap = {}
    for _, cls in inspect.getmembers(ihm, inspect.isclass):
        if cls is ihm.WaterChemComp or not issubclass(cls, ihm.ChemComp):
            continue
        typmap[cls.type.lower()] = cls

    for cc in s._orphan_chem_comps:
        incomplete = (cc.type == 'other' or cc.name is None
                      or cc.formula is None)
        if incomplete:
            _fix_chem_comp(cc, typmap)
320+
321+
322+
def _fix_chem_comp(cc, typmap):
    """Add missing information to a single ChemComp from CCD.

    The CCD entry named by the uppercased `cc.id` is read. If it yields a
    name, `cc.name` is set to it; `cc.formula` and the class of `cc` are
    updated too, but only from values that CCD actually provides.
    `typmap` maps lowercased CCD type strings to ChemComp subclasses; a
    type not in the map falls back to ihm.ChemComp. `cc` is left untouched
    if no name could be read.
    """
    h = _ChemCompHandler()
    _read_ccd(cc.id.upper(), {'_chem_comp': h})
    # The handler may never have been called (e.g. failed download), in
    # which case it might not have these attributes at all
    name = getattr(h, 'name', None)
    if name is None:
        return
    cc.name = name

    # Don't overwrite an existing formula with a missing value
    formula = getattr(h, 'formula', None)
    if formula is not None:
        cc.formula = formula

    # A missing type would otherwise fail with AttributeError on .lower();
    # keep the current class in that case
    comp_type = getattr(h, 'type', None)
    if comp_type is not None:
        cc.__class__ = typmap.get(comp_type.lower(), ihm.ChemComp)
330+
331+
332+
def add_ihm_info_one_system(fname, fix_histidines, check_atom_names,
                            fix_chem_comp):
    """Read the single System in mmCIF file `fname` and add IHM data to it.

    The remaining arguments are forwarded unchanged to add_ihm_info(),
    whose result is returned. ValueError is raised if the file does not
    contain exactly one data block.
    """
    with open(fname) as fh:
        systems = ihm.reader.read(fh)
    num_systems = len(systems)
    if num_systems != 1:
        raise ValueError("mmCIF file %s must contain exactly 1 data block "
                         "(%d found)" % (fname, num_systems))
    only_system = systems[0]
    return add_ihm_info(only_system, fix_histidines, check_atom_names,
                        fix_chem_comp)
302343

303344

304345
def combine(s, other_s):
@@ -422,6 +463,10 @@ def get_args():
422463
"in standard amino acid and nucleic acid chemical "
423464
"components; if 'all', also check non-standard "
424465
"residue types by querying CCD (needs network access)")
466+
p.add_argument("--fix_chem_comp", action='store_true',
467+
dest="fix_chem_comp",
468+
help="Add any missing data to the chem_comp table by "
469+
"querying CCD (needs network access)")
425470
return p.parse_args()
426471

427472

@@ -434,10 +479,12 @@ def main():
434479

435480
if args.add:
436481
s = add_ihm_info_one_system(args.input, args.fix_histidines,
437-
args.check_atom_names)
482+
args.check_atom_names,
483+
args.fix_chem_comp)
438484
for other in args.add:
439485
other_s = add_ihm_info_one_system(other, args.fix_histidines,
440-
args.check_atom_names)
486+
args.check_atom_names,
487+
args.fix_chem_comp)
441488
combine(s, other_s)
442489
with open(args.output, 'w') as fhout:
443490
ihm.dumper.write(
@@ -448,7 +495,8 @@ def main():
448495
with open(args.output, 'w') as fhout:
449496
ihm.dumper.write(
450497
fhout, [add_ihm_info(s, args.fix_histidines,
451-
args.check_atom_names)
498+
args.check_atom_names,
499+
args.fix_chem_comp)
452500
for s in ihm.reader.read(fh)],
453501
variant=ihm.dumper.IgnoreVariant(['_audit_conform']))
454502

test/input/missing_chem_comp.cif

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
loop_
2+
_chem_comp.id
3+
_chem_comp.type
4+
_chem_comp.name
5+
_chem_comp.formula
6+
_chem_comp.formula_weight
7+
ALA 'L-peptide linking' ALANINE 'C3 H7 N O2' 89.094
8+
MG . . . .
9+
ZN . . . .
10+
invalid-chem-comp . . . .

test/mock/non_canon_atom/urllib/request.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,16 @@
33

44
zinc_atoms = b'_chem_comp_atom.comp_id ZN\n_chem_comp_atom.atom_id ZN\n'
55

6+
mg = b"""
7+
_chem_comp.id MG
8+
_chem_comp.name "MAGNESIUM ION"
9+
_chem_comp.type NON-POLYMER
10+
_chem_comp.pdbx_type HETAI
11+
_chem_comp.formula Mg
12+
"""
13+
614

715
def urlopen(url):
    """Mock replacement for urlopen, serving canned CCD data.

    URLs containing 'invalid' or 'INVALID' raise HTTPError; a URL ending
    in 'ZN.cif' gets the zinc atom data; anything else gets the magnesium
    chem_comp data.
    """
    if any(marker in url for marker in ('invalid', 'INVALID')):
        raise urllib.error.HTTPError("404")
    payload = zinc_atoms if url.endswith('ZN.cif') else mg
    return BytesIO(payload)

test/test_make_mmcif.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -352,6 +352,39 @@ def test_empty_assembly_no_description(self):
352352
os.unlink(incif_new)
353353
os.unlink('output.cif')
354354

355+
def test_missing_chem_comp(self):
    """Test fix of incomplete chem_comp table"""
    incif = utils.get_input_file_name(TOPDIR, 'missing_chem_comp.cif')

    # Use mock urllib so we don't hit the network during this test
    env = os.environ.copy()
    mockdir = os.path.join(TOPDIR, 'test', 'mock', 'non_canon_atom')
    # PYTHONPATH may not be set at all, so don't index the environment
    # directly; also avoid adding an empty path component in that case
    old_pythonpath = env.get('PYTHONPATH')
    if old_pythonpath:
        env['PYTHONPATH'] = mockdir + os.pathsep + old_pythonpath
    else:
        env['PYTHONPATH'] = mockdir

    r = subprocess.Popen([sys.executable, MAKE_MMCIF,
                          "--fix_chem_comp", incif],
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE,
                         universal_newlines=True, env=env)
    out, err = r.communicate()
    self.assertEqual(r.returncode, 0)
    # Residues not in CCD should give a warning
    self.assertIn("Component INVALID-CHEM-COMP could not be found in CCD",
                  err)
    # ALA should be left unchanged (already present in the input file);
    # MG should be filled in with CCD info;
    # ZN should be left unchanged (no chem_comp table in our mock CCD)
    with open('output.cif') as fh:
        contents = fh.readlines()
    ind = contents.index('_chem_comp.formula_weight\n')
    self.assertEqual(
        contents[ind + 1:ind + 5],
        ["ALA 'L-peptide linking' ALANINE 'C3 H7 N O2' 89.094\n",
         "MG non-polymer 'MAGNESIUM ION' Mg 24.305\n",
         'ZN other . . .\n',
         'invalid-chem-comp other . . .\n'])
    os.unlink('output.cif')
387+
355388

356389
if __name__ == '__main__':
357390
unittest.main()

0 commit comments

Comments
 (0)