From 8723ca38b49a1bdb9707bca29909a017b8393383 Mon Sep 17 00:00:00 2001 From: Alexander Rose <alexander.rose@weirdbyte.de> Date: Mon, 7 Mar 2022 21:31:33 -0800 Subject: [PATCH] improve saccharide detection --- CHANGELOG.md | 1 + README.md | 3 + src/cli/chem-comp-dict/create-saccharides.ts | 77 +++++++++++++++++++ .../structure/common/component.ts | 5 +- .../structure/model/types/saccharides.ts | 9 +++ .../structure/carbohydrates/constants.ts | 16 ++-- 6 files changed, 99 insertions(+), 12 deletions(-) create mode 100644 src/cli/chem-comp-dict/create-saccharides.ts create mode 100644 src/mol-model/structure/model/types/saccharides.ts diff --git a/CHANGELOG.md b/CHANGELOG.md index cf1cf6f6e..f9b898343 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ Note that since we don't clearly distinguish between a public and private interf ## [Unreleased] - Fix handling of mmcif with empty ``label_*`` fields +- Improve saccharide detection (compare against list from CCD) ## [v3.3.1] - 2022-02-27 diff --git a/README.md b/README.md index cf7b7e6d7..8a5f27cc8 100644 --- a/README.md +++ b/README.md @@ -120,6 +120,9 @@ and navigate to `build/viewer` node --max-old-space-size=4096 lib/commonjs/cli/chem-comp-dict/create-ions.js src/mol-model/structure/model/types/ions.ts +**Saccharide names** + + node --max-old-space-size=4096 lib/commonjs/cli/chem-comp-dict/create-saccharides.js src/mol-model/structure/model/types/saccharides.ts **GraphQL schemas** diff --git a/src/cli/chem-comp-dict/create-saccharides.ts b/src/cli/chem-comp-dict/create-saccharides.ts new file mode 100644 index 000000000..5850522d0 --- /dev/null +++ b/src/cli/chem-comp-dict/create-saccharides.ts @@ -0,0 +1,77 @@ +#!/usr/bin/env node +/** + * Copyright (c) 2022 mol* contributors, licensed under MIT, See LICENSE file for more info. 
+ * + * @author Alexander Rose <alexander.rose@weirdbyte.de> + */ + +import * as argparse from 'argparse'; +import * as path from 'path'; +import util from 'util'; +import fs from 'fs'; +require('util.promisify').shim(); +const writeFile = util.promisify(fs.writeFile); + +import { DatabaseCollection } from '../../mol-data/db'; +import { CCD_Schema } from '../../mol-io/reader/cif/schema/ccd'; +import { ensureDataAvailable, readCCD } from './util'; + +function extractSaccharideNames(ccd: DatabaseCollection<CCD_Schema>) { + const saccharideNames: string[] = []; + for (const k in ccd) { + const { chem_comp } = ccd[k]; + const type = chem_comp.type.value(0).toUpperCase(); + if (type.includes('SACCHARIDE')) { + saccharideNames.push(chem_comp.id.value(0)); + } + } + // these are extra saccharides that don't have SACCHARIDE in their type + saccharideNames.push( + 'UMQ', // UNDECYL-MALTOSIDE, via GlyFinder + 'SQD', // SULFOQUINOVOSYLDIACYLGLYCEROL, via GlyFinder + ); + return saccharideNames; +} + +/** Writes the generated SaccharideNames module to filePath; the returned promise settles when the write finishes. */ +function writeSaccharideNamesFile(filePath: string, saccharideNames: string[]) { + const output = `/** + * Copyright (c) 2022 mol* contributors, licensed under MIT, See LICENSE file for more info. + * + * Code-generated saccharide names params file. Names extracted from CCD components. + * + * @author molstar/cli/chem-comp-dict/create-saccharides + */ + +export const SaccharideNames = new Set(${JSON.stringify(saccharideNames).replace(/"/g, "'").replace(/,/g, ', ')}); +`; + return writeFile(filePath, output); // hand the promise back so run() can await the write +} + +async function run(out: string, forceDownload = false) { + await ensureDataAvailable(forceDownload); + const ccd = await readCCD(); + const saccharideNames = extractSaccharideNames(ccd); + // recursive mkdir also creates missing parents and is a no-op when the directory already exists + fs.mkdirSync(path.dirname(out), { recursive: true }); + await writeSaccharideNamesFile(out, saccharideNames); // awaited so a failed write rejects run() instead of being dropped +} + +const parser = new argparse.ArgumentParser({ + add_help: true, + description: 'Extract and save SaccharideNames from CCD.' 
+}); +parser.add_argument('out', { + help: 'Generated file output path.' +}); +parser.add_argument('--forceDownload', '-f', { + action: 'store_true', + help: 'Force download of CCD and PVCD.' +}); +interface Args { + out: string, + forceDownload?: boolean, +} +const args: Args = parser.parse_args(); + +run(args.out, args.forceDownload).catch(e => { /* report the failure and exit non-zero instead of leaving an unhandled rejection */ console.error(e); process.exitCode = 1; }); diff --git a/src/mol-model-formats/structure/common/component.ts b/src/mol-model-formats/structure/common/component.ts index 11c524975..8cceda0b1 100644 --- a/src/mol-model-formats/structure/common/component.ts +++ b/src/mol-model-formats/structure/common/component.ts @@ -9,6 +9,7 @@ import { WaterNames, PolymerNames } from '../../../mol-model/structure/model/typ import { SetUtils } from '../../../mol-util/set'; import { BasicSchema } from '../basic/schema'; import { mmCIF_chemComp_schema } from '../../../mol-io/reader/cif/schema/mmcif-extras'; +import { SaccharideCompIdMap } from '../../../mol-model/structure/structure/carbohydrates/constants'; type Component = Table.Row<Pick<mmCIF_chemComp_schema, 'id' | 'name' | 'type'>> @@ -30,7 +31,7 @@ const DnaAtomIdsList = [ /** Used to reduce false positives for atom name-based type guessing */ const NonPolymerNames = new Set([ - 'FMN', 'NCN', 'FNS', 'FMA', 'ATP', 'ADP', 'AMP', 'GTP', 'GDP', 'GMP' // Mononucleotides + 'FMN', 'NCN', 'FNS', 'FMA', 'ATP', 'ADP', 'AMP', 'GTP', 'GDP', 'GMP', // Mononucleotides ]); const StandardComponents = (function () { @@ -158,6 +159,8 @@ export class ComponentBuilder { this.set({ id: compId, name: 'WATER', type: 'non-polymer' }); } else if (NonPolymerNames.has(compId.toUpperCase())) { this.set({ id: compId, name: this.namesMap.get(compId) || compId, type: 'non-polymer' }); + } else if (SaccharideCompIdMap.has(compId.toUpperCase())) { + this.set({ id: compId, name: this.namesMap.get(compId) || compId, type: 'saccharide' }); } else { const atomIds = this.getAtomIds(index); if (atomIds.size === 1 && CharmmIonComponents.has(compId)) { diff --git 
a/src/mol-model/structure/model/types/saccharides.ts b/src/mol-model/structure/model/types/saccharides.ts new file mode 100644 index 000000000..02f4856fd --- /dev/null +++ b/src/mol-model/structure/model/types/saccharides.ts @@ -0,0 +1,9 @@ +/** + * Copyright (c) 2022 mol* contributors, licensed under MIT, See LICENSE file for more info. + * + * Code-generated saccharide names params file. Names extracted from CCD components. + * + * @author molstar/cli/chem-comp-dict/create-saccharides + */ + +export const SaccharideNames = new Set(['145', '147', '149', '289', '291', '293', '445', '475', '491', '510', '604', '045', '05L', '07E', '07Y', '08U', '09X', '0AT', '0BD', '0H0', '0HX', '0LP', '0MK', '0NZ', '0TS', '0UB', '0V4', '0WK', '0XY', '0YT', '10M', '12E', '14T', '15L', '16F', '16G', '16O', '17T', '18D', '18O', '18T', '1AR', '1BW', '1CF', '1FT', '1GL', '1GN', '1JB', '1LL', '1NA', '1S3', '1S4', '1SD', '1X4', '20S', '20X', '22O', '22S', '23V', '24S', '25E', '26M', '26O', '26Q', '26R', '26V', '26W', '26Y', '27C', '2DG', '2DR', '2F8', '2FG', '2FL', '2FP', '2GL', '2GS', '2H5', '2HA', '2M4', '2M5', '2M8', '2OS', '2WP', '2WS', '32O', '34V', '38J', '3BU', '3CM', '3DO', '3DY', '3FM', '3GR', '3HD', '3J3', '3J4', '3LJ', '3LR', '3MF', '3MG', '3MK', '3R3', '3S6', '3SA', '3YW', '40J', '42D', '44S', '46D', '46M', '46Z', '48Z', '49A', '49S', '49T', '49V', '4AM', '4CQ', '4GC', '4GL', '4GP', '4JA', '4N2', '4NN', '4QY', '4R1', '4RS', '4SG', '4U0', '4U1', '4U2', '4UZ', '4V5', '50A', '51N', '56N', '57S', '5DI', '5GF', '5GO', '5II', '5KQ', '5KS', '5KT', '5KV', '5L2', '5L3', '5LS', '5LT', '5MM', '5N6', '5QP', '5RP', '5SA', '5SP', '5TH', '5TJ', '5TK', '5TM', '61J', '62I', '64K', '66O', '6BG', '6C2', '6DM', '6GB', '6GP', '6GR', '6K3', '6KH', '6KL', '6KS', '6KU', '6KW', '6LA', '6LS', '6LW', '6MJ', '6MN', '6PG', '6PY', '6PZ', '6S2', '6SA', '6UD', '6Y6', '6YR', '6ZC', '73E', '79J', '7CV', '7D1', '7GP', '7JZ', '7K2', '7K3', '7NU', '7SA', '83Y', '89Y', '8B7', '8B9', '8EX', '8GA', '8GG', '8GP', '8LM', 
'8LR', '8OQ', '8PK', '8S0', '8YV', '95Z', '96O', '9AM', '9C1', '9CD', '9GP', '9KJ', '9MR', '9OK', '9PG', '9QG', '9QZ', '9S7', '9SG', '9SJ', '9SM', '9SP', '9T1', '9T7', '9VP', '9WJ', '9WN', '9WZ', '9YW', 'A0K', 'A1Q', 'A2G', 'A5C', 'A6P', 'AAL', 'AAO', 'ABC', 'ABD', 'ABE', 'ABF', 'ABL', 'AC1', 'ACG', 'ACR', 'ACX', 'ADA', 'ADG', 'ADR', 'AF1', 'AFD', 'AFL', 'AFO', 'AFP', 'AFR', 'AGC', 'AGH', 'AGL', 'AGR', 'AH2', 'AH8', 'AHG', 'AHM', 'AHR', 'AIG', 'ALL', 'ALX', 'AMG', 'AMN', 'AMU', 'AMV', 'ANA', 'AOG', 'AOS', 'AQA', 'ARA', 'ARB', 'ARE', 'ARI', 'ARW', 'ASC', 'ASG', 'ASO', 'AXP', 'AXR', 'AY9', 'AZC', 'B0D', 'B16', 'B1H', 'B1N', 'B2G', 'B4G', 'B6D', 'B7G', 'B8D', 'B9D', 'BBK', 'BBV', 'BCD', 'BCW', 'BDF', 'BDG', 'BDP', 'BDR', 'BDZ', 'BEM', 'BFN', 'BFP', 'BG6', 'BG8', 'BGC', 'BGL', 'BGN', 'BGP', 'BGS', 'BHG', 'BM3', 'BM7', 'BMA', 'BMX', 'BND', 'BNG', 'BNX', 'BO1', 'BOG', 'BQY', 'BRI', 'BS7', 'BTG', 'BTU', 'BWG', 'BXF', 'BXP', 'BXX', 'BXY', 'BZD', 'C3B', 'C3G', 'C3X', 'C4B', 'C4W', 'C4X', 'C5X', 'CAP', 'CBF', 'CBI', 'CBK', 'CDR', 'CE5', 'CE6', 'CE8', 'CEG', 'CEX', 'CEY', 'CEZ', 'CGF', 'CJB', 'CKB', 'CKP', 'CNP', 'CR1', 'CR6', 'CRA', 'CT3', 'CTO', 'CTR', 'CTT', 'D0N', 'D1M', 'D5E', 'D6G', 'DAF', 'DAG', 'DAN', 'DDA', 'DDB', 'DDL', 'DEG', 'DEL', 'DFR', 'DFX', 'DG0', 'DGC', 'DGD', 'DGM', 'DGO', 'DGS', 'DGU', 'DIG', 'DJB', 'DJE', 'DK4', 'DKX', 'DKZ', 'DL6', 'DLD', 'DLF', 'DLG', 'DMU', 'DNO', 'DO8', 'DOM', 'DP5', 'DPC', 'DQQ', 'DQR', 'DR2', 'DR3', 'DR4', 'DR5', 'DRI', 'DSR', 'DT6', 'DVC', 'DYM', 'E3M', 'E4P', 'E5G', 'EAG', 'EBG', 'EBQ', 'EEN', 'EEQ', 'EGA', 'EJT', 'EMP', 'EMZ', 'EPG', 'EQP', 'EQV', 'ERE', 'ERI', 'ETT', 'EUS', 'F1P', 'F1X', 'F55', 'F58', 'F6P', 'F8X', 'FBP', 'FCA', 'FCB', 'FCT', 'FDP', 'FDQ', 'FFC', 'FFX', 'FIF', 'FIX', 'FK9', 'FKD', 'FMF', 'FMO', 'FNG', 'FNY', 'FRU', 'FSA', 'FSI', 'FSM', 'FSR', 'FSW', 'FU4', 'FUB', 'FUC', 'FUD', 'FUF', 'FUL', 'FUY', 'FVQ', 'FX1', 'FYJ', 'G0S', 'G16', 'G1P', 'G20', 'G28', 'G2F', 'G3F', 'G3I', 'G4D', 'G4S', 'G6D', 'G6P', 'G6S', 
'G7P', 'G8Z', 'GAA', 'GAC', 'GAD', 'GAF', 'GAL', 'GAT', 'GBH', 'GC1', 'GC4', 'GC9', 'GCB', 'GCD', 'GCN', 'GCO', 'GCS', 'GCT', 'GCU', 'GCV', 'GCW', 'GDA', 'GDL', 'GE1', 'GE3', 'GFP', 'GIV', 'GL0', 'GL1', 'GL2', 'GL4', 'GL5', 'GL6', 'GL7', 'GL9', 'GLA', 'GLB', 'GLC', 'GLD', 'GLF', 'GLG', 'GLO', 'GLP', 'GLS', 'GLT', 'GLW', 'GM0', 'GMB', 'GMH', 'GMT', 'GMZ', 'GN1', 'GN4', 'GNS', 'GNX', 'GP0', 'GP1', 'GP4', 'GPH', 'GPK', 'GPM', 'GPO', 'GPQ', 'GPU', 'GPV', 'GPW', 'GQ1', 'GRF', 'GRX', 'GS1', 'GS4', 'GS9', 'GSA', 'GSD', 'GTE', 'GTH', 'GTK', 'GTM', 'GTR', 'GU0', 'GU1', 'GU2', 'GU3', 'GU4', 'GU5', 'GU6', 'GU8', 'GU9', 'GUF', 'GUL', 'GUP', 'GUZ', 'GXL', 'GXV', 'GYE', 'GYG', 'GYP', 'GYU', 'GYV', 'GZL', 'H1M', 'H1S', 'H2P', 'H3S', 'H53', 'H6Q', 'H6Z', 'HBZ', 'HD4', 'HDL', 'HMS', 'HNV', 'HNW', 'HSG', 'HSH', 'HSJ', 'HSQ', 'HSR', 'HSU', 'HSX', 'HSY', 'HSZ', 'HTG', 'HTM', 'I57', 'IAB', 'IDC', 'IDF', 'IDG', 'IDR', 'IDS', 'IDT', 'IDU', 'IDX', 'IDY', 'IEM', 'IN1', 'IPT', 'ISD', 'ISL', 'ISX', 'IXD', 'J5B', 'JFZ', 'JHM', 'JLT', 'JRV', 'JS2', 'JSV', 'JV4', 'JVA', 'JVS', 'JZR', 'K5B', 'K99', 'KBA', 'KBG', 'KD5', 'KDA', 'KDB', 'KDD', 'KDE', 'KDF', 'KDM', 'KDN', 'KDO', 'KDR', 'KFN', 'KG1', 'KGM', 'KHP', 'KME', 'KO1', 'KO2', 'KOT', 'KTU', 'L1L', 'L6S', 'L6T', 'LAG', 'LAH', 'LAI', 'LAK', 'LAO', 'LAT', 'LB2', 'LBS', 'LBT', 'LCN', 'LDY', 'LEC', 'LER', 'LFC', 'LFR', 'LGC', 'LGU', 'LKA', 'LKS', 'LM2', 'LMO', 'LMT', 'LMU', 'LNV', 'LOG', 'LOX', 'LPK', 'LRH', 'LSM', 'LTG', 'LTM', 'LVO', 'LVZ', 'LXB', 'LXC', 'LXZ', 'LZ0', 'M1F', 'M1P', 'M2F', 'M3M', 'M3N', 'M55', 'M6D', 'M6P', 'M7B', 'M7P', 'M8C', 'MA1', 'MA2', 'MA3', 'MA8', 'MAB', 'MAF', 'MAG', 'MAL', 'MAN', 'MAT', 'MAV', 'MAW', 'MBE', 'MBF', 'MBG', 'MCU', 'MDA', 'MDP', 'MFA', 'MFB', 'MFU', 'MG5', 'MGA', 'MGC', 'MGL', 'MGS', 'MJJ', 'MLB', 'MLR', 'MMA', 'MMN', 'MN0', 'MNA', 'MQG', 'MQT', 'MRH', 'MRP', 'MSX', 'MTT', 'MUB', 'MUG', 'MUR', 'MVP', 'MXY', 'MXZ', 'MYG', 'N1L', 'N9S', 'NA1', 'NAA', 'NAG', 'NBG', 'NBX', 'NBY', 'NDG', 'NED', 'NFG', 'NG1', 
'NG6', 'NGA', 'NGB', 'NGC', 'NGE', 'NGF', 'NGK', 'NGL', 'NGR', 'NGS', 'NGY', 'NGZ', 'NHF', 'NLC', 'NM6', 'NM9', 'NNG', 'NPF', 'NSQ', 'NT1', 'NTF', 'NTO', 'NTP', 'NXD', 'NYT', 'O1G', 'OAK', 'OEL', 'OI7', 'OPM', 'ORP', 'OSU', 'OTG', 'OTN', 'OTU', 'OX2', 'P53', 'P6P', 'P8E', 'PA1', 'PA5', 'PAV', 'PDX', 'PH5', 'PKM', 'PNA', 'PNG', 'PNJ', 'PNW', 'PPC', 'PRP', 'PSG', 'PSJ', 'PSV', 'PTQ', 'PUF', 'PZU', 'QDK', 'QIF', 'QKH', 'QPS', 'QV4', 'R1P', 'R1X', 'R2B', 'R2G', 'R5P', 'RAA', 'RAE', 'RAF', 'RAM', 'RAO', 'RAT', 'RB5', 'RBL', 'RCD', 'RDP', 'REL', 'RER', 'RF5', 'RG1', 'RGG', 'RHA', 'RHC', 'RI2', 'RIB', 'RIP', 'RM4', 'RNS', 'RNT', 'ROB', 'ROR', 'RP3', 'RP5', 'RP6', 'RPA', 'RR7', 'RRJ', 'RRY', 'RST', 'RTG', 'RTV', 'RUB', 'RUG', 'RUU', 'RV7', 'RVG', 'RVM', 'RWI', 'RY7', 'RZM', 'S6P', 'S7P', 'S81', 'SA0', 'SCG', 'SCR', 'SDD', 'SDY', 'SEJ', 'SF6', 'SF9', 'SFJ', 'SFU', 'SG4', 'SG5', 'SG6', 'SG7', 'SGA', 'SGC', 'SGD', 'SGN', 'SGS', 'SHB', 'SHD', 'SHG', 'SI3', 'SIA', 'SID', 'SIO', 'SIZ', 'SLB', 'SLM', 'SLT', 'SMD', 'SN5', 'SNG', 'SOE', 'SOG', 'SOL', 'SOR', 'SR1', 'SSG', 'SSH', 'STW', 'STZ', 'SUC', 'SUP', 'SUS', 'SWE', 'SZZ', 'T68', 'T6D', 'T6P', 'T6T', 'TA6', 'TAG', 'TCB', 'TCG', 'TDG', 'TEU', 'TF0', 'TFU', 'TGA', 'TGK', 'TGR', 'TGY', 'TH1', 'TM5', 'TM6', 'TM9', 'TMR', 'TMX', 'TNX', 'TOA', 'TOC', 'TQY', 'TRE', 'TRV', 'TS8', 'TT7', 'TTV', 'TTZ', 'TU4', 'TUG', 'TUJ', 'TUP', 'TUR', 'TVD', 'TVG', 'TVM', 'TVS', 'TVV', 'TVY', 'TW7', 'TWA', 'TWD', 'TWG', 'TWJ', 'TWY', 'TXB', 'TYV', 'U1Y', 'U2A', 'U2D', 'U63', 'U8V', 'U97', 'U9A', 'U9D', 'U9G', 'U9J', 'U9M', 'UAP', 'UCD', 'UDC', 'UEA', 'V3M', 'V3P', 'V71', 'VG1', 'VJ1', 'VJ4', 'VKN', 'VTB', 'W9T', 'WIA', 'WOO', 'WUN', 'WZ1', 'WZ2', 'WZ4', 'X0X', 'X1P', 'X1X', 'X2F', 'X2Y', 'X34', 'X4S', 'X5S', 'X6X', 'X6Y', 'XBP', 'XDP', 'XDX', 'XGP', 'XIL', 'XKJ', 'XLF', 'XLS', 'XMM', 'XS2', 'XUL', 'XXM', 'XXR', 'XXX', 'XYB', 'XYF', 'XYL', 'XYP', 'XYS', 'XYT', 'XYZ', 'YDR', 'YIO', 'YJM', 'YKR', 'YO5', 'YX0', 'YX1', 'YYB', 'YYH', 'YYJ', 'YYK', 'YYM', 
'YYQ', 'YYR', 'YZ0', 'Z0F', 'Z15', 'Z16', 'Z2D', 'Z2T', 'Z3K', 'Z3L', 'Z3Q', 'Z3U', 'Z4K', 'Z4R', 'Z4S', 'Z4U', 'Z4V', 'Z4W', 'Z4Y', 'Z57', 'Z5J', 'Z5L', 'Z61', 'Z6H', 'Z6J', 'Z6W', 'Z8H', 'Z8T', 'Z9D', 'Z9E', 'Z9H', 'Z9K', 'Z9L', 'Z9M', 'Z9N', 'Z9W', 'ZB0', 'ZB1', 'ZB2', 'ZB3', 'ZCD', 'ZCZ', 'ZD0', 'ZDC', 'ZDM', 'ZDO', 'ZEE', 'ZEL', 'ZGE', 'ZMR', 'UMQ', 'SQD']); diff --git a/src/mol-model/structure/structure/carbohydrates/constants.ts b/src/mol-model/structure/structure/carbohydrates/constants.ts index dc8a1c449..1c0e60209 100644 --- a/src/mol-model/structure/structure/carbohydrates/constants.ts +++ b/src/mol-model/structure/structure/carbohydrates/constants.ts @@ -1,11 +1,12 @@ /** - * Copyright (c) 2018-2021 mol* contributors, licensed under MIT, See LICENSE file for more info. + * Copyright (c) 2018-2022 mol* contributors, licensed under MIT, See LICENSE file for more info. * * @author Alexander Rose <alexander.rose@weirdbyte.de> * @author David Sehnal <david.sehnal@gmail.com> */ import { Color, ColorMap } from '../../../../mol-util/color'; +import { SaccharideNames } from '../../model/types/saccharides'; // follows community standard from https://www.ncbi.nlm.nih.gov/glycans/snfg.html @@ -302,13 +303,6 @@ const CommonSaccharideNames: { [k: string]: string[] } = { Psi: ['PSV', 'SF6', 'SF9', 'TTV'], }; -const UnknownSaccharideNames = [ - 'NGZ', // via CCD - 'LAT', // BETA-LACTOSE, Gal-Glc di-saccharide via GlyFinder - - 'PUF', 'GDA', '9WJ', // via updated CCD -]; - /** * From http://glycam.org/docs/othertoolsservice/2016/06/09/3d-snfg-list-of-residue-names/#CHARMM */ @@ -354,9 +348,9 @@ export const SaccharideCompIdMap = (function () { } } } - for (let i = 0, il = UnknownSaccharideNames.length; i < il; ++i) { - map.set(UnknownSaccharideNames[i], UnknownSaccharideComponent); - } + SaccharideNames.forEach(name => { + if (!map.has(name)) map.set(name, UnknownSaccharideComponent); + }); return map; })(); -- GitLab