From 22e876a31f683cb53b3d4147f544b18b9705ce12 Mon Sep 17 00:00:00 2001 From: Alexander Rose <alex.rose@rcsb.org> Date: Fri, 17 Aug 2018 12:09:43 -0700 Subject: [PATCH] added app to create chemCompBond table, removed combine-mmcif app --- src/apps/chem-comp-bond/create-table.ts | 253 ++++++++++++++++++++++ src/apps/combine-mmcif/index.ts | 276 ------------------------ 2 files changed, 253 insertions(+), 276 deletions(-) create mode 100644 src/apps/chem-comp-bond/create-table.ts delete mode 100644 src/apps/combine-mmcif/index.ts diff --git a/src/apps/chem-comp-bond/create-table.ts b/src/apps/chem-comp-bond/create-table.ts new file mode 100644 index 000000000..8f5e0030f --- /dev/null +++ b/src/apps/chem-comp-bond/create-table.ts @@ -0,0 +1,253 @@ +/** + * Copyright (c) 2018 mol* contributors, licensed under MIT, See LICENSE file for more info. + * + * @author Alexander Rose <alexander.rose@weirdbyte.de> + */ + +import * as argparse from 'argparse' +import * as util from 'util' +import * as path from 'path' +import * as fs from 'fs' +import * as zlib from 'zlib' +import fetch from 'node-fetch' +require('util.promisify').shim() +const readFile = util.promisify(fs.readFile) +const writeFile = util.promisify(fs.writeFile) + +import { Progress } from 'mol-task' +import { Database, Table, DatabaseCollection, Column } from 'mol-data/db' +import CIF from 'mol-io/reader/cif' +import { CifWriter } from 'mol-io/writer/cif' +import { CCD_Schema } from 'mol-io/reader/cif/schema/ccd' +import { difference } from 'mol-util/set' +import { DefaultMap } from 'mol-util/map' + +export async function ensureAvailable(path: string, url: string) { + if (FORCE_DOWNLOAD || !fs.existsSync(path)) { + console.log(`downloading ${url}...`) + const data = await fetch(url) + if (!fs.existsSync(DATA_DIR)) { + fs.mkdirSync(DATA_DIR); + } + if (url.endsWith('.gz')) { + await writeFile(path, zlib.gunzipSync(await data.buffer())) + } else { + await writeFile(path, await data.text()) + } + console.log(`done downloading ${url}`) + } +} + +export async function ensureDataAvailable() { + await ensureAvailable(CCD_PATH, CCD_URL) + await ensureAvailable(PVCD_PATH, PVCD_URL) +} + +export async function readFileAsCollection<S extends Database.Schema>(path: string, schema: S) { + const parsed = await parseCif(await readFile(path, 'utf8')) + return CIF.toDatabaseCollection(schema, parsed.result) +} + +export async function readCCD() { + return readFileAsCollection(CCD_PATH, CCD_Schema) +} + +export async function readPVCD() { + return readFileAsCollection(PVCD_PATH, CCD_Schema) +} + +async function parseCif(data: string | Uint8Array) { + const comp = CIF.parse(data); + console.time('parse cif'); + const parsed = await comp.run(p => console.log(Progress.format(p)), 250); + console.timeEnd('parse cif'); + if (parsed.isError) throw parsed; + return parsed +} + +export function getEncodedCif(name: string, database: Database<Database.Schema>, binary = false) { + const encoder = CifWriter.createEncoder({ binary, encoderName: 'mol*' }); + CifWriter.Encoder.writeDatabase(encoder, name, database) + return encoder.getData(); +} + +type CCB = Table<CCD_Schema['chem_comp_bond']> +type CCA = Table<CCD_Schema['chem_comp_atom']> + +const ChemCompBond_Schema = { + ...CCD_Schema['chem_comp_bond'], + molstar_protonation_variant: Column.Schema.Str() +} + +function ccbKey(compId: string, atomId1: string, atomId2: string) { + return atomId1 < atomId2 ? `${compId}:${atomId1}-${atomId2}` : `${compId}:${atomId2}-${atomId1}` +} + +function addChemCompBondToSet(set: Set<string>, ccb: CCB) { + for (let i = 0, il = ccb._rowCount; i < il; ++i) { + set.add(ccbKey(ccb.comp_id.value(i), ccb.atom_id_1.value(i), ccb.atom_id_2.value(i))) + } + return set +} + +function addChemCompAtomToSet(set: Set<string>, cca: CCA) { + for (let i = 0, il = cca._rowCount; i < il; ++i) { + set.add(cca.atom_id.value(i)) + } + return set +} + +function checkAddingBondsFromPVCD(pvcd: DatabaseCollection<CCD_Schema>) { + const ccbSetByParent = DefaultMap<string, Set<string>>(() => new Set()) + + for (const k in pvcd) { + const { chem_comp, chem_comp_bond } = pvcd[k] + if (chem_comp_bond._rowCount) { + const parentIds = chem_comp.mon_nstd_parent_comp_id.value(0) + if (parentIds.length === 0) { + const set = ccbSetByParent.getDefault(chem_comp.id.value(0)) + addChemCompBondToSet(set, chem_comp_bond) + } else { + for (let i = 0, il = parentIds.length; i < il; ++i) { + const parentId = parentIds[i] + const set = ccbSetByParent.getDefault(parentId) + addChemCompBondToSet(set, chem_comp_bond) + } + } + } + } + + for (const k in pvcd) { + const { chem_comp, chem_comp_atom, chem_comp_bond } = pvcd[k] + if (chem_comp_bond._rowCount) { + const parentIds = chem_comp.mon_nstd_parent_comp_id.value(0) + if (parentIds.length > 0) { + for (let i = 0, il = parentIds.length; i < il; ++i) { + const entryBonds = addChemCompBondToSet(new Set<string>(), chem_comp_bond) + const entryAtoms = addChemCompAtomToSet(new Set<string>(), chem_comp_atom) + const extraBonds = difference(ccbSetByParent.get(parentIds[i])!, entryBonds) + extraBonds.forEach(bk => { + const [a1, a2] = bk.split('|') + if (entryAtoms.has(a1) && entryAtoms.has(a2)) { + console.error(`Adding all PVCD bonds would wrongly add bond ${bk} for ${k}`) + } + }) + } + } + } + } +} + +async function createBonds() { + await ensureDataAvailable() + const ccd = await readCCD() + const pvcd = await readPVCD() + + const ccbSet = new Set<string>() + + const comp_id: string[] = [] + const atom_id_1: string[] = [] + const atom_id_2: string[] = [] + const value_order: string[] = [] + const pdbx_aromatic_flag: string[] = [] + const pdbx_stereo_config: string[] = [] + const molstar_protonation_variant: string[] = [] + + function addBonds(compId: string, ccb: CCB, protonationVariant: boolean) { + for (let i = 0, il = ccb._rowCount; i < il; ++i) { + const atomId1 = ccb.atom_id_1.value(i) + const atomId2 = ccb.atom_id_2.value(i) + const k = ccbKey(compId, atomId1, atomId2) + if (!ccbSet.has(k)) { + atom_id_1.push(atomId1) + atom_id_2.push(atomId2) + comp_id.push(compId) + value_order.push(ccb.value_order.value(i)) + pdbx_aromatic_flag.push(ccb.pdbx_aromatic_flag.value(i)) + pdbx_stereo_config.push(ccb.pdbx_stereo_config.value(i)) + molstar_protonation_variant.push(protonationVariant ? 'Y' : 'N') + ccbSet.add(k) + } + } + } + + // check adding bonds from PVCD + checkAddingBondsFromPVCD(pvcd) + + // add bonds from PVCD + for (const k in pvcd) { + const { chem_comp, chem_comp_bond } = pvcd[k] + if (chem_comp_bond._rowCount) { + const parentIds = chem_comp.mon_nstd_parent_comp_id.value(0) + if (parentIds.length === 0) { + addBonds(chem_comp.id.value(0), chem_comp_bond, false) + } else { + for (let i = 0, il = parentIds.length; i < il; ++i) { + addBonds(parentIds[i], chem_comp_bond, true) + } + } + } + } + + // add bonds from CCD + for (const k in ccd) { + const { chem_comp, chem_comp_bond } = ccd[k] + if (chem_comp_bond._rowCount) { + addBonds(chem_comp.id.value(0), chem_comp_bond, false) + } + } + + const bondTable = Table.ofArrays(ChemCompBond_Schema, { + comp_id, atom_id_1, atom_id_2, value_order, + pdbx_aromatic_flag, pdbx_stereo_config, molstar_protonation_variant + }) + + const bondDatabase = Database.ofTables( + TABLE_NAME, + { chem_comp_bond: ChemCompBond_Schema }, + { chem_comp_bond: bondTable } + ) + + return bondDatabase +} + +async function run(out: string, binary = false) { + const bonds = await createBonds() + + const cif = getEncodedCif(TABLE_NAME, bonds, binary) + writeFile(out, cif) +} + +const TABLE_NAME = 'CHEM_COMP_BONDS' + +const DATA_DIR = path.join(__dirname, '..', '..', '..', 'data') +const CCD_PATH = path.join(DATA_DIR, 'components.cif') +const PVCD_PATH = path.join(DATA_DIR, 'aa-variants-v1.cif') +const CCD_URL = 'http://ftp.wwpdb.org/pub/pdb/data/monomers/components.cif' +const PVCD_URL = 'http://ftp.wwpdb.org/pub/pdb/data/monomers/aa-variants-v1.cif' + +const parser = new argparse.ArgumentParser({ + addHelp: true, + description: 'Create a cif file with one big table of all chem_comp_bond entries from the CCD and PVCD.' +}); +parser.addArgument('out', { + help: 'Generated file output path.' +}); +parser.addArgument([ '--forceDownload', '-f' ], { + action: 'storeTrue', + help: 'Force download of CCD and PVCD.' +}); +parser.addArgument([ '--binary', '-b' ], { + action: 'storeTrue', + help: 'Output as BinaryCIF.' +}); +interface Args { + out: string + forceDownload?: boolean + binary?: boolean +} +const args: Args = parser.parseArgs(); + +const FORCE_DOWNLOAD = args.forceDownload + +run(args.out, args.binary) diff --git a/src/apps/combine-mmcif/index.ts b/src/apps/combine-mmcif/index.ts deleted file mode 100644 index 862c3579d..000000000 --- a/src/apps/combine-mmcif/index.ts +++ /dev/null @@ -1,276 +0,0 @@ -/** - * Copyright (c) 2018 mol* contributors, licensed under MIT, See LICENSE file for more info. - * - * @author Alexander Rose <alexander.rose@weirdbyte.de> - */ - -import * as argparse from 'argparse' -import * as util from 'util' -import * as fs from 'fs' -import * as zlib from 'zlib' -import fetch from 'node-fetch' -require('util.promisify').shim(); -const readFile = util.promisify(fs.readFile); -const writeFile = util.promisify(fs.writeFile); - -import { Progress } from 'mol-task' -import { Database, Table, DatabaseCollection } from 'mol-data/db' -import CIF from 'mol-io/reader/cif' -// import { CCD_Schema } from 'mol-io/reader/cif/schema/ccd' -import { CifWriter } from 'mol-io/writer/cif' -import { mmCIF_Schema, mmCIF_Database } from 'mol-io/reader/cif/schema/mmcif'; -import { CCD_Schema } from 'mol-io/reader/cif/schema/ccd'; -import { BIRD_Schema } from 'mol-io/reader/cif/schema/bird'; -// import { Table } from 'mol-io/reader/csv/data-model'; - -// import { Model, Structure } from 'mol-model/structure' -// import to_mmCIF from 'mol-model/structure/export/mmcif' - -export async function ensureAvailable(path: string, url: string) { - if (FORCE_DOWNLOAD || !fs.existsSync(path)) { - console.log(`downloading ${url}...`) - const data = await fetch(url) - if (!fs.existsSync(DATA_DIR)) { - fs.mkdirSync(DATA_DIR); - } - if (url.endsWith('.gz')) { - await writeFile(path, zlib.gunzipSync(await data.buffer())) - } else { - await writeFile(path, await data.text()) - } - console.log(`done downloading ${url}`) - } -} - -export async function ensureDataAvailable() { - await ensureAvailable(CCD_PATH, CCD_URL) - await ensureAvailable(PVCD_PATH, PVCD_URL) - await ensureAvailable(BIRD_PATH, BIRD_URL) -} - -export async function readFileAsCollection<S extends Database.Schema>(path: string, schema: S) { - const parsed = await parseCif(await readFile(path, 'utf8')) - return CIF.toDatabaseCollection(schema, parsed.result) -} - -export async function readCCD() { - return readFileAsCollection(CCD_PATH, CCD_Schema) -} - -export async function readPVCD() { - return readFileAsCollection(PVCD_PATH, CCD_Schema) -} - -export async function readBIRD() { - return readFileAsCollection(BIRD_PATH, BIRD_Schema) -} - -export async function getCCD() { - await ensureDataAvailable() - return readPVCD() -} - -export async function getBIRD() { - await ensureDataAvailable() - return readBIRD() -} - -async function parseCif(data: string|Uint8Array) { - const comp = CIF.parse(data); - console.time('parse cif'); - const parsed = await comp.run(p => console.log(Progress.format(p)), 250); - console.timeEnd('parse cif'); - if (parsed.isError) throw parsed; - return parsed -} - -export function getEncodedCif(name: string, database: Database<Database.Schema>, binary = false) { - const encoder = CifWriter.createEncoder({ binary, encoderName: 'mol*' }); - CifWriter.Encoder.writeDatabase(encoder, name, database) - return encoder.getData(); -} - -export async function getPdb(pdb: string) { - console.log(`downloading ${pdb}...`) - const data = await fetch(`https://files.rcsb.org/download/${pdb}.cif`) - console.log(`done downloading ${pdb}`) - - const parsed = await parseCif(await data.text()) - return CIF.schema.mmCIF(parsed.result.blocks[0]) -} - -type extraTables = { - chem_comp_bond: Table<mmCIF_Schema['chem_comp_bond']>, - pdbx_reference_entity_list: Table<mmCIF_Schema['pdbx_reference_entity_list']>, - pdbx_reference_entity_link: Table<mmCIF_Schema['pdbx_reference_entity_link']>, - pdbx_reference_entity_poly_link: Table<mmCIF_Schema['pdbx_reference_entity_poly_link']>, -} -type extraTablesLists = { - [k in keyof extraTables]: extraTables[k][] -} - -export function getExtraTables(mmcif: mmCIF_Database, ccd: DatabaseCollection<CCD_Schema>, bird: DatabaseCollection<BIRD_Schema>) { - const extraTablesLists: extraTablesLists = { - chem_comp_bond: [], - pdbx_reference_entity_list: [], - pdbx_reference_entity_link: [], - pdbx_reference_entity_poly_link: [] - } - - for (let i = 0, n = mmcif.chem_comp._rowCount; i < n; ++i) { - const ccdId = mmcif.chem_comp.id.value(i); - if (ccdId in ccd) { - extraTablesLists.chem_comp_bond.push(ccd[ccdId].chem_comp_bond) - } else { - console.error(`ccdId ${ccdId} not found`) - } - } - - for (let i = 0, n = mmcif.pdbx_molecule_features._rowCount; i < n; ++i) { - const birdId = mmcif.pdbx_molecule_features.prd_id.value(i); - if (birdId in bird) { - const e = bird[birdId] - extraTablesLists.pdbx_reference_entity_list.push(e.pdbx_reference_entity_list) - extraTablesLists.pdbx_reference_entity_link.push(e.pdbx_reference_entity_link) - extraTablesLists.pdbx_reference_entity_poly_link.push(e.pdbx_reference_entity_poly_link) - } else { - console.error(`birdId ${birdId} not found`) - } - } - - const extraTables: extraTables = Object.assign({}, ...Object.keys(extraTablesLists).map(k => { - // TODO how to avoid type casting? - return { [k]: Table.concat((extraTablesLists as any)[k], (mmCIF_Schema as any)[k]) } - })) - - return extraTables -} - -type PartialStructConnRow = Partial<Table.Row<mmCIF_Schema['struct_conn']>> - -export function getStructConnValueOrder(value: any): mmCIF_Schema['struct_conn']['pdbx_value_order']['T'] { - return mmCIF_Schema.struct_conn.pdbx_value_order['T'].includes(value) ? value : 'sing' -} - -export function getBirdBonds(mmcif: mmCIF_Database) { - const bonds: PartialStructConnRow[] = [] - - const mol = mmcif.pdbx_molecule - const molFeat = mmcif.pdbx_molecule_features - for (let i = 0, n = molFeat._rowCount; i < n; ++i) { - // console.log(Table.getRow(molFeat, i)) - const instancesAsymIdList: { [k: number]: string[] } = {} - for (let j = 0, m = mol._rowCount; j < m; ++j) { - if (mol.prd_id.value(j) === molFeat.prd_id.value(i)) { - // console.log(Table.getRow(mol, j)) - const instanceId = mol.instance_id.value(j) - if (instancesAsymIdList[instanceId] === undefined) { - instancesAsymIdList[instanceId] = [] - } - instancesAsymIdList[instanceId].push(mol.asym_id.value(j)) - } - } - // console.log(instancesAsymIdList) - const entityLink = mmcif.pdbx_reference_entity_link - for (const instanceId of Object.keys(instancesAsymIdList)) { - const asymIdList = instancesAsymIdList[instanceId as any] - for (let j = 0, m = entityLink._rowCount; j < m; ++j) { - if (entityLink.prd_id.value(j) === molFeat.prd_id.value(i)) { - // console.log(Table.getRow(entityLink, j)) - const link: PartialStructConnRow = { - ptnr1_label_asym_id: asymIdList[ entityLink.component_1.value(j) - 1 ], - ptnr1_label_atom_id: entityLink.atom_id_1.value(j), - ptnr1_label_comp_id: entityLink.comp_id_1.value(j), - ptnr1_label_seq_id: entityLink.entity_seq_num_1.value(j), - - ptnr2_label_asym_id: asymIdList[ entityLink.component_2.value(j) - 1 ], - ptnr2_label_atom_id: entityLink.atom_id_2.value(j), - ptnr2_label_comp_id: entityLink.comp_id_2.value(j), - ptnr2_label_seq_id: entityLink.entity_seq_num_2.value(j), - - pdbx_value_order: getStructConnValueOrder(entityLink.value_order.value(j)), - } - // console.log(link) - bonds.push(link) - } - } - } - } - return bonds -} - -export function getCcdBonds(mmcif: mmCIF_Database) { - // const bonds: PartialStructConnRow[] = [] -} - -async function run(pdb: string, out?: string) { - const ccd = await getCCD() - const bird = await getBIRD() - - const mmcif = await getPdb(pdb) - // console.log(mmcif.chem_comp.id.toArray()) - - for (const k of Object.keys(bird)) { - const entity = bird[k].pdbx_reference_entity_list - for (let i = 0, n = entity._rowCount; i < n; ++i) { - if (entity.ref_entity_id.value(i) !== entity.component_id.value(i).toString()) { - console.log(Table.getRow(entity, i)) - } - } - - const link = bird[k].pdbx_reference_entity_link - for (let i = 0, n = link._rowCount; i < n; ++i) { - if (link.value_order.value(i) !== 'sing') { - console.log(Table.getRow(link, i)) - } - } - - const polyLink = bird[k].pdbx_reference_entity_poly_link - for (let i = 0, n = link._rowCount; i < n; ++i) { - if (polyLink.value_order.value(i) !== 'sing') { - console.log(Table.getRow(polyLink, i)) - } - } - } - - const extraTables = getExtraTables(mmcif, ccd, bird) - const combinedMmcif = Database.ofTables('mmcif_combined', mmCIF_Schema, Object.assign({}, mmcif, extraTables)) - // console.log(getEncodedCif(pdb, combinedMmcif)) - // console.log(Database.getTablesAsRows(combinedMmcif)) - - // console.log(getBirdBonds(combinedMmcif)) - console.log(getCcdBonds(combinedMmcif)) -} - -const DATA_DIR = './build/data' -const CCD_PATH = `${DATA_DIR}/components.cif` -const PVCD_PATH = `${DATA_DIR}/aa-variants-v1.cif` -const BIRD_PATH = `${DATA_DIR}/prd-all.cif` -const CCD_URL = 'http://ftp.wwpdb.org/pub/pdb/data/monomers/components.cif' -const PVCD_URL = 'http://ftp.wwpdb.org/pub/pdb/data/monomers/aa-variants-v1.cif' -const BIRD_URL = 'http://ftp.wwpdb.org/pub/pdb/data/bird/prd/prd-all.cif.gz' - -const parser = new argparse.ArgumentParser({ - addHelp: true, - description: 'Create a mmcif file that includes relevant CCD and BIRD entries' -}); -parser.addArgument([ '--pdb', '-p' ], { - help: 'Pdb entry id' -}); -parser.addArgument([ '--out', '-o' ], { - help: 'Generated file output path, if not given printed to stdout' -}); -parser.addArgument([ '--forceDownload', '-f' ], { - action: 'storeTrue', - help: 'Force download of CCD and BIRD' -}); -interface Args { - pdb: string - out: string - forceDownload: boolean -} -const args: Args = parser.parseArgs(); - -const FORCE_DOWNLOAD = args.forceDownload - -run(args.pdb, args.out) -- GitLab