diff --git a/src/apps/combine-mmcif/index.ts b/src/apps/combine-mmcif/index.ts index e20356c6e51ce3cfd16d0f962aa2a752a6d93efd..d43d85917587b0559024a6ed498968e697f5db03 100644 --- a/src/apps/combine-mmcif/index.ts +++ b/src/apps/combine-mmcif/index.ts @@ -7,17 +7,20 @@ import * as argparse from 'argparse' import * as util from 'util' import * as fs from 'fs' +import * as zlib from 'zlib' import fetch from 'node-fetch' require('util.promisify').shim(); -const readFileAsync = util.promisify(fs.readFile); +const readFile = util.promisify(fs.readFile); +const writeFile = util.promisify(fs.writeFile); -import { DatabaseCollection, Database, Table } from 'mol-data/db' +import { Database, Table, DatabaseCollection } from 'mol-data/db' import CIF from 'mol-io/reader/cif' // import { CCD_Schema } from 'mol-io/reader/cif/schema/ccd' import * as Encoder from 'mol-io/writer/cif' import Computation from 'mol-util/computation' -import { mmCIF_Schema } from 'mol-io/reader/cif/schema/mmcif'; +import { mmCIF_Schema, mmCIF_Database } from 'mol-io/reader/cif/schema/mmcif'; import { CCD_Schema } from 'mol-io/reader/cif/schema/ccd'; +import { BIRD_Schema } from 'mol-io/reader/cif/schema/bird'; // import { Table } from 'mol-io/reader/csv/data-model'; // import { Model, Structure } from 'mol-model/structure' @@ -30,7 +33,11 @@ export async function ensureAvailable(path: string, url: string) { if (!fs.existsSync(DATA_DIR)) { fs.mkdirSync(DATA_DIR); } - fs.writeFileSync(path, await data.text()) + if (url.endsWith('.gz')) { + await writeFile(path, zlib.gunzipSync(await data.buffer())) + } else { + await writeFile(path, await data.text()) + } console.log(`done downloading ${url}`) } } @@ -38,30 +45,28 @@ export async function ensureAvailable(path: string, url: string) { export async function ensureDataAvailable() { await ensureAvailable(CCD_PATH, CCD_URL) await ensureAvailable(PVCD_PATH, PVCD_URL) + await ensureAvailable(BIRD_PATH, BIRD_URL) } function showProgress(tag: string, p: Computation.Progress) { console.log(`[${tag}] ${p.message} ${p.isIndeterminate ? '' : (p.current / p.max * 100).toFixed(2) + '% '}(${p.elapsedMs | 0}ms)`) } -export async function readCCD(bcif = false) { - const parsed = await parseCif(await readFileAsync(CCD_PATH, 'utf8')) - const ccd: DatabaseCollection<CCD_Schema> = {} - for (const data of parsed.result.blocks) { - // console.log(data.header) - ccd[data.header] = CIF.schema.CCD(data) - } - return ccd; +export async function readFileAsCollection<S extends Database.Schema>(path: string, schema: S) { + const parsed = await parseCif(await readFile(path, 'utf8')) + return CIF.toDatabaseCollection(schema, parsed.result) } -export async function readPVCD(bcif = false) { - const parsed = await parseCif(await readFileAsync(PVCD_PATH, 'utf8')) - const ccd: DatabaseCollection<CCD_Schema> = {} - for (const data of parsed.result.blocks) { - // console.log(data.header) - ccd[data.header] = CIF.schema.CCD(data) - } - return ccd; +export async function readCCD() { + return readFileAsCollection(CCD_PATH, CCD_Schema) +} + +export async function readPVCD() { + return readFileAsCollection(PVCD_PATH, CCD_Schema) +} + +export async function readBIRD() { + return readFileAsCollection(BIRD_PATH, BIRD_Schema) } export async function getCCD() { @@ -69,6 +74,11 @@ export async function getCCD() { return readPVCD() } +export async function getBIRD() { + await ensureDataAvailable() + return readBIRD() +} + async function parseCif(data: string|Uint8Array) { const comp = CIF.parse(data) const ctx = Computation.observable({ @@ -97,53 +107,157 @@ export async function getPdb(pdb: string) { return CIF.schema.mmCIF(parsed.result.blocks[0]) } +type extraTables = { + chem_comp_bond: Table<mmCIF_Schema['chem_comp_bond']>, + pdbx_reference_entity_list: Table<mmCIF_Schema['pdbx_reference_entity_list']>, + pdbx_reference_entity_link: Table<mmCIF_Schema['pdbx_reference_entity_link']>, + pdbx_reference_entity_poly_link: Table<mmCIF_Schema['pdbx_reference_entity_poly_link']>, +} +type extraTablesLists = { + [k in keyof extraTables]: extraTables[k][] +} + +export function getExtraTables(mmcif: mmCIF_Database, ccd: DatabaseCollection<CCD_Schema>, bird: DatabaseCollection<BIRD_Schema>) { + const extraTablesLists: extraTablesLists = { + chem_comp_bond: [], + pdbx_reference_entity_list: [], + pdbx_reference_entity_link: [], + pdbx_reference_entity_poly_link: [] + } + + for (let i = 0, n = mmcif.chem_comp._rowCount; i < n; ++i) { + const ccdId = mmcif.chem_comp.id.value(i); + if (ccdId in ccd) { + extraTablesLists.chem_comp_bond.push(ccd[ccdId].chem_comp_bond) + } else { + console.error(`ccdId ${ccdId} not found`) + } + } + + for (let i = 0, n = mmcif.pdbx_molecule_features._rowCount; i < n; ++i) { + const birdId = mmcif.pdbx_molecule_features.prd_id.value(i); + if (birdId in bird) { + const e = bird[birdId] + extraTablesLists.pdbx_reference_entity_list.push(e.pdbx_reference_entity_list) + extraTablesLists.pdbx_reference_entity_link.push(e.pdbx_reference_entity_link) + extraTablesLists.pdbx_reference_entity_poly_link.push(e.pdbx_reference_entity_poly_link) + } else { + console.error(`birdId ${birdId} not found`) + } + } + + const extraTables: extraTables = Object.assign({}, ...Object.keys(extraTablesLists).map(k => { + // TODO how to avoid type casting? + return { [k]: Table.concat((extraTablesLists as any)[k], (mmCIF_Schema as any)[k]) } + })) + + return extraTables +} + +type PartialStructConnRow = Partial<Table.Row<mmCIF_Schema['struct_conn']>> + +export function getStructConnValueOrder(value: any): mmCIF_Schema['struct_conn']['pdbx_value_order']['T'] { + return mmCIF_Schema.struct_conn.pdbx_value_order['T'].includes(value) ? value : 'sing' +} + +export function getBirdBonds(mmcif: mmCIF_Database) { + const bonds: PartialStructConnRow[] = [] + + const mol = mmcif.pdbx_molecule + const molFeat = mmcif.pdbx_molecule_features + for (let i = 0, n = molFeat._rowCount; i < n; ++i) { + // console.log(Table.getRow(molFeat, i)) + const instancesAsymIdList: { [k: number]: string[] } = {} + for (let j = 0, m = mol._rowCount; j < m; ++j) { + if (mol.prd_id.value(j) === molFeat.prd_id.value(i)) { + // console.log(Table.getRow(mol, j)) + const instanceId = mol.instance_id.value(j) + if (instancesAsymIdList[instanceId] === undefined) { + instancesAsymIdList[instanceId] = [] + } + instancesAsymIdList[instanceId].push(mol.asym_id.value(j)) + } + } + // console.log(instancesAsymIdList) + const entityLink = mmcif.pdbx_reference_entity_link + for (const instanceId of Object.keys(instancesAsymIdList)) { + const asymIdList = instancesAsymIdList[instanceId as any] + for (let j = 0, m = entityLink._rowCount; j < m; ++j) { + if (entityLink.prd_id.value(j) === molFeat.prd_id.value(i)) { + // console.log(Table.getRow(entityLink, j)) + const link: PartialStructConnRow = { + ptnr1_label_asym_id: asymIdList[ entityLink.component_1.value(j) - 1 ], + ptnr1_label_atom_id: entityLink.atom_id_1.value(j), + ptnr1_label_comp_id: entityLink.comp_id_1.value(j), + ptnr1_label_seq_id: entityLink.entity_seq_num_1.value(j), + + ptnr2_label_asym_id: asymIdList[ entityLink.component_2.value(j) - 1 ], + ptnr2_label_atom_id: entityLink.atom_id_2.value(j), + ptnr2_label_comp_id: entityLink.comp_id_2.value(j), + ptnr2_label_seq_id: entityLink.entity_seq_num_2.value(j), + + pdbx_value_order: getStructConnValueOrder(entityLink.value_order.value(j)), + } + // console.log(link) + bonds.push(link) + } + } + } + } + return bonds +} + +export function getCcdBonds(mmcif: mmCIF_Database) { + const bonds: PartialStructConnRow[] = [] + +} + async function run(pdb: string, out?: string) { const ccd = await getCCD() - // console.log(Object.keys(ccd).length) + const bird = await getBIRD() const mmcif = await getPdb(pdb) // console.log(mmcif.chem_comp.id.toArray()) - // const chemCompBond_Schema = { chem_comp_bond: mmCIF_Schema.chem_comp_bond } - // type chemCompBond_Schema = typeof chemCompBond_Schema; - // type chemCompBond_Database = Database<chemCompBond_Schema>; - // interface chemCompBond_Database extends Database<chemCompBond_Schema> {} + for (const k of Object.keys(bird)) { + const entity = bird[k].pdbx_reference_entity_list + for (let i = 0, n = entity._rowCount; i < n; ++i) { + if (entity.ref_entity_id.value(i) !== entity.component_id.value(i).toString()) { + console.log(Table.getRow(entity, i)) + } + } - const chemCompBondTables: Table<mmCIF_Schema['chem_comp_bond']>[] = [] + const link = bird[k].pdbx_reference_entity_link + for (let i = 0, n = link._rowCount; i < n; ++i) { + if (link.value_order.value(i) !== 'sing') { + console.log(Table.getRow(link, i)) + } + } - for (let i = 0, n = mmcif.chem_comp._rowCount; i < n; ++i) { - const ccdId = mmcif.chem_comp.id.value(i); - // console.log(ccdId) - if (ccdId in ccd) { - // console.log(`ccdId ${ccdId} has ${ccd[ccdId].chem_comp_atom._rowCount} atoms`) - // console.log(ccd[ccdId].chem_comp_bond.atom_id_1.toArray()) - chemCompBondTables.push(ccd[ccdId].chem_comp_bond) - } else { - console.error(`ccdId ${ccdId} not found`) + const polyLink = bird[k].pdbx_reference_entity_poly_link + for (let i = 0, n = link._rowCount; i < n; ++i) { + if (polyLink.value_order.value(i) !== 'sing') { + console.log(Table.getRow(polyLink, i)) + } } } - // for (const k of Object.keys(ccd)) { - // console.log(k) - // } + const extraTables = getExtraTables(mmcif, ccd, bird) + const combinedMmcif = Database.ofTables('mmcif_combined', mmCIF_Schema, Object.assign({}, mmcif, extraTables)) + // console.log(getEncodedCif(pdb, combinedMmcif)) + // console.log(Database.getTablesAsRows(combinedMmcif)) - // const combinedChemCompBonds = Database.ofTables('chemCompBond', chemCompBond_Schema, { - // chem_comp_bond: Table.concat(chemCompBondTables, mmCIF_Schema.chem_comp_bond) - // }) - // console.log('concat done') - // console.log(getEncodedCif('chemCompBond', combinedChemCompBonds)) - const combinedMmcif = Database.ofTables('chemCompBond', mmCIF_Schema, Object.assign({}, - mmcif, - { chem_comp_bond: Table.concat(chemCompBondTables, mmCIF_Schema.chem_comp_bond) } - )) - console.log(getEncodedCif(pdb, combinedMmcif)) + // console.log(getBirdBonds(combinedMmcif)) + console.log(getCcdBonds(combinedMmcif)) } const DATA_DIR = './build/data' const CCD_PATH = `${DATA_DIR}/components.cif` const PVCD_PATH = `${DATA_DIR}/aa-variants-v1.cif` +const BIRD_PATH = `${DATA_DIR}/prd-all.cif` const CCD_URL = 'http://ftp.wwpdb.org/pub/pdb/data/monomers/components.cif' const PVCD_URL = 'http://ftp.wwpdb.org/pub/pdb/data/monomers/aa-variants-v1.cif' +const BIRD_URL = 'http://ftp.wwpdb.org/pub/pdb/data/bird/prd/prd-all.cif.gz' const parser = new argparse.ArgumentParser({ addHelp: true,