diff --git a/src/mol-model-formats/structure/common/component.ts b/src/mol-model-formats/structure/common/component.ts index 3b9624bcf97133f8f078cbb9104abd7d568ba132..30f201b86c2f344560749e26fdd71ecdbb34f46a 100644 --- a/src/mol-model-formats/structure/common/component.ts +++ b/src/mol-model-formats/structure/common/component.ts @@ -73,6 +73,7 @@ const StandardComponents = (function() { })() export class ComponentBuilder { + private namesMap = new Map<string, string>() private comps = new Map<string, Component>() private ids: string[] = [] private names: string[] = [] @@ -129,8 +130,8 @@ export class ComponentBuilder { } else if (WaterNames.has(compId)) { this.set({ id: compId, name: 'WATER', type: 'non-polymer' }) } else { - const atomIds = this.getAtomIds(index) - this.set({ id: compId, name: compId, type: this.getType(atomIds) }) + const type = this.getType(this.getAtomIds(index)) + this.set({ id: compId, name: this.namesMap.get(compId) || compId, type }) } } return this.get(compId)! @@ -145,6 +146,10 @@ export class ComponentBuilder { return CifCategory.ofFields('chem_comp', chemComp) } + setNames(names: [string, string][]) { + names.forEach(n => this.namesMap.set(n[0], n[1])) + } + constructor(private seqId: Column<number>, private atomId: Column<string>) { } diff --git a/src/mol-model-formats/structure/common/entity.ts b/src/mol-model-formats/structure/common/entity.ts new file mode 100644 index 0000000000000000000000000000000000000000..f416aaeac0cb6659f2a91759b12e6a3d5587201d --- /dev/null +++ b/src/mol-model-formats/structure/common/entity.ts @@ -0,0 +1,80 @@ +/** + * Copyright (c) 2019 mol* contributors, licensed under MIT, See LICENSE file for more info. + * + * @author Alexander Rose <alexander.rose@weirdbyte.de> + */ + +import { CifCategory, CifField } from '../../../mol-io/reader/cif'; +import { MoleculeType, isPolymer } from '../../../mol-model/structure/model/types'; +import { mmCIF_Schema } from '../../../mol-io/reader/cif/schema/mmcif'; + +export type EntityCompound = { chains: string[], description: string } + +export class EntityBuilder { + private count = 0 + private ids: string[] = [] + private types: string[] = [] + private descriptions: string[] = [] + + private compoundsMap = new Map<string, string>() + private namesMap = new Map<string, string>() + private heteroMap = new Map<string, string>() + private chainMap = new Map<string, string>() + private waterId?: string + + private set(type: string, description: string) { + this.count += 1 + this.ids.push(`${this.count}`) + this.types.push(type) + this.descriptions.push(description) + } + + getEntityId(compId: string, moleculeType: MoleculeType, chainId: string): string { + if (moleculeType === MoleculeType.water) { + if (this.waterId === undefined) { + this.set('water', 'Water') + this.waterId = `${this.count}` + } + return this.waterId; + } else if (isPolymer(moleculeType)) { + if (this.compoundsMap.has(chainId)) { + return this.compoundsMap.get(chainId)! + } else { + if (!this.chainMap.has(chainId)) { + this.set('polymer', `Polymer ${this.chainMap.size + 1}`) + this.chainMap.set(chainId, `${this.count}`) + } + return this.chainMap.get(chainId)! + } + } else { + if (!this.heteroMap.has(compId)) { + this.set('non-polymer', this.namesMap.get(compId) || compId) + this.heteroMap.set(compId, `${this.count}`) + } + return this.heteroMap.get(compId)! + } + } + + getEntityCategory() { + const entity: CifCategory.SomeFields<mmCIF_Schema['entity']> = { + id: CifField.ofStrings(this.ids), + type: CifField.ofStrings(this.types), + pdbx_description: CifField.ofStrings(this.descriptions), + } + return CifCategory.ofFields('entity', entity) + } + + setCompounds(compounds: EntityCompound[]) { + for (let i = 0, il = compounds.length; i < il; ++i) { + const { chains, description } = compounds[i] + this.set('polymer', description) + for (let j = 0, jl = chains.length; j < jl; ++j) { + this.compoundsMap.set(chains[j], `${this.count}`) + } + } + } + + setNames(names: [string, string][]) { + names.forEach(n => this.namesMap.set(n[0], n[1])) + } +} \ No newline at end of file diff --git a/src/mol-model-formats/structure/gro.ts b/src/mol-model-formats/structure/gro.ts index 1b82cc9fca2a0b43589def0ad99d1ca66ec6663f..0ed74bf3399810bfb2a1293e34a01e6bb7f9be04 100644 --- a/src/mol-model-formats/structure/gro.ts +++ b/src/mol-model-formats/structure/gro.ts @@ -13,61 +13,13 @@ import { CifCategory, CifField } from '../../mol-io/reader/cif'; import { Column } from '../../mol-data/db'; import { mmCIF_Schema } from '../../mol-io/reader/cif/schema/mmcif'; import { guessElementSymbolString } from './util'; -import { MoleculeType, getMoleculeType, isPolymer } from '../../mol-model/structure/model/types'; +import { MoleculeType, getMoleculeType } from '../../mol-model/structure/model/types'; import { ComponentBuilder } from './common/component'; import { getChainId } from './common/util'; +import { EntityBuilder } from './common/entity'; // TODO multi model files -class EntityBuilder { - private count = 0 - private ids: string[] = [] - private types: string[] = [] - private descriptions: string[] = [] - - private heteroMap = new Map<string, string>() - private chainMap = new Map<string, string>() - private waterId?: string - - private set(type: string, description: string) { - this.count += 1 - this.ids.push(`${this.count}`) - this.types.push(type) - this.descriptions.push(description) - } - - getEntityId(compId: string, moleculeType: MoleculeType, chainId: string): string { - if (moleculeType === MoleculeType.water) { - if (this.waterId === undefined) { - this.set('water', 'Water') - this.waterId = `${this.count}` - } - return this.waterId; - } else if (isPolymer(moleculeType)) { - if (!this.chainMap.has(chainId)) { - this.set('polymer', `Polymer ${this.chainMap.size + 1}`) - this.chainMap.set(chainId, `${this.count}`) - } - return this.chainMap.get(chainId)! - } else { - if (!this.heteroMap.has(compId)) { - this.set('non-polymer', compId) - this.heteroMap.set(compId, `${this.count}`) - } - return this.heteroMap.get(compId)! - } - } - - getEntityCategory() { - const entity: CifCategory.SomeFields<mmCIF_Schema['entity']> = { - id: CifField.ofStrings(this.ids), - type: CifField.ofStrings(this.types), - pdbx_description: CifField.ofStrings(this.descriptions), - } - return CifCategory.ofFields('entity', entity) - } -} - function getCategories(atoms: GroAtoms) { const auth_atom_id = CifField.ofColumn(atoms.atomName) const auth_comp_id = CifField.ofColumn(atoms.residueName) diff --git a/src/mol-model-formats/structure/pdb/entity.ts b/src/mol-model-formats/structure/pdb/entity.ts index 97ecd5d44d6baf8c30b596965b71553504eccef5..3402b32e960930964b157f8f28a955ddcdb53ffa 100644 --- a/src/mol-model-formats/structure/pdb/entity.ts +++ b/src/mol-model-formats/structure/pdb/entity.ts @@ -5,8 +5,7 @@ */ import { Tokens } from '../../../mol-io/reader/common/text/tokenizer'; -import { CifCategory, CifField } from '../../../mol-io/reader/cif'; -import { WaterNames } from '../../../mol-model/structure/model/types'; +import { EntityCompound } from '../common/entity'; const Spec = { 'MOL_ID': '', @@ -21,14 +20,12 @@ const Spec = { } type Spec = keyof typeof Spec -type Compound = { chains: string[], name: string } - export function parseCmpnd(lines: Tokens, lineStart: number, lineEnd: number) { const getLine = (n: number) => lines.data.substring(lines.indices[2 * n], lines.indices[2 * n + 1]) let currentSpec: Spec | undefined - let currentCompound: Compound = { chains: [], name: '' } - const Compounds: Compound[] = [] + let currentCompound: EntityCompound = { chains: [], description: '' } + const Compounds: EntityCompound[] = [] for (let i = lineStart; i < lineEnd; i++) { let line = getLine(i) @@ -56,12 +53,12 @@ export function parseCmpnd(lines: Tokens, lineStart: number, lineEnd: number) { if (currentSpec === 'MOL_ID') { currentCompound = { chains: [], - name: '' + description: '' } Compounds.push(currentCompound) } else if (currentSpec === 'MOLECULE') { - if (currentCompound.name) currentCompound.name += ' ' - currentCompound.name += value + if (currentCompound.description) currentCompound.description += ' ' + currentCompound.description += value } else if (currentSpec === 'CHAIN') { Array.prototype.push.apply(currentCompound.chains, value.split(/\s*,\s*/)) } @@ -70,66 +67,29 @@ export function parseCmpnd(lines: Tokens, lineStart: number, lineEnd: number) { return Compounds } -export class EntityBuilder { - private count = 0 - private ids: string[] = [] - private types: string[] = [] - private descriptions: string[] = [] - - private compoundsMap = new Map<string, string>() - private heteroMap = new Map<string, string>() - private chainMap = new Map<string, string>() - private waterId?: string - - private set(type: string, description: string) { - this.count += 1 - this.ids.push(`${this.count}`) - this.types.push(type) - this.descriptions.push(description) - } +export function parseHetnam(lines: Tokens, lineStart: number, lineEnd: number) { + const getLine = (n: number) => lines.data.substring(lines.indices[2 * n], lines.indices[2 * n + 1]) - getEntityId(residueName: string, chainId: string, isHet: boolean): string { - if (isHet) { - if (WaterNames.has(residueName)) { - if (this.waterId === undefined) { - this.set('water', 'Water') - this.waterId = `${this.count}` - } - return this.waterId; - } else { - if (!this.heteroMap.has(residueName)) { - this.set('non-polymer', residueName) - this.heteroMap.set(residueName, `${this.count}`) - } - return this.heteroMap.get(residueName)! - } - } else if (this.compoundsMap.has(chainId)) { - return this.compoundsMap.get(chainId)! - } else { - if (!this.chainMap.has(chainId)) { - this.set('polymer', chainId) - this.chainMap.set(chainId, `${this.count}`) - } - return this.chainMap.get(chainId)! - } - } + const hetnams = new Map<string, string>() - getEntityCategory() { - const entity = { - id: CifField.ofStrings(this.ids), - type: CifField.ofStrings(this.types), - pdbx_description: CifField.ofStrings(this.descriptions) + for (let i = lineStart; i < lineEnd; i++) { + let line = getLine(i) + // COLUMNS DATA TYPE FIELD DEFINITION + // ---------------------------------------------------------------------------- + // 1 - 6 Record name "HETNAM" + // 9 - 10 Continuation continuation Allows concatenation of multiple records. + // 12 - 14 LString(3) hetID Het identifier, right-justified. + // 16 - 70 String text Chemical name. + + const het = line.substr(11, 3).trim() + const name = line.substr(15).trim() + + if (hetnams.has(het)) { + hetnams.set(het, `${hetnams.get(het)!} ${name}`) + } else { + hetnams.set(het, name) } - return CifCategory.ofFields('entity', entity) } - setCompounds(compounds: Compound[]) { - for (let i = 0, il = compounds.length; i < il; ++i) { - const { chains, name } = compounds[i] - this.set('polymer', name) - for (let j = 0, jl = chains.length; j < jl; ++j) { - this.compoundsMap.set(chains[j], `${this.count}`) - } - } - } + return hetnams } \ No newline at end of file diff --git a/src/mol-model-formats/structure/pdb/to-cif.ts b/src/mol-model-formats/structure/pdb/to-cif.ts index a7e1aaf826c1508c51685674205e8b4fdd2604f4..573be9a13229ea981d7c296c2194e23733c5a62f 100644 --- a/src/mol-model-formats/structure/pdb/to-cif.ts +++ b/src/mol-model-formats/structure/pdb/to-cif.ts @@ -13,10 +13,14 @@ import { PdbFile } from '../../../mol-io/reader/pdb/schema'; import { parseCryst1, parseRemark350, parseMtrix } from './assembly'; import { parseHelix, parseSheet } from './secondary-structure'; import { guessElementSymbolTokens } from '../util'; -import { parseCmpnd, EntityBuilder } from './entity'; - -type AtomSiteTemplate = typeof atom_site_template extends (...args: any) => infer T ? T : never -function atom_site_template(data: string, count: number) { +import { parseCmpnd, parseHetnam } from './entity'; +import { ComponentBuilder } from '../common/component'; +import { EntityBuilder } from '../common/entity'; +import { Column } from '../../../mol-data/db'; +import { getMoleculeType } from '../../../mol-model/structure/model/types'; + +type AtomSiteTemplate = typeof getAtomSiteTemplate extends (...args: any) => infer T ? T : never +function getAtomSiteTemplate(data: string, count: number) { const str = () => [] as string[]; const ts = () => TokenBuilder.create(data, 2 * count); return { @@ -41,7 +45,7 @@ function atom_site_template(data: string, count: number) { }; } -function _atom_site(sites: AtomSiteTemplate): { [K in keyof mmCIF_Schema['atom_site']]?: CifField } { +function getAomSite(sites: AtomSiteTemplate): { [K in keyof mmCIF_Schema['atom_site']]?: CifField } { const auth_asym_id = CifField.ofTokens(sites.auth_asym_id); const auth_atom_id = CifField.ofTokens(sites.auth_atom_id); const auth_comp_id = CifField.ofTokens(sites.auth_comp_id); @@ -75,7 +79,7 @@ function _atom_site(sites: AtomSiteTemplate): { [K in keyof mmCIF_Schema['atom_s }; } -function addAtom(sites: AtomSiteTemplate, entityBuilder: EntityBuilder, model: string, data: Tokenizer, s: number, e: number, isHet: boolean) { +function addAtom(sites: AtomSiteTemplate, model: string, data: Tokenizer, s: number, e: number) { const { data: str } = data; const length = e - s; @@ -103,11 +107,9 @@ function addAtom(sites: AtomSiteTemplate, entityBuilder: EntityBuilder, model: s // 18 - 20 Residue name Residue name. TokenBuilder.addToken(sites.auth_comp_id, Tokenizer.trim(data, s + 17, s + 20)); - const residueName = str.substring(data.tokenStart, data.tokenEnd); // 22 Character Chain identifier. TokenBuilder.add(sites.auth_asym_id, s + 21, s + 22); - const chainId = str.substring(s + 21, s + 22); // 23 - 26 Integer Residue sequence number. // TODO: support HEX @@ -155,7 +157,6 @@ function addAtom(sites: AtomSiteTemplate, entityBuilder: EntityBuilder, model: s guessElementSymbolTokens(sites.type_symbol, str, s + 12, s + 16) } - sites.label_entity_id[sites.index] = entityBuilder.getEntityId(residueName, chainId, isHet); sites.pdbx_PDB_model_num[sites.index] = model; sites.index++; @@ -180,9 +181,10 @@ export async function pdbToMmCif(pdb: PdbFile): Promise<CifFrame> { } } - const atom_site = atom_site_template(data, atomCount); + const atomSite = getAtomSiteTemplate(data, atomCount); const entityBuilder = new EntityBuilder(); const helperCategories: CifCategory[] = []; + const heteroNames: [string, string][] = []; let modelNum = 0, modelStr = ''; @@ -192,7 +194,7 @@ export async function pdbToMmCif(pdb: PdbFile): Promise<CifFrame> { case 'A': if (!substringStartsWith(data, s, e, 'ATOM ')) continue; if (!modelNum) { modelNum++; modelStr = '' + modelNum; } - addAtom(atom_site, entityBuilder, modelStr, tokenizer, s, e, false); + addAtom(atomSite, modelStr, tokenizer, s, e); break; case 'C': if (substringStartsWith(data, s, e, 'CRYST1')) { @@ -213,7 +215,7 @@ export async function pdbToMmCif(pdb: PdbFile): Promise<CifFrame> { case 'H': if (substringStartsWith(data, s, e, 'HETATM')) { if (!modelNum) { modelNum++; modelStr = '' + modelNum; } - addAtom(atom_site, entityBuilder, modelStr, tokenizer, s, e, true); + addAtom(atomSite, modelStr, tokenizer, s, e); } else if (substringStartsWith(data, s, e, 'HELIX')) { let j = i + 1; while (true) { @@ -223,8 +225,16 @@ export async function pdbToMmCif(pdb: PdbFile): Promise<CifFrame> { } helperCategories.push(parseHelix(lines, i, j)); i = j - 1; + } else if (substringStartsWith(data, s, e, 'HETNAM')) { + let j = i + 1; + while (true) { + s = indices[2 * j]; e = indices[2 * j + 1]; + if (!substringStartsWith(data, s, e, 'HETNAM')) break; + j++; + } + heteroNames.push(...Array.from(parseHetnam(lines, i, j).entries())) + i = j - 1; } - // TODO: HETNAM records => chem_comp (at least partially, needs to be completed with common bases and amino acids) break; case 'M': if (substringStartsWith(data, s, e, 'MODEL ')) { @@ -274,9 +284,24 @@ export async function pdbToMmCif(pdb: PdbFile): Promise<CifFrame> { } } + // build entity and chem_comp categories + const seqIds = Column.ofIntTokens(atomSite.auth_seq_id) + const atomIds = Column.ofStringTokens(atomSite.auth_atom_id) + const compIds = Column.ofStringTokens(atomSite.auth_comp_id) + const asymIds = Column.ofStringTokens(atomSite.auth_asym_id) + const componentBuilder = new ComponentBuilder(seqIds, atomIds) + componentBuilder.setNames(heteroNames) + entityBuilder.setNames(heteroNames) + for (let i = 0, il = compIds.rowCount; i < il; ++i) { + const compId = compIds.value(i) + const moleculeType = getMoleculeType(componentBuilder.add(compId, i).type, compId) + atomSite.label_entity_id[i] = entityBuilder.getEntityId(compId, moleculeType, asymIds.value(i)) + } + const categories = { entity: entityBuilder.getEntityCategory(), - atom_site: CifCategory.ofFields('atom_site', _atom_site(atom_site)) + chem_comp: componentBuilder.getChemCompCategory(), + atom_site: CifCategory.ofFields('atom_site', getAomSite(atomSite)) } as any; for (const c of helperCategories) {