Skip to content
Snippets Groups Projects
Commit e57a1985 authored by Alexander Rose's avatar Alexander Rose
Browse files

improved pdb reader, entity

parent 469dd05c
No related branches found
No related tags found
No related merge requests found
...@@ -73,6 +73,7 @@ const StandardComponents = (function() { ...@@ -73,6 +73,7 @@ const StandardComponents = (function() {
})() })()
export class ComponentBuilder { export class ComponentBuilder {
private namesMap = new Map<string, string>()
private comps = new Map<string, Component>() private comps = new Map<string, Component>()
private ids: string[] = [] private ids: string[] = []
private names: string[] = [] private names: string[] = []
...@@ -129,8 +130,8 @@ export class ComponentBuilder { ...@@ -129,8 +130,8 @@ export class ComponentBuilder {
} else if (WaterNames.has(compId)) { } else if (WaterNames.has(compId)) {
this.set({ id: compId, name: 'WATER', type: 'non-polymer' }) this.set({ id: compId, name: 'WATER', type: 'non-polymer' })
} else { } else {
const atomIds = this.getAtomIds(index) const type = this.getType(this.getAtomIds(index))
this.set({ id: compId, name: compId, type: this.getType(atomIds) }) this.set({ id: compId, name: this.namesMap.get(compId) || compId, type })
} }
} }
return this.get(compId)! return this.get(compId)!
...@@ -145,6 +146,10 @@ export class ComponentBuilder { ...@@ -145,6 +146,10 @@ export class ComponentBuilder {
return CifCategory.ofFields('chem_comp', chemComp) return CifCategory.ofFields('chem_comp', chemComp)
} }
setNames(names: [string, string][]) {
names.forEach(n => this.namesMap.set(n[0], n[1]))
}
constructor(private seqId: Column<number>, private atomId: Column<string>) { constructor(private seqId: Column<number>, private atomId: Column<string>) {
} }
......
/**
* Copyright (c) 2019 mol* contributors, licensed under MIT, See LICENSE file for more info.
*
* @author Alexander Rose <alexander.rose@weirdbyte.de>
*/
import { CifCategory, CifField } from '../../../mol-io/reader/cif';
import { MoleculeType, isPolymer } from '../../../mol-model/structure/model/types';
import { mmCIF_Schema } from '../../../mol-io/reader/cif/schema/mmcif';
/**
 * A polymer compound passed to `EntityBuilder.setCompounds`: each entry becomes
 * one 'polymer' entity whose description is `description`, and every chain id
 * listed in `chains` is mapped to that entity.
 */
export type EntityCompound = { chains: string[], description: string }
/**
 * Incrementally builds the rows of an mmCIF `entity` category.
 *
 * Entity ids are the 1-based running count of registered entities, rendered
 * as strings. Water shares a single entity, polymers get one entity per
 * compound (when registered via `setCompounds`) or per chain id otherwise,
 * and every other component gets one entity per component id.
 */
export class EntityBuilder {
    private count = 0
    private ids: string[] = []
    private types: string[] = []
    private descriptions: string[] = []

    private compoundsMap = new Map<string, string>()
    private namesMap = new Map<string, string>()
    private heteroMap = new Map<string, string>()
    private chainMap = new Map<string, string>()
    private waterId?: string

    /** Appends a new entity row and returns the id that was assigned to it. */
    private register(type: string, description: string): string {
        this.count += 1
        const entityId = `${this.count}`
        this.ids.push(entityId)
        this.types.push(type)
        this.descriptions.push(description)
        return entityId
    }

    /**
     * Returns the entity id for an atom/residue, creating the entity on first use.
     *
     * @param compId component id; keys non-polymer entities and is the fallback description
     * @param moleculeType decides between the water, polymer and non-polymer branches
     * @param chainId chain id; keys polymer entities
     */
    getEntityId(compId: string, moleculeType: MoleculeType, chainId: string): string {
        // all water shares one lazily created entity
        if (moleculeType === MoleculeType.water) {
            if (this.waterId === undefined) {
                this.waterId = this.register('water', 'Water')
            }
            return this.waterId
        }

        if (isPolymer(moleculeType)) {
            // chains registered through `setCompounds` take precedence
            const compoundEntityId = this.compoundsMap.get(chainId)
            if (compoundEntityId !== undefined) return compoundEntityId

            let chainEntityId = this.chainMap.get(chainId)
            if (chainEntityId === undefined) {
                // numbered by how many ad-hoc chain entities exist so far
                chainEntityId = this.register('polymer', `Polymer ${this.chainMap.size + 1}`)
                this.chainMap.set(chainId, chainEntityId)
            }
            return chainEntityId
        }

        let heteroEntityId = this.heteroMap.get(compId)
        if (heteroEntityId === undefined) {
            // a missing or empty registered name falls back to the component id
            const description = this.namesMap.get(compId) || compId
            heteroEntityId = this.register('non-polymer', description)
            this.heteroMap.set(compId, heteroEntityId)
        }
        return heteroEntityId
    }

    /** Packs the collected ids, types and descriptions into an `entity` category. */
    getEntityCategory() {
        const fields: CifCategory.SomeFields<mmCIF_Schema['entity']> = {
            id: CifField.ofStrings(this.ids),
            type: CifField.ofStrings(this.types),
            pdbx_description: CifField.ofStrings(this.descriptions),
        }
        return CifCategory.ofFields('entity', fields)
    }

    /** Registers one 'polymer' entity per compound and maps each of its chains to it. */
    setCompounds(compounds: EntityCompound[]) {
        for (const { chains, description } of compounds) {
            const entityId = this.register('polymer', description)
            for (const chainId of chains) {
                this.compoundsMap.set(chainId, entityId)
            }
        }
    }

    /** Stores `[id, name]` pairs used as descriptions for non-polymer entities. */
    setNames(names: [string, string][]) {
        for (const [id, name] of names) {
            this.namesMap.set(id, name)
        }
    }
}
\ No newline at end of file
...@@ -13,61 +13,13 @@ import { CifCategory, CifField } from '../../mol-io/reader/cif'; ...@@ -13,61 +13,13 @@ import { CifCategory, CifField } from '../../mol-io/reader/cif';
import { Column } from '../../mol-data/db'; import { Column } from '../../mol-data/db';
import { mmCIF_Schema } from '../../mol-io/reader/cif/schema/mmcif'; import { mmCIF_Schema } from '../../mol-io/reader/cif/schema/mmcif';
import { guessElementSymbolString } from './util'; import { guessElementSymbolString } from './util';
import { MoleculeType, getMoleculeType, isPolymer } from '../../mol-model/structure/model/types'; import { MoleculeType, getMoleculeType } from '../../mol-model/structure/model/types';
import { ComponentBuilder } from './common/component'; import { ComponentBuilder } from './common/component';
import { getChainId } from './common/util'; import { getChainId } from './common/util';
import { EntityBuilder } from './common/entity';
// TODO multi model files // TODO multi model files
class EntityBuilder {
private count = 0
private ids: string[] = []
private types: string[] = []
private descriptions: string[] = []
private heteroMap = new Map<string, string>()
private chainMap = new Map<string, string>()
private waterId?: string
private set(type: string, description: string) {
this.count += 1
this.ids.push(`${this.count}`)
this.types.push(type)
this.descriptions.push(description)
}
getEntityId(compId: string, moleculeType: MoleculeType, chainId: string): string {
if (moleculeType === MoleculeType.water) {
if (this.waterId === undefined) {
this.set('water', 'Water')
this.waterId = `${this.count}`
}
return this.waterId;
} else if (isPolymer(moleculeType)) {
if (!this.chainMap.has(chainId)) {
this.set('polymer', `Polymer ${this.chainMap.size + 1}`)
this.chainMap.set(chainId, `${this.count}`)
}
return this.chainMap.get(chainId)!
} else {
if (!this.heteroMap.has(compId)) {
this.set('non-polymer', compId)
this.heteroMap.set(compId, `${this.count}`)
}
return this.heteroMap.get(compId)!
}
}
getEntityCategory() {
const entity: CifCategory.SomeFields<mmCIF_Schema['entity']> = {
id: CifField.ofStrings(this.ids),
type: CifField.ofStrings(this.types),
pdbx_description: CifField.ofStrings(this.descriptions),
}
return CifCategory.ofFields('entity', entity)
}
}
function getCategories(atoms: GroAtoms) { function getCategories(atoms: GroAtoms) {
const auth_atom_id = CifField.ofColumn(atoms.atomName) const auth_atom_id = CifField.ofColumn(atoms.atomName)
const auth_comp_id = CifField.ofColumn(atoms.residueName) const auth_comp_id = CifField.ofColumn(atoms.residueName)
......
...@@ -5,8 +5,7 @@ ...@@ -5,8 +5,7 @@
*/ */
import { Tokens } from '../../../mol-io/reader/common/text/tokenizer'; import { Tokens } from '../../../mol-io/reader/common/text/tokenizer';
import { CifCategory, CifField } from '../../../mol-io/reader/cif'; import { EntityCompound } from '../common/entity';
import { WaterNames } from '../../../mol-model/structure/model/types';
const Spec = { const Spec = {
'MOL_ID': '', 'MOL_ID': '',
...@@ -21,14 +20,12 @@ const Spec = { ...@@ -21,14 +20,12 @@ const Spec = {
} }
type Spec = keyof typeof Spec type Spec = keyof typeof Spec
type Compound = { chains: string[], name: string }
export function parseCmpnd(lines: Tokens, lineStart: number, lineEnd: number) { export function parseCmpnd(lines: Tokens, lineStart: number, lineEnd: number) {
const getLine = (n: number) => lines.data.substring(lines.indices[2 * n], lines.indices[2 * n + 1]) const getLine = (n: number) => lines.data.substring(lines.indices[2 * n], lines.indices[2 * n + 1])
let currentSpec: Spec | undefined let currentSpec: Spec | undefined
let currentCompound: Compound = { chains: [], name: '' } let currentCompound: EntityCompound = { chains: [], description: '' }
const Compounds: Compound[] = [] const Compounds: EntityCompound[] = []
for (let i = lineStart; i < lineEnd; i++) { for (let i = lineStart; i < lineEnd; i++) {
let line = getLine(i) let line = getLine(i)
...@@ -56,12 +53,12 @@ export function parseCmpnd(lines: Tokens, lineStart: number, lineEnd: number) { ...@@ -56,12 +53,12 @@ export function parseCmpnd(lines: Tokens, lineStart: number, lineEnd: number) {
if (currentSpec === 'MOL_ID') { if (currentSpec === 'MOL_ID') {
currentCompound = { currentCompound = {
chains: [], chains: [],
name: '' description: ''
} }
Compounds.push(currentCompound) Compounds.push(currentCompound)
} else if (currentSpec === 'MOLECULE') { } else if (currentSpec === 'MOLECULE') {
if (currentCompound.name) currentCompound.name += ' ' if (currentCompound.description) currentCompound.description += ' '
currentCompound.name += value currentCompound.description += value
} else if (currentSpec === 'CHAIN') { } else if (currentSpec === 'CHAIN') {
Array.prototype.push.apply(currentCompound.chains, value.split(/\s*,\s*/)) Array.prototype.push.apply(currentCompound.chains, value.split(/\s*,\s*/))
} }
...@@ -70,66 +67,29 @@ export function parseCmpnd(lines: Tokens, lineStart: number, lineEnd: number) { ...@@ -70,66 +67,29 @@ export function parseCmpnd(lines: Tokens, lineStart: number, lineEnd: number) {
return Compounds return Compounds
} }
export class EntityBuilder { export function parseHetnam(lines: Tokens, lineStart: number, lineEnd: number) {
private count = 0 const getLine = (n: number) => lines.data.substring(lines.indices[2 * n], lines.indices[2 * n + 1])
private ids: string[] = []
private types: string[] = []
private descriptions: string[] = []
private compoundsMap = new Map<string, string>()
private heteroMap = new Map<string, string>()
private chainMap = new Map<string, string>()
private waterId?: string
private set(type: string, description: string) {
this.count += 1
this.ids.push(`${this.count}`)
this.types.push(type)
this.descriptions.push(description)
}
getEntityId(residueName: string, chainId: string, isHet: boolean): string { const hetnams = new Map<string, string>()
if (isHet) {
if (WaterNames.has(residueName)) {
if (this.waterId === undefined) {
this.set('water', 'Water')
this.waterId = `${this.count}`
}
return this.waterId;
} else {
if (!this.heteroMap.has(residueName)) {
this.set('non-polymer', residueName)
this.heteroMap.set(residueName, `${this.count}`)
}
return this.heteroMap.get(residueName)!
}
} else if (this.compoundsMap.has(chainId)) {
return this.compoundsMap.get(chainId)!
} else {
if (!this.chainMap.has(chainId)) {
this.set('polymer', chainId)
this.chainMap.set(chainId, `${this.count}`)
}
return this.chainMap.get(chainId)!
}
}
getEntityCategory() { for (let i = lineStart; i < lineEnd; i++) {
const entity = { let line = getLine(i)
id: CifField.ofStrings(this.ids), // COLUMNS DATA TYPE FIELD DEFINITION
type: CifField.ofStrings(this.types), // ----------------------------------------------------------------------------
pdbx_description: CifField.ofStrings(this.descriptions) // 1 - 6 Record name "HETNAM"
} // 9 - 10 Continuation continuation Allows concatenation of multiple records.
return CifCategory.ofFields('entity', entity) // 12 - 14 LString(3) hetID Het identifier, right-justified.
} // 16 - 70 String text Chemical name.
setCompounds(compounds: Compound[]) { const het = line.substr(11, 3).trim()
for (let i = 0, il = compounds.length; i < il; ++i) { const name = line.substr(15).trim()
const { chains, name } = compounds[i]
this.set('polymer', name) if (hetnams.has(het)) {
for (let j = 0, jl = chains.length; j < jl; ++j) { hetnams.set(het, `${hetnams.get(het)!} ${name}`)
this.compoundsMap.set(chains[j], `${this.count}`) } else {
} hetnams.set(het, name)
} }
} }
return hetnams
} }
\ No newline at end of file
...@@ -13,10 +13,14 @@ import { PdbFile } from '../../../mol-io/reader/pdb/schema'; ...@@ -13,10 +13,14 @@ import { PdbFile } from '../../../mol-io/reader/pdb/schema';
import { parseCryst1, parseRemark350, parseMtrix } from './assembly'; import { parseCryst1, parseRemark350, parseMtrix } from './assembly';
import { parseHelix, parseSheet } from './secondary-structure'; import { parseHelix, parseSheet } from './secondary-structure';
import { guessElementSymbolTokens } from '../util'; import { guessElementSymbolTokens } from '../util';
import { parseCmpnd, EntityBuilder } from './entity'; import { parseCmpnd, parseHetnam } from './entity';
import { ComponentBuilder } from '../common/component';
type AtomSiteTemplate = typeof atom_site_template extends (...args: any) => infer T ? T : never import { EntityBuilder } from '../common/entity';
function atom_site_template(data: string, count: number) { import { Column } from '../../../mol-data/db';
import { getMoleculeType } from '../../../mol-model/structure/model/types';
type AtomSiteTemplate = typeof getAtomSiteTemplate extends (...args: any) => infer T ? T : never
function getAtomSiteTemplate(data: string, count: number) {
const str = () => [] as string[]; const str = () => [] as string[];
const ts = () => TokenBuilder.create(data, 2 * count); const ts = () => TokenBuilder.create(data, 2 * count);
return { return {
...@@ -41,7 +45,7 @@ function atom_site_template(data: string, count: number) { ...@@ -41,7 +45,7 @@ function atom_site_template(data: string, count: number) {
}; };
} }
function _atom_site(sites: AtomSiteTemplate): { [K in keyof mmCIF_Schema['atom_site']]?: CifField } { function getAomSite(sites: AtomSiteTemplate): { [K in keyof mmCIF_Schema['atom_site']]?: CifField } {
const auth_asym_id = CifField.ofTokens(sites.auth_asym_id); const auth_asym_id = CifField.ofTokens(sites.auth_asym_id);
const auth_atom_id = CifField.ofTokens(sites.auth_atom_id); const auth_atom_id = CifField.ofTokens(sites.auth_atom_id);
const auth_comp_id = CifField.ofTokens(sites.auth_comp_id); const auth_comp_id = CifField.ofTokens(sites.auth_comp_id);
...@@ -75,7 +79,7 @@ function _atom_site(sites: AtomSiteTemplate): { [K in keyof mmCIF_Schema['atom_s ...@@ -75,7 +79,7 @@ function _atom_site(sites: AtomSiteTemplate): { [K in keyof mmCIF_Schema['atom_s
}; };
} }
function addAtom(sites: AtomSiteTemplate, entityBuilder: EntityBuilder, model: string, data: Tokenizer, s: number, e: number, isHet: boolean) { function addAtom(sites: AtomSiteTemplate, model: string, data: Tokenizer, s: number, e: number) {
const { data: str } = data; const { data: str } = data;
const length = e - s; const length = e - s;
...@@ -103,11 +107,9 @@ function addAtom(sites: AtomSiteTemplate, entityBuilder: EntityBuilder, model: s ...@@ -103,11 +107,9 @@ function addAtom(sites: AtomSiteTemplate, entityBuilder: EntityBuilder, model: s
// 18 - 20 Residue name Residue name. // 18 - 20 Residue name Residue name.
TokenBuilder.addToken(sites.auth_comp_id, Tokenizer.trim(data, s + 17, s + 20)); TokenBuilder.addToken(sites.auth_comp_id, Tokenizer.trim(data, s + 17, s + 20));
const residueName = str.substring(data.tokenStart, data.tokenEnd);
// 22 Character Chain identifier. // 22 Character Chain identifier.
TokenBuilder.add(sites.auth_asym_id, s + 21, s + 22); TokenBuilder.add(sites.auth_asym_id, s + 21, s + 22);
const chainId = str.substring(s + 21, s + 22);
// 23 - 26 Integer Residue sequence number. // 23 - 26 Integer Residue sequence number.
// TODO: support HEX // TODO: support HEX
...@@ -155,7 +157,6 @@ function addAtom(sites: AtomSiteTemplate, entityBuilder: EntityBuilder, model: s ...@@ -155,7 +157,6 @@ function addAtom(sites: AtomSiteTemplate, entityBuilder: EntityBuilder, model: s
guessElementSymbolTokens(sites.type_symbol, str, s + 12, s + 16) guessElementSymbolTokens(sites.type_symbol, str, s + 12, s + 16)
} }
sites.label_entity_id[sites.index] = entityBuilder.getEntityId(residueName, chainId, isHet);
sites.pdbx_PDB_model_num[sites.index] = model; sites.pdbx_PDB_model_num[sites.index] = model;
sites.index++; sites.index++;
...@@ -180,9 +181,10 @@ export async function pdbToMmCif(pdb: PdbFile): Promise<CifFrame> { ...@@ -180,9 +181,10 @@ export async function pdbToMmCif(pdb: PdbFile): Promise<CifFrame> {
} }
} }
const atom_site = atom_site_template(data, atomCount); const atomSite = getAtomSiteTemplate(data, atomCount);
const entityBuilder = new EntityBuilder(); const entityBuilder = new EntityBuilder();
const helperCategories: CifCategory[] = []; const helperCategories: CifCategory[] = [];
const heteroNames: [string, string][] = [];
let modelNum = 0, modelStr = ''; let modelNum = 0, modelStr = '';
...@@ -192,7 +194,7 @@ export async function pdbToMmCif(pdb: PdbFile): Promise<CifFrame> { ...@@ -192,7 +194,7 @@ export async function pdbToMmCif(pdb: PdbFile): Promise<CifFrame> {
case 'A': case 'A':
if (!substringStartsWith(data, s, e, 'ATOM ')) continue; if (!substringStartsWith(data, s, e, 'ATOM ')) continue;
if (!modelNum) { modelNum++; modelStr = '' + modelNum; } if (!modelNum) { modelNum++; modelStr = '' + modelNum; }
addAtom(atom_site, entityBuilder, modelStr, tokenizer, s, e, false); addAtom(atomSite, modelStr, tokenizer, s, e);
break; break;
case 'C': case 'C':
if (substringStartsWith(data, s, e, 'CRYST1')) { if (substringStartsWith(data, s, e, 'CRYST1')) {
...@@ -213,7 +215,7 @@ export async function pdbToMmCif(pdb: PdbFile): Promise<CifFrame> { ...@@ -213,7 +215,7 @@ export async function pdbToMmCif(pdb: PdbFile): Promise<CifFrame> {
case 'H': case 'H':
if (substringStartsWith(data, s, e, 'HETATM')) { if (substringStartsWith(data, s, e, 'HETATM')) {
if (!modelNum) { modelNum++; modelStr = '' + modelNum; } if (!modelNum) { modelNum++; modelStr = '' + modelNum; }
addAtom(atom_site, entityBuilder, modelStr, tokenizer, s, e, true); addAtom(atomSite, modelStr, tokenizer, s, e);
} else if (substringStartsWith(data, s, e, 'HELIX')) { } else if (substringStartsWith(data, s, e, 'HELIX')) {
let j = i + 1; let j = i + 1;
while (true) { while (true) {
...@@ -223,8 +225,16 @@ export async function pdbToMmCif(pdb: PdbFile): Promise<CifFrame> { ...@@ -223,8 +225,16 @@ export async function pdbToMmCif(pdb: PdbFile): Promise<CifFrame> {
} }
helperCategories.push(parseHelix(lines, i, j)); helperCategories.push(parseHelix(lines, i, j));
i = j - 1; i = j - 1;
} else if (substringStartsWith(data, s, e, 'HETNAM')) {
let j = i + 1;
while (true) {
s = indices[2 * j]; e = indices[2 * j + 1];
if (!substringStartsWith(data, s, e, 'HETNAM')) break;
j++;
}
heteroNames.push(...Array.from(parseHetnam(lines, i, j).entries()))
i = j - 1;
} }
// TODO: HETNAM records => chem_comp (at least partially, needs to be completed with common bases and amino acids)
break; break;
case 'M': case 'M':
if (substringStartsWith(data, s, e, 'MODEL ')) { if (substringStartsWith(data, s, e, 'MODEL ')) {
...@@ -274,9 +284,24 @@ export async function pdbToMmCif(pdb: PdbFile): Promise<CifFrame> { ...@@ -274,9 +284,24 @@ export async function pdbToMmCif(pdb: PdbFile): Promise<CifFrame> {
} }
} }
// build entity and chem_comp categories
const seqIds = Column.ofIntTokens(atomSite.auth_seq_id)
const atomIds = Column.ofStringTokens(atomSite.auth_atom_id)
const compIds = Column.ofStringTokens(atomSite.auth_comp_id)
const asymIds = Column.ofStringTokens(atomSite.auth_asym_id)
const componentBuilder = new ComponentBuilder(seqIds, atomIds)
componentBuilder.setNames(heteroNames)
entityBuilder.setNames(heteroNames)
for (let i = 0, il = compIds.rowCount; i < il; ++i) {
const compId = compIds.value(i)
const moleculeType = getMoleculeType(componentBuilder.add(compId, i).type, compId)
atomSite.label_entity_id[i] = entityBuilder.getEntityId(compId, moleculeType, asymIds.value(i))
}
const categories = { const categories = {
entity: entityBuilder.getEntityCategory(), entity: entityBuilder.getEntityCategory(),
atom_site: CifCategory.ofFields('atom_site', _atom_site(atom_site)) chem_comp: componentBuilder.getChemCompCategory(),
atom_site: CifCategory.ofFields('atom_site', getAomSite(atomSite))
} as any; } as any;
for (const c of helperCategories) { for (const c of helperCategories) {
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment