Skip to content
Snippets Groups Projects
Commit 55d99096 authored by Alexander Rose's avatar Alexander Rose
Browse files

better entity handling for pdb files

parent 53e0a365
No related branches found
No related tags found
No related merge requests found
/**
* Copyright (c) 2019 mol* contributors, licensed under MIT, See LICENSE file for more info.
*
* @author Alexander Rose <alexander.rose@weirdbyte.de>
*/
import { Tokens } from '../../../mol-io/reader/common/text/tokenizer';
import { CifCategory, CifField } from '../../../mol-io/reader/cif';
import { WaterNames } from '../../../mol-model/structure/model/types';
/**
 * Specification tokens that may appear in a COMPND record. Only the keys are
 * used (as a lookup set and to derive the `Spec` type); the values are unused.
 */
const Spec = {
    'MOL_ID': '',
    'MOLECULE': '',
    'CHAIN': '',
    'FRAGMENT': '',
    'SYNONYM': '',
    'EC': '',
    'ENGINEERED': '',
    'MUTATION': '',
    'OTHER_DETAILS': ''
}
type Spec = keyof typeof Spec

/** A compound described by COMPND records: its molecule name and the chains it covers. */
type Compound = { chains: string[], name: string }

/**
 * Parses a run of COMPND lines into compounds.
 *
 * A new compound is started by every `MOL_ID` token; `MOLECULE` values
 * (including their continuation lines) are joined with single spaces into the
 * compound name and `CHAIN` values are split on commas into chain ids.
 * All other tokens are recognized but ignored.
 *
 * @param lines tokenized lines; line `n` spans `indices[2 * n]` to `indices[2 * n + 1]` of `data`
 * @param lineStart index of the first COMPND line (inclusive)
 * @param lineEnd index after the last COMPND line (exclusive)
 * @returns the compounds in the order their `MOL_ID` tokens were encountered
 */
export function parseCmpnd(lines: Tokens, lineStart: number, lineEnd: number) {
    const getLine = (n: number) => lines.data.substring(lines.indices[2 * n], lines.indices[2 * n + 1])

    let currentSpec: Spec | undefined
    // Values seen before the first MOL_ID land in this placeholder, which is never returned
    let currentCompound: Compound = { chains: [], name: '' }
    const Compounds: Compound[] = []

    for (let i = lineStart; i < lineEnd; i++) {
        const line = getLine(i)

        // COLUMNS       DATA TYPE       FIELD         DEFINITION
        // ----------------------------------------------------------------------------------
        //  1 -  6       Record name     "COMPND"
        //  8 - 10       Continuation    continuation  Allows concatenation of multiple records.
        // 11 - 80       Specification   compound      Description of the molecular components.
        //               list

        const cmpnd = line.substring(10, 80).trim()
        const cmpndSpecEnd = cmpnd.indexOf(':')
        // Without a colon `cmpndSpecEnd` is -1 and this yields '', which is never a spec token
        const cmpndSpec = cmpnd.substring(0, cmpndSpecEnd)

        let value: string
        // Own-property check so inherited names like 'constructor' are not taken for spec tokens
        if (Object.prototype.hasOwnProperty.call(Spec, cmpndSpec)) {
            currentSpec = cmpndSpec as Spec
            // Do not assume exactly one space after the colon
            value = cmpnd.substring(cmpndSpecEnd + 1).trim()
        } else {
            // Continuation of the value of the previously seen token
            value = cmpnd
        }
        value = value.replace(/;$/, '')

        if (currentSpec === 'MOL_ID') {
            currentCompound = {
                chains: [],
                name: ''
            }
            Compounds.push(currentCompound)
        } else if (currentSpec === 'MOLECULE') {
            if (currentCompound.name) currentCompound.name += ' '
            currentCompound.name += value
        } else if (currentSpec === 'CHAIN') {
            // A chain list wrapped over several lines ends with a trailing comma;
            // drop the resulting empty entries instead of registering an empty chain id
            const chainIds = value.split(',').map(c => c.trim()).filter(c => c.length > 0)
            currentCompound.chains.push(...chainIds)
        }
    }

    return Compounds
}
/**
 * Hands out entity ids for atoms and records one entity row (id, type,
 * description) per distinct entity. Ids are consecutive integers starting at 1,
 * formatted as strings, in the order the entities were first registered.
 */
export class EntityBuilder {
    private count = 0
    private ids: string[] = []
    private types: string[] = []
    private descriptions: string[] = []

    /** chain id => entity id, for chains listed in compounds passed to `setCompounds` */
    private compoundsMap = new Map<string, string>()
    /** residue name => entity id, for non-water hetero residues */
    private heteroMap = new Map<string, string>()
    /** chain id => entity id, for non-hetero chains not covered by any compound */
    private chainMap = new Map<string, string>()
    /** entity id shared by all water residues, assigned on first use */
    private waterId?: string

    /** Appends a new entity row; afterwards `this.count` is the id of that row. */
    private add(type: string, description: string) {
        this.count += 1
        const id = `${this.count}`
        this.ids.push(id)
        this.types.push(type)
        this.descriptions.push(description)
    }

    /**
     * Returns the entity id for an atom, registering a new entity when needed.
     *
     * - hetero + water residue name: the single shared 'water' entity
     * - hetero otherwise: one 'non-polymer' entity per residue name
     * - non-hetero: the compound entity of the chain if one was set,
     *   otherwise one 'polymer' entity per chain id
     */
    getEntityId(residueName: string, chainId: string, isHet: boolean): string {
        if (!isHet) {
            const compoundEntityId = this.compoundsMap.get(chainId)
            if (compoundEntityId !== undefined) return compoundEntityId

            let chainEntityId = this.chainMap.get(chainId)
            if (chainEntityId === undefined) {
                this.add('polymer', chainId)
                chainEntityId = `${this.count}`
                this.chainMap.set(chainId, chainEntityId)
            }
            return chainEntityId
        }

        if (WaterNames.has(residueName)) {
            if (this.waterId === undefined) {
                this.add('water', 'Water')
                this.waterId = `${this.count}`
            }
            return this.waterId
        }

        let heteroEntityId = this.heteroMap.get(residueName)
        if (heteroEntityId === undefined) {
            this.add('non-polymer', residueName)
            heteroEntityId = `${this.count}`
            this.heteroMap.set(residueName, heteroEntityId)
        }
        return heteroEntityId
    }

    /** Builds the 'entity' category from all entities registered so far. */
    getEntityCategory() {
        return CifCategory.ofFields('entity', {
            id: CifField.ofStrings(this.ids),
            type: CifField.ofStrings(this.types),
            pdbx_description: CifField.ofStrings(this.descriptions)
        })
    }

    /**
     * Registers one 'polymer' entity per compound, described by the compound
     * name, and maps each of the compound's chains to it. If a chain is listed
     * more than once, the last compound listing it wins.
     */
    setCompounds(compounds: Compound[]) {
        for (const { chains, name } of compounds) {
            this.add('polymer', name)
            const entityId = `${this.count}`
            for (const chainId of chains) {
                this.compoundsMap.set(chainId, entityId)
            }
        }
    }
}
\ No newline at end of file
......@@ -11,16 +11,9 @@ import { mmCIF_Schema } from '../../../mol-io/reader/cif/schema/mmcif';
import { TokenBuilder, Tokenizer } from '../../../mol-io/reader/common/text/tokenizer';
import { PdbFile } from '../../../mol-io/reader/pdb/schema';
import { parseCryst1, parseRemark350, parseMtrix } from './assembly';
import { WaterNames } from '../../../mol-model/structure/model/types';
import { parseHelix, parseSheet } from './secondary-structure';
import { guessElementSymbolTokens } from '../util';
/**
 * Builds the fields of a fixed three-row entity table: ids '1', '2' and '3'
 * typed as 'polymer', 'non-polymer' and 'water' respectively.
 */
function _entity(): { [K in keyof mmCIF_Schema['entity']]?: CifField } {
    const entityIds = ['1', '2', '3'];
    const entityTypes = ['polymer', 'non-polymer', 'water'];
    const id = CifField.ofStrings(entityIds);
    const type = CifField.ofStrings(entityTypes);
    return { id, type };
}
import { parseCmpnd, EntityBuilder } from './entity';
type AtomSiteTemplate = typeof atom_site_template extends (...args: any) => infer T ? T : never
function atom_site_template(data: string, count: number) {
......@@ -82,15 +75,7 @@ function _atom_site(sites: AtomSiteTemplate): { [K in keyof mmCIF_Schema['atom_s
};
}
/**
 * Picks one of three fixed entity ids for an atom: '1' for non-hetero atoms,
 * '3' for hetero atoms whose residue name is a water name, '2' for all other
 * hetero atoms.
 */
function getEntityId(residueName: string, isHet: boolean) {
    if (!isHet) return '1';
    return WaterNames.has(residueName) ? '3' : '2';
}
function addAtom(sites: AtomSiteTemplate, model: string, data: Tokenizer, s: number, e: number, isHet: boolean) {
function addAtom(sites: AtomSiteTemplate, entityBuilder: EntityBuilder, model: string, data: Tokenizer, s: number, e: number, isHet: boolean) {
const { data: str } = data;
const length = e - s;
......@@ -122,6 +107,7 @@ function addAtom(sites: AtomSiteTemplate, model: string, data: Tokenizer, s: num
// 22 Character Chain identifier.
TokenBuilder.add(sites.auth_asym_id, s + 21, s + 22);
const chainId = str.substring(s + 21, s + 22);
// 23 - 26 Integer Residue sequence number.
// TODO: support HEX
......@@ -169,7 +155,7 @@ function addAtom(sites: AtomSiteTemplate, model: string, data: Tokenizer, s: num
guessElementSymbolTokens(sites.type_symbol, str, s + 12, s + 16)
}
sites.label_entity_id[sites.index] = getEntityId(residueName, isHet);
sites.label_entity_id[sites.index] = entityBuilder.getEntityId(residueName, chainId, isHet);
sites.pdbx_PDB_model_num[sites.index] = model;
sites.index++;
......@@ -195,7 +181,7 @@ export async function pdbToMmCif(pdb: PdbFile): Promise<CifFrame> {
}
const atom_site = atom_site_template(data, atomCount);
const entityBuilder = new EntityBuilder();
const helperCategories: CifCategory[] = [];
let modelNum = 0, modelStr = '';
......@@ -206,19 +192,28 @@ export async function pdbToMmCif(pdb: PdbFile): Promise<CifFrame> {
case 'A':
if (!substringStartsWith(data, s, e, 'ATOM ')) continue;
if (!modelNum) { modelNum++; modelStr = '' + modelNum; }
addAtom(atom_site, modelStr, tokenizer, s, e, false);
addAtom(atom_site, entityBuilder, modelStr, tokenizer, s, e, false);
break;
case 'C':
if (substringStartsWith(data, s, e, 'CRYST1')) {
helperCategories.push(...parseCryst1(pdb.id || '?', data.substring(s, e)));
} else if (substringStartsWith(data, s, e, 'CONNECT')) {
// TODO: CONNECT records => struct_conn
} else if (substringStartsWith(data, s, e, 'COMPND')) {
let j = i + 1;
while (true) {
s = indices[2 * j]; e = indices[2 * j + 1];
if (!substringStartsWith(data, s, e, 'COMPND')) break;
j++;
}
entityBuilder.setCompounds(parseCmpnd(lines, i, j))
i = j - 1;
}
// TODO CONNECT records => struct_conn
// TODO COMPND records => entity
break;
case 'H':
if (substringStartsWith(data, s, e, 'HETATM')) {
if (!modelNum) { modelNum++; modelStr = '' + modelNum; }
addAtom(atom_site, modelStr, tokenizer, s, e, true);
addAtom(atom_site, entityBuilder, modelStr, tokenizer, s, e, true);
} else if (substringStartsWith(data, s, e, 'HELIX')) {
let j = i + 1;
while (true) {
......@@ -229,7 +224,7 @@ export async function pdbToMmCif(pdb: PdbFile): Promise<CifFrame> {
helperCategories.push(parseHelix(lines, i, j));
i = j - 1;
}
// TODO HETNAM records => chem_comp (at least partially, needs to be completed with common bases and amino acids)
// TODO: HETNAM records => chem_comp (at least partially, needs to be completed with common bases and amino acids)
break;
case 'M':
if (substringStartsWith(data, s, e, 'MODEL ')) {
......@@ -246,10 +241,10 @@ export async function pdbToMmCif(pdb: PdbFile): Promise<CifFrame> {
helperCategories.push(...parseMtrix(lines, i, j));
i = j - 1;
}
// TODO MODRES records => pdbx_struct_mod_residue
// TODO: MODRES records => pdbx_struct_mod_residue
break;
case 'O':
// TODO ORIGX record => cif.database_PDB_matrix.origx, cif.database_PDB_matrix.origx_vector
// TODO: ORIGX record => cif.database_PDB_matrix.origx, cif.database_PDB_matrix.origx_vector
break;
case 'R':
if (substringStartsWith(data, s, e, 'REMARK 350')) {
......@@ -274,13 +269,13 @@ export async function pdbToMmCif(pdb: PdbFile): Promise<CifFrame> {
helperCategories.push(parseSheet(lines, i, j));
i = j - 1;
}
// TODO SCALE record => cif.atom_sites.fract_transf_matrix, cif.atom_sites.fract_transf_vector
// TODO: SCALE record => cif.atom_sites.fract_transf_matrix, cif.atom_sites.fract_transf_vector
break;
}
}
const categories = {
entity: CifCategory.ofFields('entity', _entity()),
entity: entityBuilder.getEntityCategory(),
atom_site: CifCategory.ofFields('atom_site', _atom_site(atom_site))
} as any;
......
......@@ -39,10 +39,11 @@ export class PolymerSequenceWrapper extends SequenceWrapper<StructureUnit> {
if (StructureElement.isLoci(loci)) {
if (!Structure.areParentsEqual(loci.structure, structure)) return false
const { offset } = this.sequence
for (const e of loci.elements) {
if (e.unit.id === unit.id) {
OrderedSet.forEach(e.indices, v => {
if (apply(getSeqIndices(e.unit, e.unit.elements[v]))) changed = true
if (apply(getSeqIndices(e.unit, e.unit.elements[v], offset))) changed = true
})
}
}
......@@ -99,22 +100,22 @@ function createResidueQuery(unitId: number, label_seq_id: number) {
});
}
function getSeqIndices(unit: Unit, element: ElementIndex): Interval {
function getSeqIndices(unit: Unit, element: ElementIndex, offset: number): Interval {
const { model } = unit
switch (unit.kind) {
case Unit.Kind.Atomic:
const residueIndex = model.atomicHierarchy.residueAtomSegments.index[element]
const seqId = model.atomicHierarchy.residues.label_seq_id.value(residueIndex)
return Interval.ofSingleton(seqId - 1)
return Interval.ofSingleton(seqId - 1 - offset)
case Unit.Kind.Spheres:
return Interval.ofRange(
model.coarseHierarchy.spheres.seq_id_begin.value(element) - 1,
model.coarseHierarchy.spheres.seq_id_end.value(element) - 1
model.coarseHierarchy.spheres.seq_id_begin.value(element) - 1 - offset,
model.coarseHierarchy.spheres.seq_id_end.value(element) - 1 - offset
)
case Unit.Kind.Gaussians:
return Interval.ofRange(
model.coarseHierarchy.gaussians.seq_id_begin.value(element) - 1,
model.coarseHierarchy.gaussians.seq_id_end.value(element) - 1
model.coarseHierarchy.gaussians.seq_id_begin.value(element) - 1 - offset,
model.coarseHierarchy.gaussians.seq_id_end.value(element) - 1 - offset
)
}
}
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment