better entity handling for pdb files

55d99096 · Alexander Rose · 53e0a365 · 55d99096 · 55d99096 · 55d99096
Commit 55d99096 authored 5 years ago by Alexander Rose
--- a/src/mol-model-formats/structure/pdb/entity.ts
+++ b/src/mol-model-formats/structure/pdb/entity.ts
+/**
+ * Copyright (c) 2019 mol* contributors, licensed under MIT, See LICENSE file for more info.
+ *
+ * @author Alexander Rose <alexander.rose@weirdbyte.de>
+ */
+
+import { Tokens } from '../../../mol-io/reader/common/text/tokenizer';
+import { CifCategory, CifField } from '../../../mol-io/reader/cif';
+import { WaterNames } from '../../../mol-model/structure/model/types';
+
+/**
+ * Specification tokens recognized in COMPND records (`TOKEN: value;`).
+ * In this file only the keys are consulted (membership test and key type);
+ * the empty-string values are placeholders and are never read.
+ */
+const Spec = {
+    'MOL_ID': '',
+    'MOLECULE': '',
+    'CHAIN': '',
+    'FRAGMENT': '',
+    'SYNONYM': '',
+    'EC': '',
+    'ENGINEERED': '',
+    'MUTATION': '',
+    'OTHER_DETAILS': ''
+}
+/** Union of the recognized COMPND specification token names (the keys of `Spec`). */
+type Spec = keyof typeof Spec
+
+/** A compound collected from COMPND records: its MOLECULE name and the chain ids listed under CHAIN. */
+type Compound = { chains: string[], name: string }
+
+/**
+ * Parse a run of PDB COMPND records into a list of compounds.
+ *
+ * A new compound is started by every line that declares a `MOL_ID` token; the
+ * `MOLECULE` and `CHAIN` tokens (including their continuation lines) that follow
+ * fill in its name and chain ids. All other tokens are recognized but ignored.
+ *
+ * @param lines tokenized lines; line `n` is `data.substring(indices[2 * n], indices[2 * n + 1])`
+ * @param lineStart index of the first COMPND line (inclusive)
+ * @param lineEnd index one past the last COMPND line (exclusive)
+ * @returns the compounds in the order their `MOL_ID` tokens appear
+ */
+export function parseCmpnd(lines: Tokens, lineStart: number, lineEnd: number) {
+    const getLine = (n: number) => lines.data.substring(lines.indices[2 * n], lines.indices[2 * n + 1])
+
+    let currentSpec: Spec | undefined
+    let currentCompound: Compound = { chains: [], name: '' }
+    const Compounds: Compound[] = []
+
+    for (let i = lineStart; i < lineEnd; i++) {
+        const line = getLine(i)
+        // COLUMNS       DATA TYPE       FIELD         DEFINITION
+        // ----------------------------------------------------------------------------------
+        //  1 -  6       Record name     "COMPND"
+        //  8 - 10       Continuation    continuation  Allows concatenation of multiple records.
+        // 11 - 80       Specification   compound      Description of the molecular components.
+        //               list
+
+        // columns 11 - 80 are the zero-based range [10, 80)
+        const cmpnd = line.substring(10, 80).trim()
+        const cmpndSpecEnd = cmpnd.indexOf(':')
+        const cmpndSpec = cmpnd.substring(0, cmpndSpecEnd)
+
+        let value: string
+        // true only when this very line declares a token (as opposed to continuing one)
+        let startsSpec = false
+
+        // own-property check so inherited names (e.g. 'constructor') are never taken for tokens
+        if (cmpndSpecEnd !== -1 && Object.prototype.hasOwnProperty.call(Spec, cmpndSpec)) {
+            currentSpec = cmpndSpec as Spec
+            startsSpec = true
+            // do not assume exactly one blank after the colon
+            value = cmpnd.substring(cmpndSpecEnd + 1).trim()
+        } else {
+            value = cmpnd
+        }
+        value = value.replace(/;$/, '').trim()
+
+        if (currentSpec === 'MOL_ID') {
+            // only an explicit MOL_ID token opens a compound; a stray line without a
+            // recognized token that follows it must not create another one
+            if (startsSpec) {
+                currentCompound = {
+                    chains: [],
+                    name: ''
+                }
+                Compounds.push(currentCompound)
+            }
+        } else if (currentSpec === 'MOLECULE') {
+            if (value) {
+                if (currentCompound.name) currentCompound.name += ' '
+                currentCompound.name += value
+            }
+        } else if (currentSpec === 'CHAIN') {
+            // a chain list wrapped onto the next record ends with a trailing comma,
+            // which would otherwise yield an empty chain id
+            for (const chain of value.split(/\s*,\s*/)) {
+                if (chain) currentCompound.chains.push(chain)
+            }
+        }
+    }
+
+    return Compounds
+}
+
+/**
+ * Collects the entities referenced while converting PDB atoms and exposes them
+ * as an `entity` category. Entity ids are consecutive integers (as strings)
+ * starting at '1', handed out in the order entities are first registered.
+ */
+export class EntityBuilder {
+    private count = 0
+    private ids: string[] = []
+    private types: string[] = []
+    private descriptions: string[] = []
+
+    /** chain id => entity id, for chains listed in compounds passed to `setCompounds` */
+    private compoundsMap = new Map<string, string>()
+    /** residue name => entity id, for non-water hetero residues */
+    private heteroMap = new Map<string, string>()
+    /** chain id => entity id, for polymer chains not covered by any compound */
+    private chainMap = new Map<string, string>()
+    private waterId?: string
+
+    /** Register a new entity row and return the id assigned to it. */
+    private add(type: string, description: string): string {
+        this.count += 1
+        const id = `${this.count}`
+        this.ids.push(id)
+        this.types.push(type)
+        this.descriptions.push(description)
+        return id
+    }
+
+    /**
+     * Get the entity id for an atom, registering a new entity on first sight.
+     *
+     * Non-hetero atoms map to the compound entity of their chain if one was set,
+     * otherwise to a per-chain 'polymer' entity. Hetero atoms map to the single
+     * 'water' entity when the residue name is in `WaterNames`, otherwise to a
+     * 'non-polymer' entity keyed by residue name.
+     */
+    getEntityId(residueName: string, chainId: string, isHet: boolean): string {
+        if (!isHet) {
+            const compoundId = this.compoundsMap.get(chainId)
+            if (compoundId !== undefined) return compoundId
+
+            let polymerId = this.chainMap.get(chainId)
+            if (polymerId === undefined) {
+                polymerId = this.add('polymer', chainId)
+                this.chainMap.set(chainId, polymerId)
+            }
+            return polymerId
+        }
+
+        if (WaterNames.has(residueName)) {
+            if (this.waterId === undefined) {
+                this.waterId = this.add('water', 'Water')
+            }
+            return this.waterId;
+        }
+
+        let heteroId = this.heteroMap.get(residueName)
+        if (heteroId === undefined) {
+            heteroId = this.add('non-polymer', residueName)
+            this.heteroMap.set(residueName, heteroId)
+        }
+        return heteroId
+    }
+
+    /** Build the 'entity' category from the ids, types and descriptions registered so far. */
+    getEntityCategory() {
+        return CifCategory.ofFields('entity', {
+            id: CifField.ofStrings(this.ids),
+            type: CifField.ofStrings(this.types),
+            pdbx_description: CifField.ofStrings(this.descriptions)
+        })
+    }
+
+    /** Register one 'polymer' entity per compound and associate each of its chains with that entity. */
+    setCompounds(compounds: Compound[]) {
+        for (const { chains, name } of compounds) {
+            const entityId = this.add('polymer', name)
+            for (const chain of chains) {
+                this.compoundsMap.set(chain, entityId)
+            }
+        }
+    }
+}
\ No newline at end of file
--- a/src/mol-model-formats/structure/pdb/to-cif.ts
+++ b/src/mol-model-formats/structure/pdb/to-cif.ts
@@ -11,16 +11,9 @@ import { mmCIF_Schema } from '../../../mol-io/reader/cif/schema/mmcif';
 import { TokenBuilder, Tokenizer } from '../../../mol-io/reader/common/text/tokenizer';
 import { PdbFile } from '../../../mol-io/reader/pdb/schema';
 import { parseCryst1, parseRemark350, parseMtrix } from './assembly';
-import { WaterNames } from '../../../mol-model/structure/model/types';
 import { parseHelix, parseSheet } from './secondary-structure';
 import { guessElementSymbolTokens } from '../util';
-
-function _entity(): { [K in keyof mmCIF_Schema['entity']]?: CifField } {
-    return {
-        id: CifField.ofStrings(['1', '2', '3']),
-        type: CifField.ofStrings(['polymer', 'non-polymer', 'water'])
-    }
-}
+import { parseCmpnd, EntityBuilder } from './entity';

 type AtomSiteTemplate = typeof atom_site_template extends (...args: any) => infer T ? T : never
 function atom_site_template(data: string, count: number) {
@@ -82,15 +75,7 @@ function _atom_site(sites: AtomSiteTemplate): { [K in keyof mmCIF_Schema['atom_s
    };
 }

-function getEntityId(residueName: string, isHet: boolean) {
-    if (isHet) {
-        if (WaterNames.has(residueName)) return '3';
-        return '2';
-    }
-    return '1';
-}
-
-function addAtom(sites: AtomSiteTemplate, model: string, data: Tokenizer, s: number, e: number, isHet: boolean) {
+function addAtom(sites: AtomSiteTemplate, entityBuilder: EntityBuilder, model: string, data: Tokenizer, s: number, e: number, isHet: boolean) {
    const { data: str } = data;
    const length = e - s;

@@ -122,6 +107,7 @@ function addAtom(sites: AtomSiteTemplate, model: string, data: Tokenizer, s: num

    // 22             Character       Chain identifier.
    TokenBuilder.add(sites.auth_asym_id, s + 21, s + 22);
+    const chainId = str.substring(s + 21, s + 22);

    // 23 - 26        Integer         Residue sequence number.
    // TODO: support HEX
@@ -169,7 +155,7 @@ function addAtom(sites: AtomSiteTemplate, model: string, data: Tokenizer, s: num
        guessElementSymbolTokens(sites.type_symbol, str, s + 12, s + 16)
    }

-    sites.label_entity_id[sites.index] = getEntityId(residueName, isHet);
+    sites.label_entity_id[sites.index] = entityBuilder.getEntityId(residueName, chainId, isHet);
    sites.pdbx_PDB_model_num[sites.index] = model;

    sites.index++;
@@ -195,7 +181,7 @@ export async function pdbToMmCif(pdb: PdbFile): Promise<CifFrame> {
    }

    const atom_site = atom_site_template(data, atomCount);
-
+    const entityBuilder = new EntityBuilder();
    const helperCategories: CifCategory[] = [];

    let modelNum = 0, modelStr = '';
@@ -206,19 +192,28 @@ export async function pdbToMmCif(pdb: PdbFile): Promise<CifFrame> {
            case 'A':
                if (!substringStartsWith(data, s, e, 'ATOM  ')) continue;
                if (!modelNum) { modelNum++; modelStr = '' + modelNum; }
-                addAtom(atom_site, modelStr, tokenizer, s, e, false);
+                addAtom(atom_site, entityBuilder, modelStr, tokenizer, s, e, false);
                break;
            case 'C':
                if (substringStartsWith(data, s, e, 'CRYST1')) {
                    helperCategories.push(...parseCryst1(pdb.id || '?', data.substring(s, e)));
+                } else if (substringStartsWith(data, s, e, 'CONNECT')) {
+                    // TODO: CONNECT records => struct_conn
+                } else if (substringStartsWith(data, s, e, 'COMPND')) {
+                    let j = i + 1;
+                    while (true) {
+                        s = indices[2 * j]; e = indices[2 * j + 1];
+                        if (!substringStartsWith(data, s, e, 'COMPND')) break;
+                        j++;
+                    }
+                    entityBuilder.setCompounds(parseCmpnd(lines, i, j))
+                    i = j - 1;
                }
-                // TODO CONNECT records => struct_conn
-                // TODO COMPND records => entity
                break;
            case 'H':
                if (substringStartsWith(data, s, e, 'HETATM')) {
                    if (!modelNum) { modelNum++; modelStr = '' + modelNum; }
-                    addAtom(atom_site, modelStr, tokenizer, s, e, true);
+                    addAtom(atom_site, entityBuilder, modelStr, tokenizer, s, e, true);
                } else if (substringStartsWith(data, s, e, 'HELIX')) {
                    let j = i + 1;
                    while (true) {
@@ -229,7 +224,7 @@ export async function pdbToMmCif(pdb: PdbFile): Promise<CifFrame> {
                    helperCategories.push(parseHelix(lines, i, j));
                    i = j - 1;
                }
-                // TODO HETNAM records => chem_comp (at least partially, needs to be completed with common bases and amino acids)
+                // TODO: HETNAM records => chem_comp (at least partially, needs to be completed with common bases and amino acids)
                break;
            case 'M':
                if (substringStartsWith(data, s, e, 'MODEL ')) {
@@ -246,10 +241,10 @@ export async function pdbToMmCif(pdb: PdbFile): Promise<CifFrame> {
                    helperCategories.push(...parseMtrix(lines, i, j));
                    i = j - 1;
                }
-                // TODO MODRES records => pdbx_struct_mod_residue
+                // TODO: MODRES records => pdbx_struct_mod_residue
                break;
            case 'O':
-                // TODO ORIGX record => cif.database_PDB_matrix.origx, cif.database_PDB_matrix.origx_vector
+                // TODO: ORIGX record => cif.database_PDB_matrix.origx, cif.database_PDB_matrix.origx_vector
                break;
            case 'R':
                if (substringStartsWith(data, s, e, 'REMARK 350')) {
@@ -274,13 +269,13 @@ export async function pdbToMmCif(pdb: PdbFile): Promise<CifFrame> {
                    helperCategories.push(parseSheet(lines, i, j));
                    i = j - 1;
                }
-                // TODO SCALE record => cif.atom_sites.fract_transf_matrix, cif.atom_sites.fract_transf_vector
+                // TODO: SCALE record => cif.atom_sites.fract_transf_matrix, cif.atom_sites.fract_transf_vector
                break;
        }
    }

    const categories = {
-        entity: CifCategory.ofFields('entity', _entity()),
+        entity: entityBuilder.getEntityCategory(),
        atom_site: CifCategory.ofFields('atom_site', _atom_site(atom_site))
    } as any;


--- a/src/mol-plugin/ui/sequence/polymer.ts
+++ b/src/mol-plugin/ui/sequence/polymer.ts
@@ -39,10 +39,11 @@ export class PolymerSequenceWrapper extends SequenceWrapper<StructureUnit> {
        if (StructureElement.isLoci(loci)) {
            if (!Structure.areParentsEqual(loci.structure, structure)) return false

+            const { offset } = this.sequence
            for (const e of loci.elements) {
                if (e.unit.id === unit.id) {
                    OrderedSet.forEach(e.indices, v => {
-                        if (apply(getSeqIndices(e.unit, e.unit.elements[v]))) changed = true
+                        if (apply(getSeqIndices(e.unit, e.unit.elements[v], offset))) changed = true
                    })
                }
            }
@@ -99,22 +100,22 @@ function createResidueQuery(unitId: number, label_seq_id: number) {
    });
 }

-function getSeqIndices(unit: Unit, element: ElementIndex): Interval {
+function getSeqIndices(unit: Unit, element: ElementIndex, offset: number): Interval {
    const { model } = unit
    switch (unit.kind) {
        case Unit.Kind.Atomic:
            const residueIndex = model.atomicHierarchy.residueAtomSegments.index[element]
            const seqId = model.atomicHierarchy.residues.label_seq_id.value(residueIndex)
-            return Interval.ofSingleton(seqId - 1)
+            return Interval.ofSingleton(seqId - 1 - offset)
        case Unit.Kind.Spheres:
            return Interval.ofRange(
-                model.coarseHierarchy.spheres.seq_id_begin.value(element) - 1,
-                model.coarseHierarchy.spheres.seq_id_end.value(element) - 1
+                model.coarseHierarchy.spheres.seq_id_begin.value(element) - 1 - offset,
+                model.coarseHierarchy.spheres.seq_id_end.value(element) - 1 - offset
            )
        case Unit.Kind.Gaussians:
            return Interval.ofRange(
-                model.coarseHierarchy.gaussians.seq_id_begin.value(element) - 1,
-                model.coarseHierarchy.gaussians.seq_id_end.value(element) - 1
+                model.coarseHierarchy.gaussians.seq_id_begin.value(element) - 1 - offset,
+                model.coarseHierarchy.gaussians.seq_id_end.value(element) - 1 - offset
            )
    }
 }
\ No newline at end of file