diff --git a/src/mol-model-formats/structure/common/component.ts b/src/mol-model-formats/structure/common/component.ts new file mode 100644 index 0000000000000000000000000000000000000000..3b9624bcf97133f8f078cbb9104abd7d568ba132 --- /dev/null +++ b/src/mol-model-formats/structure/common/component.ts @@ -0,0 +1,151 @@ +/** + * Copyright (c) 2019 mol* contributors, licensed under MIT, See LICENSE file for more info. + * + * @author Alexander Rose <alexander.rose@weirdbyte.de> + */ + +import { _parse_mmCif } from '../mmcif/parser'; +import { CifCategory, CifField } from '../../../mol-io/reader/cif'; +import { Table, Column } from '../../../mol-data/db'; +import { mmCIF_Schema } from '../../../mol-io/reader/cif/schema/mmcif'; +import { WaterNames } from '../../../mol-model/structure/model/types'; +import { SetUtils } from '../../../mol-util/set'; + +type Component = Table.Row<Pick<mmCIF_Schema['chem_comp'], 'id' | 'name' | 'type'>> + +const ProteinAtomIdsList = [ + new Set([ 'CA' ]), + new Set([ 'C' ]), + new Set([ 'N' ]) +] +const RnaAtomIdsList = [ + new Set([ 'P', 'O3\'', 'O3*' ]), + new Set([ 'C4\'', 'C4*' ]), + new Set([ 'O2\'', 'O2*', 'F2\'', 'F2*' ]) +] +const DnaAtomIdsList = [ + new Set([ 'P', 'O3\'', 'O3*' ]), + new Set([ 'C3\'', 'C3*' ]), + new Set([ 'O2\'', 'O2*', 'F2\'', 'F2*' ]) +] + +const StandardComponents = (function() { + const map = new Map<string, Component>() + const components: Component[] = [ + { id: 'HIS', name: 'HISTIDINE', type: 'L-peptide linking' }, + { id: 'ARG', name: 'ARGININE', type: 'L-peptide linking' }, + { id: 'LYS', name: 'LYSINE', type: 'L-peptide linking' }, + { id: 'ILE', name: 'ISOLEUCINE', type: 'L-peptide linking' }, + { id: 'PHE', name: 'PHENYLALANINE', type: 'L-peptide linking' }, + { id: 'LEU', name: 'LEUCINE', type: 'L-peptide linking' }, + { id: 'TRP', name: 'TRYPTOPHAN', type: 'L-peptide linking' }, + { id: 'ALA', name: 'ALANINE', type: 'L-peptide linking' }, + { id: 'MET', name: 'METHIONINE', type: 'L-peptide linking' }, + { id: 'CYS', name: 'CYSTEINE', type: 'L-peptide linking' }, + { id: 'ASN', name: 'ASPARAGINE', type: 'L-peptide linking' }, + { id: 'VAL', name: 'VALINE', type: 'L-peptide linking' }, + { id: 'GLY', name: 'GLYCINE', type: 'peptide linking' }, + { id: 'SER', name: 'SERINE', type: 'L-peptide linking' }, + { id: 'GLN', name: 'GLUTAMINE', type: 'L-peptide linking' }, + { id: 'TYR', name: 'TYROSINE', type: 'L-peptide linking' }, + { id: 'ASP', name: 'ASPARTIC ACID', type: 'L-peptide linking' }, + { id: 'GLU', name: 'GLUTAMIC ACID', type: 'L-peptide linking' }, + { id: 'THR', name: 'THREONINE', type: 'L-peptide linking' }, + { id: 'SEC', name: 'SELENOCYSTEINE', type: 'L-peptide linking' }, + { id: 'PYL', name: 'PYRROLYSINE', type: 'L-peptide linking' }, + + { id: 'A', name: 'ADENOSINE-5\'-MONOPHOSPHATE', type: 'RNA linking' }, + { id: 'C', name: 'CYTIDINE-5\'-MONOPHOSPHATE', type: 'RNA linking' }, + { id: 'T', name: 'THYMIDINE-5\'-MONOPHOSPHATE', type: 'RNA linking' }, + { id: 'G', name: 'GUANOSINE-5\'-MONOPHOSPHATE', type: 'RNA linking' }, + { id: 'I', name: 'INOSINIC ACID', type: 'RNA linking' }, + { id: 'U', name: 'URIDINE-5\'-MONOPHOSPHATE', type: 'RNA linking' }, + + { id: 'DA', name: '2\'-DEOXYADENOSINE-5\'-MONOPHOSPHATE', type: 'DNA linking' }, + { id: 'DC', name: '2\'-DEOXYCYTIDINE-5\'-MONOPHOSPHATE', type: 'DNA linking' }, + { id: 'DT', name: 'THYMIDINE-5\'-MONOPHOSPHATE', type: 'DNA linking' }, + { id: 'DG', name: '2\'-DEOXYGUANOSINE-5\'-MONOPHOSPHATE', type: 'DNA linking' }, + { id: 'DI', name: '2\'-DEOXYINOSINE-5\'-MONOPHOSPHATE', type: 'DNA linking' }, + { id: 'DU', name: '2\'-DEOXYURIDINE-5\'-MONOPHOSPHATE', type: 'DNA linking' }, + ] + components.forEach(c => map.set(c.id, c)) + return map +})() + +export class ComponentBuilder { + private comps = new Map<string, Component>() + private ids: string[] = [] + private names: string[] = [] + private types: mmCIF_Schema['chem_comp']['type']['T'][] = [] + + private set(c: Component) { + this.comps.set(c.id, c) + this.ids.push(c.id) + this.names.push(c.name) + this.types.push(c.type) + } + + private getAtomIds(index: number) { + const atomIds = new Set<string>() + let prevSeqId = this.seqId.value(index) + while (index < this.seqId.rowCount) { + const seqId = this.seqId.value(index) + if (seqId !== prevSeqId) break + atomIds.add(this.atomId.value(index)) + prevSeqId - seqId + index += 1 + } + return atomIds + } + + private hasAtomIds (atomIds: Set<string>, atomIdsList: Set<string>[]) { + for (let i = 0, il = atomIdsList.length; i < il; ++i) { + if (!SetUtils.areIntersecting(atomIds, atomIdsList[i])) { + return false + } + } + return true + } + + private getType(atomIds: Set<string>): Component['type'] { + if (this.hasAtomIds(atomIds, ProteinAtomIdsList)) { + return 'peptide linking' + } else if (this.hasAtomIds(atomIds, RnaAtomIdsList)) { + return 'RNA linking' + } else if (this.hasAtomIds(atomIds, DnaAtomIdsList)) { + return 'DNA linking' + } else { + return 'other' + } + } + + has(compId: string) { return this.comps.has(compId) } + get(compId: string) { return this.comps.get(compId) } + + add(compId: string, index: number) { + if (!this.has(compId)) { + if (StandardComponents.has(compId)) { + this.set(StandardComponents.get(compId)!) + } else if (WaterNames.has(compId)) { + this.set({ id: compId, name: 'WATER', type: 'non-polymer' }) + } else { + const atomIds = this.getAtomIds(index) + this.set({ id: compId, name: compId, type: this.getType(atomIds) }) + } + } + return this.get(compId)! + } + + getChemCompCategory() { + const chemComp: CifCategory.SomeFields<mmCIF_Schema['chem_comp']> = { + id: CifField.ofStrings(this.ids), + name: CifField.ofStrings(this.names), + type: CifField.ofStrings(this.types), + } + return CifCategory.ofFields('chem_comp', chemComp) + } + + constructor(private seqId: Column<number>, private atomId: Column<string>) { + + } +} \ No newline at end of file diff --git a/src/mol-model-formats/structure/common/util.ts b/src/mol-model-formats/structure/common/util.ts new file mode 100644 index 0000000000000000000000000000000000000000..2f0c6b39d82f25609b26bd47f7c779d06335f837 --- /dev/null +++ b/src/mol-model-formats/structure/common/util.ts @@ -0,0 +1,27 @@ +/** + * Copyright (c) 2019 mol* contributors, licensed under MIT, See LICENSE file for more info. + * + * @author Alexander Rose <alexander.rose@weirdbyte.de> + */ + +import { memoize1 } from '../../../mol-util/memoize'; + +const ChainIdAlphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' + +function _getChainId (index: number) { + const n = ChainIdAlphabet.length + let j = index + let k = 0 + let chainId = ChainIdAlphabet[j % n] + while (j >= n) { + j = Math.floor(j / n) + chainId += ChainIdAlphabet[j % n] + k += 1 + } + if (k >= 5) { + console.warn('getChainId overflow') + } + return chainId +} + +export const getChainId = memoize1(_getChainId) \ No newline at end of file diff --git a/src/mol-model-formats/structure/gro.ts b/src/mol-model-formats/structure/gro.ts index 16d4518f9c8a35eb6ce21cd6103fc51abab60053..1b82cc9fca2a0b43589def0ad99d1ca66ec6663f 100644 --- a/src/mol-model-formats/structure/gro.ts +++ b/src/mol-model-formats/structure/gro.ts @@ -13,62 +13,151 @@ import { CifCategory, CifField } from '../../mol-io/reader/cif'; import { Column } from '../../mol-data/db'; import { mmCIF_Schema } from '../../mol-io/reader/cif/schema/mmcif'; import { guessElementSymbolString } from './util'; +import { MoleculeType, getMoleculeType, isPolymer } from '../../mol-model/structure/model/types'; +import { ComponentBuilder } from './common/component'; +import { getChainId } from './common/util'; // TODO multi model files -// TODO seperate chains -// TODO better entity handling -// TODO improve performance -function _entity(): { [K in keyof mmCIF_Schema['entity']]?: CifField } { - return { - id: CifField.ofStrings(['1', '2', '3']), - type: CifField.ofStrings(['polymer', 'non-polymer', 'water']) +class EntityBuilder { + private count = 0 + private ids: string[] = [] + private types: string[] = [] + private descriptions: string[] = [] + + private heteroMap = new Map<string, string>() + private chainMap = new Map<string, string>() + private waterId?: string + + private set(type: string, description: string) { + this.count += 1 + this.ids.push(`${this.count}`) + this.types.push(type) + this.descriptions.push(description) + } + + getEntityId(compId: string, moleculeType: MoleculeType, chainId: string): string { + if (moleculeType === MoleculeType.water) { + if (this.waterId === undefined) { + this.set('water', 'Water') + this.waterId = `${this.count}` + } + return this.waterId; + } else if (isPolymer(moleculeType)) { + if (!this.chainMap.has(chainId)) { + this.set('polymer', `Polymer ${this.chainMap.size + 1}`) + this.chainMap.set(chainId, `${this.count}`) + } + return this.chainMap.get(chainId)! + } else { + if (!this.heteroMap.has(compId)) { + this.set('non-polymer', compId) + this.heteroMap.set(compId, `${this.count}`) + } + return this.heteroMap.get(compId)! + } + } + + getEntityCategory() { + const entity: CifCategory.SomeFields<mmCIF_Schema['entity']> = { + id: CifField.ofStrings(this.ids), + type: CifField.ofStrings(this.types), + pdbx_description: CifField.ofStrings(this.descriptions), + } + return CifCategory.ofFields('entity', entity) } } -function _atom_site(atoms: GroAtoms): { [K in keyof mmCIF_Schema['atom_site']]?: CifField } { - const auth_asym_id = CifField.ofColumn(Column.Undefined(atoms.count, Column.Schema.str)) +function getCategories(atoms: GroAtoms) { const auth_atom_id = CifField.ofColumn(atoms.atomName) const auth_comp_id = CifField.ofColumn(atoms.residueName) - const auth_seq_id = CifField.ofColumn(atoms.residueNumber) - return { + const entityIds = new Array<string>(atoms.count) + const asymIds = new Array<string>(atoms.count) + const seqIds = new Uint32Array(atoms.count) + const ids = new Uint32Array(atoms.count) + + const entityBuilder = new EntityBuilder() + const componentBuilder = new ComponentBuilder(atoms.residueNumber, atoms.atomName) + + let currentEntityId = '' + let currentAsymIndex = 0 + let currentAsymId = '' + let currentSeqId = 0 + let prevMoleculeType = MoleculeType.unknown + let prevResidueNumber = -1 + + for (let i = 0, il = atoms.count; i < il; ++i) { + const residueNumber = atoms.residueNumber.value(i) + if (residueNumber !== prevResidueNumber) { + const compId = atoms.residueName.value(i) + const moleculeType = getMoleculeType(componentBuilder.add(compId, i).type, compId) + + if (moleculeType !== prevMoleculeType || ( + residueNumber !== prevResidueNumber + 1 && !( + // gro format allows only for 5 character residueNumbers, handle overflow here + prevResidueNumber === 99999 && residueNumber === 0 + ) + )) { + currentAsymId = getChainId(currentAsymIndex) + currentAsymIndex += 1 + currentSeqId = 0 + } + + currentEntityId = entityBuilder.getEntityId(compId, moleculeType, currentAsymId) + currentSeqId += 1 + + prevResidueNumber = residueNumber + prevMoleculeType = moleculeType + } + + entityIds[i] = currentEntityId + asymIds[i] = currentAsymId + seqIds[i] = currentSeqId + ids[i] = i + } + + const auth_asym_id = CifField.ofColumn(Column.ofStringArray(asymIds)) + + const atom_site: CifCategory.SomeFields<mmCIF_Schema['atom_site']> = { auth_asym_id, auth_atom_id, auth_comp_id, - auth_seq_id, + auth_seq_id: CifField.ofColumn(atoms.residueNumber), B_iso_or_equiv: CifField.ofColumn(Column.Undefined(atoms.count, Column.Schema.float)), Cartn_x: CifField.ofNumbers(Column.mapToArray(atoms.x, x => x * 10, Float32Array)), Cartn_y: CifField.ofNumbers(Column.mapToArray(atoms.y, y => y * 10, Float32Array)), Cartn_z: CifField.ofNumbers(Column.mapToArray(atoms.z, z => z * 10, Float32Array)), group_PDB: CifField.ofColumn(Column.Undefined(atoms.count, Column.Schema.str)), - id: CifField.ofColumn(atoms.atomNumber), + id: CifField.ofColumn(Column.ofIntArray(ids)), label_alt_id: CifField.ofColumn(Column.Undefined(atoms.count, Column.Schema.str)), label_asym_id: auth_asym_id, label_atom_id: auth_atom_id, label_comp_id: auth_comp_id, - label_seq_id: auth_seq_id, - label_entity_id: CifField.ofColumn(Column.ofConst('1', atoms.count, Column.Schema.str)), + label_seq_id: CifField.ofColumn(Column.ofIntArray(seqIds)), + label_entity_id: CifField.ofColumn(Column.ofStringArray(entityIds)), occupancy: CifField.ofColumn(Column.ofConst(1, atoms.count, Column.Schema.float)), type_symbol: CifField.ofStrings(Column.mapToArray(atoms.atomName, s => guessElementSymbolString(s))), - // type_symbol: CifField.ofColumn(Column.Undefined(atoms.count, Column.Schema.str)), pdbx_PDB_ins_code: CifField.ofColumn(Column.Undefined(atoms.count, Column.Schema.str)), pdbx_PDB_model_num: CifField.ofColumn(Column.ofConst('1', atoms.count, Column.Schema.str)), } + + return { + entity: entityBuilder.getEntityCategory(), + chem_comp: componentBuilder.getChemCompCategory(), + atom_site: CifCategory.ofFields('atom_site', atom_site) + } } async function groToMmCif(gro: GroFile) { - const categories = { - entity: CifCategory.ofFields('entity', _entity()), - atom_site: CifCategory.ofFields('atom_site', _atom_site(gro.structures[0].atoms)) - } as any; + const categories = getCategories(gro.structures[0].atoms) return { - header: 'GRO', + header: gro.structures[0].header.title, categoryNames: Object.keys(categories), categories }; diff --git a/src/mol-model-formats/structure/pdb/entity.ts b/src/mol-model-formats/structure/pdb/entity.ts index 99cd0a1e54a9532d2774256e536a048707c27080..97ecd5d44d6baf8c30b596965b71553504eccef5 100644 --- a/src/mol-model-formats/structure/pdb/entity.ts +++ b/src/mol-model-formats/structure/pdb/entity.ts @@ -81,7 +81,7 @@ export class EntityBuilder { private chainMap = new Map<string, string>() private waterId?: string - private add(type: string, description: string) { + private set(type: string, description: string) { this.count += 1 this.ids.push(`${this.count}`) this.types.push(type) @@ -92,13 +92,13 @@ export class EntityBuilder { if (isHet) { if (WaterNames.has(residueName)) { if (this.waterId === undefined) { - this.add('water', 'Water') + this.set('water', 'Water') this.waterId = `${this.count}` } return this.waterId; } else { if (!this.heteroMap.has(residueName)) { - this.add('non-polymer', residueName) + this.set('non-polymer', residueName) this.heteroMap.set(residueName, `${this.count}`) } return this.heteroMap.get(residueName)! @@ -107,7 +107,7 @@ export class EntityBuilder { return this.compoundsMap.get(chainId)! } else { if (!this.chainMap.has(chainId)) { - this.add('polymer', chainId) + this.set('polymer', chainId) this.chainMap.set(chainId, `${this.count}`) } return this.chainMap.get(chainId)! @@ -126,7 +126,7 @@ export class EntityBuilder { setCompounds(compounds: Compound[]) { for (let i = 0, il = compounds.length; i < il; ++i) { const { chains, name } = compounds[i] - this.add('polymer', name) + this.set('polymer', name) for (let j = 0, jl = chains.length; j < jl; ++j) { this.compoundsMap.set(chains[j], `${this.count}`) } diff --git a/src/mol-model/structure/model/properties/atomic/hierarchy.ts b/src/mol-model/structure/model/properties/atomic/hierarchy.ts index 1c12b0be4b5b549f0106350851a60ded5ad2aee6..12bac4737e68a5039305568b164b4c7aff5e20f0 100644 --- a/src/mol-model/structure/model/properties/atomic/hierarchy.ts +++ b/src/mol-model/structure/model/properties/atomic/hierarchy.ts @@ -238,4 +238,8 @@ export namespace AtomicHierarchy { export function chainEndResidueIndexExcl(segs: AtomicSegments, cI: ChainIndex) { return segs.residueAtomSegments.index[segs.chainAtomSegments.offsets[cI + 1] - 1] + 1 as ResidueIndex; } + + export function chainResidueCount(segs: AtomicSegments, cI: ChainIndex) { + return chainEndResidueIndexExcl(segs, cI) - chainStartResidueIndex(segs, cI) + } } diff --git a/src/mol-model/structure/model/properties/utils/atomic-derived.ts b/src/mol-model/structure/model/properties/utils/atomic-derived.ts index ae82751c4c6740d05e54a77e0efe7c09acce1a44..255c144473aeaa52d149d5d973bc25905b99f018 100644 --- a/src/mol-model/structure/model/properties/utils/atomic-derived.ts +++ b/src/mol-model/structure/model/properties/utils/atomic-derived.ts @@ -31,6 +31,7 @@ export function getAtomicDerivedData(data: AtomicData, index: AtomicIndex, chemi molType = getMoleculeType(chemCompMap.get(compId)!.type, compId) moleculeTypeMap.set(compId, molType) } else { + console.log('chemComp not found', compId) molType = getMoleculeType(getComponentType(compId), compId) // TODO if unknown molecule type, use atom names to guess molecule type moleculeTypeMap.set(compId, molType) diff --git a/src/mol-model/structure/model/types.ts b/src/mol-model/structure/model/types.ts index 036fd11d0cab18c9d66591b80ae058ced91e8e1d..a8390d1a4ba821b22ef0c382ea1a1de4ae1c3203 100644 --- a/src/mol-model/structure/model/types.ts +++ b/src/mol-model/structure/model/types.ts @@ -185,6 +185,8 @@ export const BaseNames = SetUtils.unionMany(RnaBaseNames, DnaBaseNames, PeptideB export const isPurineBase = (compId: string) => PurineBaseNames.has(compId.toUpperCase()) export const isPyrimidineBase = (compId: string) => PyrimidineBaseNames.has(compId.toUpperCase()) +export const PolymerNames = SetUtils.unionMany(AminoAcidNames, BaseNames) + /** get the molecule type from component type and id */ export function getMoleculeType(compType: string, compId: string) { compType = compType.toUpperCase() @@ -227,12 +229,12 @@ export function getComponentType(compId: string): mmCIF_Schema['chem_comp']['typ export function getEntityType(compId: string): mmCIF_Schema['entity']['type']['T'] { compId = compId.toUpperCase() - if (AminoAcidNames.has(compId) || RnaBaseNames.has(compId) || DnaBaseNames.has(compId)) { + if (WaterNames.has(compId)) { + return 'water' + } else if (PolymerNames.has(compId)) { return 'polymer' } else if (SaccharideCompIdMap.has(compId)) { return 'branched' - } else if (WaterNames.has(compId)) { - return 'water' } else { return 'non-polymer' } diff --git a/src/mol-util/set.ts b/src/mol-util/set.ts index af09cf6081d49d14ce51cbf51192993d1ae7850f..82b91b7baf28b0dec4c20b0f3209470c08386889 100644 --- a/src/mol-util/set.ts +++ b/src/mol-util/set.ts @@ -50,6 +50,13 @@ export namespace SetUtils { return intersection; } + export function areIntersecting<T>(setA: Set<T>, setB: Set<T>): boolean { + for (const elem of Array.from(setB)) { + if (setA.has(elem)) return true; + } + return false; + } + /** Create set containing elements of set a that are not in set b. */ export function difference<T>(setA: Set<T>, setB: Set<T>): Set<T> { const difference = new Set(setA);