diff --git a/src/apps/structure-info/model.ts b/src/apps/structure-info/model.ts index 1b6e2a7612d7486dd36fe09c92beff7828fa9d97..0728dde5d6a95f069cc3fabc9d38037694f421ab 100644 --- a/src/apps/structure-info/model.ts +++ b/src/apps/structure-info/model.ts @@ -50,7 +50,7 @@ export function residueLabel(model: Model, rI: number) { } export function printSecStructure(model: Model) { - console.log('Secondary Structure\n============='); + console.log('\nSecondary Structure\n============='); const { residues } = model.atomicHierarchy; const { type, key } = model.properties.secondaryStructure; @@ -90,20 +90,18 @@ export function printBonds(structure: Structure) { } export function printSequence(model: Model) { - console.log('Sequence\n============='); + console.log('\nSequence\n============='); const { byEntityKey } = model.sequence; for (const key of Object.keys(byEntityKey)) { const seq = byEntityKey[+key]; - console.log(`${seq.entityId} (${seq.num.value(0)}, ${seq.num.value(seq.num.rowCount - 1)}) (${seq.compId.value(0)}, ${seq.compId.value(seq.compId.rowCount - 1)})`); - // for (let i = 0; i < seq.compId.rowCount; i++) { - // console.log(`${seq.entityId} ${seq.num.value(i)} ${seq.compId.value(i)}`); - // } + console.log(`${seq.entityId} (${seq.sequence.kind} ${seq.num.value(0)} (offset ${seq.sequence.offset}), ${seq.num.value(seq.num.rowCount - 1)}) (${seq.compId.value(0)}, ${seq.compId.value(seq.compId.rowCount - 1)})`); + console.log(`${seq.sequence.sequence}`); } console.log(); } export function printUnits(structure: Structure) { - console.log('Units\n============='); + console.log('\nUnits\n============='); const l = Element.Location(); for (const unit of structure.units) { @@ -133,10 +131,9 @@ export function printUnits(structure: Structure) { } } - export function printIHMModels(model: Model) { if (!model.coarseHierarchy.isDefined) return false; - console.log('IHM Models\n============='); + console.log('\nIHM Models\n============='); console.log(Table.formatToString(model.coarseHierarchy.models)); } @@ -144,10 +141,10 @@ async function run(mmcif: mmCIF_Database) { const models = await Model.create({ kind: 'mmCIF', data: mmcif }).run(); const structure = Structure.ofModel(models[0]); printSequence(models[0]); - printIHMModels(models[0]); + //printIHMModels(models[0]); printUnits(structure); - printBonds(structure); - printSecStructure(models[0]); + //printBonds(structure); + //printSecStructure(models[0]); } async function runDL(pdb: string) { diff --git a/src/mol-data/db/column.ts b/src/mol-data/db/column.ts index 355c1c365a7ba7f51c1cf41f7ede0d691f25d48c..a61efa6771935c8e742d5db03d43883e9b00c67a 100644 --- a/src/mol-data/db/column.ts +++ b/src/mol-data/db/column.ts @@ -103,6 +103,15 @@ namespace Column { return lambdaColumn(spec); } + /** values [min, max] (i.e. include both values) */ + export function range(min: number, max: number): Column<number> { + return ofLambda({ + value: i => i + min, + rowCount: Math.max(max - min + 1, 0), + schema: Schema.int + }); + } + export function ofArray<T extends Column.Schema>(spec: Column.ArraySpec<T>): Column<T['T']> { return arrayColumn(spec); } diff --git a/src/mol-model/sequence.ts b/src/mol-model/sequence.ts new file mode 100644 index 0000000000000000000000000000000000000000..bd9af242dc8e9daecd99517add461357b0eec763 --- /dev/null +++ b/src/mol-model/sequence.ts @@ -0,0 +1,7 @@ +/** + * Copyright (c) 2018 mol* contributors, licensed under MIT, See LICENSE file for more info. + * + * @author David Sehnal <david.sehnal@gmail.com> + */ + +export * from './sequence/sequence' \ No newline at end of file diff --git a/src/mol-model/sequence/constants.ts b/src/mol-model/sequence/constants.ts index 0ffdd02fcbce683e436c0030ffe0517135c6ceda..192d137f5bbd93804bbcdfe509d8f4cf20aacc02 100644 --- a/src/mol-model/sequence/constants.ts +++ b/src/mol-model/sequence/constants.ts @@ -1 +1,71 @@ -// TODO \ No newline at end of file +/** + * Copyright (c) 2018 mol* contributors, licensed under MIT, See LICENSE file for more info. + * + * @author David Sehnal <david.sehnal@gmail.com> + */ + +export type AminoAlphabet = + | 'H' | 'R' | 'K' | 'I' | 'F' | 'L' | 'W' | 'A' | 'M' | 'P' | 'C' | 'N' | 'V' | 'G' | 'S' | 'Q' | 'Y' | 'D' | 'E' | 'T' | 'U' | 'O' + | 'X' /** = Unknown */ + +export type NuclecicAlphabet = + | 'A' | 'C' | 'G' | 'T' | 'U' + | 'X' /** = Unknown */ + +// from NGL +const ProteinOneLetterCodes: { [name: string]: AminoAlphabet } = { + 'HIS': 'H', + 'ARG': 'R', + 'LYS': 'K', + 'ILE': 'I', + 'PHE': 'F', + 'LEU': 'L', + 'TRP': 'W', + 'ALA': 'A', + 'MET': 'M', + 'PRO': 'P', + 'CYS': 'C', + 'ASN': 'N', + 'VAL': 'V', + 'GLY': 'G', + 'SER': 'S', + 'GLN': 'Q', + 'TYR': 'Y', + 'ASP': 'D', + 'GLU': 'E', + 'THR': 'T', + + 'SEC': 'U', // as per IUPAC definition + 'PYL': 'O', // as per IUPAC definition +} + +const DnaOneLetterCodes: { [name: string]: NuclecicAlphabet } = { + 'DA': 'A', + 'DC': 'C', + 'DG': 'G', + 'DT': 'T', + 'DU': 'U' +} + +const RnaOneLetterCodes: { [name: string]: NuclecicAlphabet } = { + 'A': 'A', + 'C': 'C', + 'G': 'G', + 'T': 'T', + 'U': 'U' +} + +export function getProteinOneLetterCode(residueName: string): AminoAlphabet { + const code = ProteinOneLetterCodes[residueName]; + return code || 'X'; +} + +export function getRnaOneLetterCode(residueName: string): NuclecicAlphabet { + const code = RnaOneLetterCodes[residueName]; + return code || 'X'; +} + +export function getDnaOneLetterCode(residueName: string): NuclecicAlphabet { + const code = DnaOneLetterCodes[residueName]; + return code || 'X'; +} \ No newline at end of file diff --git a/src/mol-model/sequence/sequence.ts b/src/mol-model/sequence/sequence.ts index c13501983abdf90f57512f75e1d4e60447759fc8..7f99ca54ce82ad570fb6f3d76e24d49f730b23f4 100644 --- a/src/mol-model/sequence/sequence.ts +++ b/src/mol-model/sequence/sequence.ts @@ -4,12 +4,13 @@ * @author David Sehnal <david.sehnal@gmail.com> */ -// import { Column } from 'mol-data/db' -// TODO +import { AminoAlphabet, NuclecicAlphabet, getProteinOneLetterCode, getRnaOneLetterCode, getDnaOneLetterCode } from './constants'; +import { Column } from 'mol-data/db' -interface Sequence { +// TODO add mapping support to other sequence spaces, e.g. uniprot +// TODO sequence alignment (take NGL code as starting point) -} +type Sequence = Sequence.Protein | Sequence.DNA | Sequence.RNA | Sequence.Generic namespace Sequence { export const enum Kind { @@ -19,14 +20,83 @@ namespace Sequence { Generic = 'generic' } - export type ProteinAlphabet = 'X' - export type RnaAlphabet = 'X' - export type DnaAlphabet = 'C' - export type GenericAlphabet = 'X' - export interface Base<K extends Kind, Alphabet extends string> { readonly kind: K, - readonly sequence: ReadonlyArray<Alphabet> + readonly offset: number, + readonly sequence: ArrayLike<Alphabet> + } + + export interface Protein extends Base<Kind.Protein, AminoAlphabet> { } + export interface RNA extends Base<Kind.RNA, NuclecicAlphabet> { } + export interface DNA extends Base<Kind.DNA, NuclecicAlphabet> { } + export interface Generic extends Base<Kind.Generic, 'X'> { } + + export function create(kind: Kind, sequence: string, offset: number = 0): Sequence { + return { kind: kind as any, sequence: sequence as any, offset }; + } + + export function getSequenceString(seq: Sequence) { + return seq.sequence as string; + } + + function determineKind(names: Column<string>) { + for (let i = 0, _i = Math.min(names.rowCount, 10); i < _i; i++) { + const name = names.value(i) || ''; + if (getProteinOneLetterCode(name) !== 'X') return { kind: Kind.Protein, code: getProteinOneLetterCode }; + if (getRnaOneLetterCode(name) !== 'X') return { kind: Kind.RNA, code: getRnaOneLetterCode }; + if (getDnaOneLetterCode(name) !== 'X') return { kind: Kind.DNA, code: getDnaOneLetterCode }; + } + return { kind: Kind.Generic, code: (v: string) => 'X' }; + } + + export function ofResidueNames(residueName: Column<string>, seqId: Column<number>): Sequence { + if (seqId.rowCount === 0) throw new Error('cannot be empty'); + + const { kind, code } = determineKind(residueName); + return new Impl(kind, residueName, seqId, code) as Sequence; + } + + class Impl implements Base<any, any> { + private _offset = 0; + private _seq: string | undefined = void 0; + + get offset() { + if (this._seq !== void 0) return this._offset; + this.create(); + return this._offset; + } + + get sequence(): any { + if (this._seq !== void 0) return this._seq; + this.create(); + return this._seq; + } + + private create() { + let maxSeqId = 0, minSeqId = Number.MAX_SAFE_INTEGER; + for (let i = 0, _i = this.seqId.rowCount; i < _i; i++) { + const id = this.seqId.value(i); + if (maxSeqId < id) maxSeqId = id; + if (id < minSeqId) minSeqId = id; + } + + const count = maxSeqId - minSeqId + 1; + const sequenceArray = new Array(maxSeqId + 1); + for (let i = 0; i < count; i++) { + sequenceArray[i] = 'X'; + } + + for (let i = 0, _i = this.seqId.rowCount; i < _i; i++) { + sequenceArray[this.seqId.value(i) - minSeqId] = this.code(this.residueName.value(i) || ''); + } + + this._seq = sequenceArray.join(''); + this._offset = minSeqId - 1; + } + + constructor(public kind: Kind, private residueName: Column<string>, private seqId: Column<number>, private code: (name: string) => string) { + + } } } diff --git a/src/mol-model/structure/model/formats/mmcif/sequence.ts b/src/mol-model/structure/model/formats/mmcif/sequence.ts index c9f0825208a9538a3bac331162776d6d84989fbc..06f0c888d9af880650668dfe69776c0308aae9ce 100644 --- a/src/mol-model/structure/model/formats/mmcif/sequence.ts +++ b/src/mol-model/structure/model/formats/mmcif/sequence.ts @@ -5,10 +5,11 @@ */ import { mmCIF_Database as mmCIF } from 'mol-io/reader/cif/schema/mmcif' -import Sequence from '../../properties/sequence' +import StructureSequence from '../../properties/sequence' import { Column } from 'mol-data/db'; import { AtomicHierarchy } from '../../properties/atomic'; import { Entities } from '../../properties/common'; +import { Sequence } from '../../../../sequence'; // TODO how to handle microheterogeneity // see http://mmcif.wwpdb.org/dictionaries/mmcif_pdbx_v50.dic/Categories/entity_poly_seq.html @@ -20,12 +21,12 @@ import { Entities } from '../../properties/common'; // corresponding ATOM_SITE entries should reflect this // heterogeneity. -export function getSequence(cif: mmCIF, entities: Entities, hierarchy: AtomicHierarchy): Sequence { - if (!cif.entity_poly_seq._rowCount) return Sequence.fromAtomicHierarchy(entities, hierarchy); +export function getSequence(cif: mmCIF, entities: Entities, hierarchy: AtomicHierarchy): StructureSequence { + if (!cif.entity_poly_seq._rowCount) return StructureSequence.fromAtomicHierarchy(entities, hierarchy); const { entity_id, num, mon_id } = cif.entity_poly_seq; - const byEntityKey: Sequence['byEntityKey'] = {}; + const byEntityKey: StructureSequence['byEntityKey'] = {}; const count = entity_id.rowCount; let i = 0; @@ -35,7 +36,15 @@ export function getSequence(cif: mmCIF, entities: Entities, hierarchy: AtomicHie i++; const id = entity_id.value(start); - byEntityKey[entities.getEntityIndex(id)] = { entityId: id, compId: Column.window(mon_id, start, i), num: Column.window(num, start, i) } + const _compId = Column.window(mon_id, start, i); + const _num = Column.window(num, start, i); + + byEntityKey[entities.getEntityIndex(id)] = { + entityId: id, + compId: _compId, + num: _num, + sequence: Sequence.ofResidueNames(_compId, _num) + }; } return { byEntityKey }; diff --git a/src/mol-model/structure/model/model.ts b/src/mol-model/structure/model/model.ts index 214dc8d8687ce1b59cfd871513163622b2162420..da0037d65dff36c294992e38a77c7d5d0d109b53 100644 --- a/src/mol-model/structure/model/model.ts +++ b/src/mol-model/structure/model/model.ts @@ -6,7 +6,7 @@ import UUID from 'mol-util/uuid' import Format from './format' -import Sequence from './properties/sequence' +import StructureSequence from './properties/sequence' import { AtomicHierarchy, AtomicConformation } from './properties/atomic' import { ModelSymmetry } from './properties/symmetry' import { CoarseHierarchy, CoarseConformation } from './properties/coarse' @@ -31,7 +31,7 @@ interface Model extends Readonly<{ symmetry: ModelSymmetry, entities: Entities, - sequence: Sequence, + sequence: StructureSequence, atomicHierarchy: AtomicHierarchy, atomicConformation: AtomicConformation, diff --git a/src/mol-model/structure/model/properties/sequence.ts b/src/mol-model/structure/model/properties/sequence.ts index f10338b174124e571eb7f069bd0d34da39eb8001..a2f53c56df7c790bbbacadc67e1d28a4f983a6ac 100644 --- a/src/mol-model/structure/model/properties/sequence.ts +++ b/src/mol-model/structure/model/properties/sequence.ts @@ -7,59 +7,55 @@ import { Column } from 'mol-data/db' import { AtomicHierarchy } from './atomic/hierarchy'; import { Entities } from './common'; +import { Sequence } from '../../../sequence'; -interface Sequence { - readonly byEntityKey: { [key: number]: Sequence.Entity } +interface StructureSequence { + readonly byEntityKey: { [key: number]: StructureSequence.Entity } } -// TODO lift to model/sequence/ folder -// TODO add one letter code sequence string -// TODO add mapping support to other sequence spaces, e.g. uniprot -// TODO add sequence kind, e.g. protein, dna, rna (alphabets?) -// TODO sequence alignment (take NGL code as starting point) - -namespace Sequence { +namespace StructureSequence { export interface Entity { readonly entityId: string, - readonly num: Column<number> - // _entity_poly_seq.mon_id - readonly compId: Column<string> + readonly num: Column<number>, + // Corresponds to _entity_poly_seq.mon_id + readonly compId: Column<string>, + readonly sequence: Sequence } - export function fromAtomicHierarchy(entities: Entities, hierarchy: AtomicHierarchy): Sequence { - const { label_entity_id } = hierarchy.chains + export function fromAtomicHierarchy(entities: Entities, hierarchy: AtomicHierarchy): StructureSequence { const { label_comp_id, label_seq_id } = hierarchy.residues const { chainSegments, residueSegments } = hierarchy - const byEntityKey: Sequence['byEntityKey'] = {}; + const byEntityKey: StructureSequence['byEntityKey'] = { }; - // TODO get min/max of label_seq_id to handle missing residues at start and in between - // note that this assumes label_seq_id is monotonically increasing + for (let cI = 0, _cI = hierarchy.chains._rowCount; cI < _cI; cI++) { + const entityKey = hierarchy.entityKey[cI]; + // Only for polymers, trying to mirror _entity_poly_seq + if (byEntityKey[entityKey] !== void 0 || entities.data.type.value(entityKey) !== 'polymer') continue; - const chainCount = hierarchy.chains._rowCount - for (let i = 0; i < chainCount; ++i) { - const entityId = label_entity_id.value(i) - const entityIndex = entities.getEntityIndex(entityId) - // TODO only for polymers, mirroring _entity_poly_seq, ok??? - if (entities.data.type.value(i) !== 'polymer') continue + let start = cI; + cI++; + while (cI < _cI && entityKey === hierarchy.entityKey[cI] && entities.data.type.value(entityKey) !== 'polymer') { + cI++; + } + cI--; - const entityKey = hierarchy.entityKey[entityIndex] - if (byEntityKey[entityKey] !== undefined) continue + const rStart = residueSegments.segmentMap[chainSegments.segments[start]]; + const rEnd = residueSegments.segmentMap[chainSegments.segments[cI + 1]]; - const start = residueSegments.segmentMap[chainSegments.segments[i]] - let end = residueSegments.segmentMap[chainSegments.segments[i + 1]] - // TODO better way to handle end??? - if (end === undefined) end = hierarchy.residues._rowCount + const compId = Column.window(label_comp_id, rStart, rEnd); + const num = Column.window(label_seq_id, rStart, rEnd); byEntityKey[entityKey] = { - entityId, - compId: Column.window(label_comp_id, start, end), - num: Column.window(label_seq_id, start, end) - } + entityId: entities.data.id.value(entityKey), + compId, + num, + sequence: Sequence.ofResidueNames(compId, num) + }; } return { byEntityKey } } } -export default Sequence \ No newline at end of file +export default StructureSequence \ No newline at end of file