Skip to content
Snippets Groups Projects
Commit e3f49b61 authored by David Sehnal's avatar David Sehnal
Browse files

Sequence as more 1st class citizen

parent 369ff8e5
No related branches found
No related tags found
No related merge requests found
...@@ -50,7 +50,7 @@ export function residueLabel(model: Model, rI: number) { ...@@ -50,7 +50,7 @@ export function residueLabel(model: Model, rI: number) {
} }
export function printSecStructure(model: Model) { export function printSecStructure(model: Model) {
console.log('Secondary Structure\n============='); console.log('\nSecondary Structure\n=============');
const { residues } = model.atomicHierarchy; const { residues } = model.atomicHierarchy;
const { type, key } = model.properties.secondaryStructure; const { type, key } = model.properties.secondaryStructure;
...@@ -90,20 +90,18 @@ export function printBonds(structure: Structure) { ...@@ -90,20 +90,18 @@ export function printBonds(structure: Structure) {
} }
export function printSequence(model: Model) { export function printSequence(model: Model) {
console.log('Sequence\n============='); console.log('\nSequence\n=============');
const { byEntityKey } = model.sequence; const { byEntityKey } = model.sequence;
for (const key of Object.keys(byEntityKey)) { for (const key of Object.keys(byEntityKey)) {
const seq = byEntityKey[+key]; const seq = byEntityKey[+key];
console.log(`${seq.entityId} (${seq.num.value(0)}, ${seq.num.value(seq.num.rowCount - 1)}) (${seq.compId.value(0)}, ${seq.compId.value(seq.compId.rowCount - 1)})`); console.log(`${seq.entityId} (${seq.sequence.kind} ${seq.num.value(0)} (offset ${seq.sequence.offset}), ${seq.num.value(seq.num.rowCount - 1)}) (${seq.compId.value(0)}, ${seq.compId.value(seq.compId.rowCount - 1)})`);
// for (let i = 0; i < seq.compId.rowCount; i++) { console.log(`${seq.sequence.sequence}`);
// console.log(`${seq.entityId} ${seq.num.value(i)} ${seq.compId.value(i)}`);
// }
} }
console.log(); console.log();
} }
export function printUnits(structure: Structure) { export function printUnits(structure: Structure) {
console.log('Units\n============='); console.log('\nUnits\n=============');
const l = Element.Location(); const l = Element.Location();
for (const unit of structure.units) { for (const unit of structure.units) {
...@@ -133,10 +131,9 @@ export function printUnits(structure: Structure) { ...@@ -133,10 +131,9 @@ export function printUnits(structure: Structure) {
} }
} }
export function printIHMModels(model: Model) { export function printIHMModels(model: Model) {
if (!model.coarseHierarchy.isDefined) return false; if (!model.coarseHierarchy.isDefined) return false;
console.log('IHM Models\n============='); console.log('\nIHM Models\n=============');
console.log(Table.formatToString(model.coarseHierarchy.models)); console.log(Table.formatToString(model.coarseHierarchy.models));
} }
...@@ -144,10 +141,10 @@ async function run(mmcif: mmCIF_Database) { ...@@ -144,10 +141,10 @@ async function run(mmcif: mmCIF_Database) {
const models = await Model.create({ kind: 'mmCIF', data: mmcif }).run(); const models = await Model.create({ kind: 'mmCIF', data: mmcif }).run();
const structure = Structure.ofModel(models[0]); const structure = Structure.ofModel(models[0]);
printSequence(models[0]); printSequence(models[0]);
printIHMModels(models[0]); //printIHMModels(models[0]);
printUnits(structure); printUnits(structure);
printBonds(structure); //printBonds(structure);
printSecStructure(models[0]); //printSecStructure(models[0]);
} }
async function runDL(pdb: string) { async function runDL(pdb: string) {
......
...@@ -103,6 +103,15 @@ namespace Column { ...@@ -103,6 +103,15 @@ namespace Column {
return lambdaColumn(spec); return lambdaColumn(spec);
} }
/** values [min, max] (i.e. include both values) */
export function range(min: number, max: number): Column<number> {
return ofLambda({
value: i => i + min,
rowCount: Math.max(max - min + 1, 0),
schema: Schema.int
});
}
export function ofArray<T extends Column.Schema>(spec: Column.ArraySpec<T>): Column<T['T']> { export function ofArray<T extends Column.Schema>(spec: Column.ArraySpec<T>): Column<T['T']> {
return arrayColumn(spec); return arrayColumn(spec);
} }
......
/**
* Copyright (c) 2018 mol* contributors, licensed under MIT, See LICENSE file for more info.
*
* @author David Sehnal <david.sehnal@gmail.com>
*/
export * from './sequence/sequence'
\ No newline at end of file
// TODO /**
\ No newline at end of file * Copyright (c) 2018 mol* contributors, licensed under MIT, See LICENSE file for more info.
*
* @author David Sehnal <david.sehnal@gmail.com>
*/
export type AminoAlphabet =
| 'H' | 'R' | 'K' | 'I' | 'F' | 'L' | 'W' | 'A' | 'M' | 'P' | 'C' | 'N' | 'V' | 'G' | 'S' | 'Q' | 'Y' | 'D' | 'E' | 'T' | 'U' | 'O'
| 'X' /** = Unknown */
export type NuclecicAlphabet =
| 'A' | 'C' | 'G' | 'T' | 'U'
| 'X' /** = Unknown */
// from NGL
const ProteinOneLetterCodes: { [name: string]: AminoAlphabet } = {
'HIS': 'H',
'ARG': 'R',
'LYS': 'K',
'ILE': 'I',
'PHE': 'F',
'LEU': 'L',
'TRP': 'W',
'ALA': 'A',
'MET': 'M',
'PRO': 'P',
'CYS': 'C',
'ASN': 'N',
'VAL': 'V',
'GLY': 'G',
'SER': 'S',
'GLN': 'Q',
'TYR': 'Y',
'ASP': 'D',
'GLU': 'E',
'THR': 'T',
'SEC': 'U', // as per IUPAC definition
'PYL': 'O', // as per IUPAC definition
}
const DnaOneLetterCodes: { [name: string]: NuclecicAlphabet } = {
'DA': 'A',
'DC': 'C',
'DG': 'G',
'DT': 'T',
'DU': 'U'
}
const RnaOneLetterCodes: { [name: string]: NuclecicAlphabet } = {
'A': 'A',
'C': 'C',
'G': 'G',
'T': 'T',
'U': 'U'
}
export function getProteinOneLetterCode(residueName: string): AminoAlphabet {
const code = ProteinOneLetterCodes[residueName];
return code || 'X';
}
export function getRnaOneLetterCode(residueName: string): NuclecicAlphabet {
const code = RnaOneLetterCodes[residueName];
return code || 'X';
}
export function getDnaOneLetterCode(residueName: string): NuclecicAlphabet {
const code = DnaOneLetterCodes[residueName];
return code || 'X';
}
\ No newline at end of file
...@@ -4,12 +4,13 @@ ...@@ -4,12 +4,13 @@
* @author David Sehnal <david.sehnal@gmail.com> * @author David Sehnal <david.sehnal@gmail.com>
*/ */
// import { Column } from 'mol-data/db' import { AminoAlphabet, NuclecicAlphabet, getProteinOneLetterCode, getRnaOneLetterCode, getDnaOneLetterCode } from './constants';
// TODO import { Column } from 'mol-data/db'
interface Sequence { // TODO add mapping support to other sequence spaces, e.g. uniprot
// TODO sequence alignment (take NGL code as starting point)
} type Sequence = Sequence.Protein | Sequence.DNA | Sequence.RNA | Sequence.Generic
namespace Sequence { namespace Sequence {
export const enum Kind { export const enum Kind {
...@@ -19,14 +20,83 @@ namespace Sequence { ...@@ -19,14 +20,83 @@ namespace Sequence {
Generic = 'generic' Generic = 'generic'
} }
export type ProteinAlphabet = 'X'
export type RnaAlphabet = 'X'
export type DnaAlphabet = 'C'
export type GenericAlphabet = 'X'
export interface Base<K extends Kind, Alphabet extends string> { export interface Base<K extends Kind, Alphabet extends string> {
readonly kind: K, readonly kind: K,
readonly sequence: ReadonlyArray<Alphabet> readonly offset: number,
readonly sequence: ArrayLike<Alphabet>
}
export interface Protein extends Base<Kind.Protein, AminoAlphabet> { }
export interface RNA extends Base<Kind.RNA, NuclecicAlphabet> { }
export interface DNA extends Base<Kind.DNA, NuclecicAlphabet> { }
export interface Generic extends Base<Kind.Generic, 'X'> { }
export function create(kind: Kind, sequence: string, offset: number = 0): Sequence {
return { kind: kind as any, sequence: sequence as any, offset };
}
export function getSequenceString(seq: Sequence) {
return seq.sequence as string;
}
function determineKind(names: Column<string>) {
for (let i = 0, _i = Math.min(names.rowCount, 10); i < _i; i++) {
const name = names.value(i) || '';
if (getProteinOneLetterCode(name) !== 'X') return { kind: Kind.Protein, code: getProteinOneLetterCode };
if (getRnaOneLetterCode(name) !== 'X') return { kind: Kind.RNA, code: getRnaOneLetterCode };
if (getDnaOneLetterCode(name) !== 'X') return { kind: Kind.DNA, code: getDnaOneLetterCode };
}
return { kind: Kind.Generic, code: (v: string) => 'X' };
}
export function ofResidueNames(residueName: Column<string>, seqId: Column<number>): Sequence {
if (seqId.rowCount === 0) throw new Error('cannot be empty');
const { kind, code } = determineKind(residueName);
return new Impl(kind, residueName, seqId, code) as Sequence;
}
class Impl implements Base<any, any> {
private _offset = 0;
private _seq: string | undefined = void 0;
get offset() {
if (this._seq !== void 0) return this._offset;
this.create();
return this._offset;
}
get sequence(): any {
if (this._seq !== void 0) return this._seq;
this.create();
return this._seq;
}
private create() {
let maxSeqId = 0, minSeqId = Number.MAX_SAFE_INTEGER;
for (let i = 0, _i = this.seqId.rowCount; i < _i; i++) {
const id = this.seqId.value(i);
if (maxSeqId < id) maxSeqId = id;
if (id < minSeqId) minSeqId = id;
}
const count = maxSeqId - minSeqId + 1;
const sequenceArray = new Array(maxSeqId + 1);
for (let i = 0; i < count; i++) {
sequenceArray[i] = 'X';
}
for (let i = 0, _i = this.seqId.rowCount; i < _i; i++) {
sequenceArray[this.seqId.value(i) - minSeqId] = this.code(this.residueName.value(i) || '');
}
this._seq = sequenceArray.join('');
this._offset = minSeqId - 1;
}
constructor(public kind: Kind, private residueName: Column<string>, private seqId: Column<number>, private code: (name: string) => string) {
}
} }
} }
......
...@@ -5,10 +5,11 @@ ...@@ -5,10 +5,11 @@
*/ */
import { mmCIF_Database as mmCIF } from 'mol-io/reader/cif/schema/mmcif' import { mmCIF_Database as mmCIF } from 'mol-io/reader/cif/schema/mmcif'
import Sequence from '../../properties/sequence' import StructureSequence from '../../properties/sequence'
import { Column } from 'mol-data/db'; import { Column } from 'mol-data/db';
import { AtomicHierarchy } from '../../properties/atomic'; import { AtomicHierarchy } from '../../properties/atomic';
import { Entities } from '../../properties/common'; import { Entities } from '../../properties/common';
import { Sequence } from '../../../../sequence';
// TODO how to handle microheterogeneity // TODO how to handle microheterogeneity
// see http://mmcif.wwpdb.org/dictionaries/mmcif_pdbx_v50.dic/Categories/entity_poly_seq.html // see http://mmcif.wwpdb.org/dictionaries/mmcif_pdbx_v50.dic/Categories/entity_poly_seq.html
...@@ -20,12 +21,12 @@ import { Entities } from '../../properties/common'; ...@@ -20,12 +21,12 @@ import { Entities } from '../../properties/common';
// corresponding ATOM_SITE entries should reflect this // corresponding ATOM_SITE entries should reflect this
// heterogeneity. // heterogeneity.
export function getSequence(cif: mmCIF, entities: Entities, hierarchy: AtomicHierarchy): Sequence { export function getSequence(cif: mmCIF, entities: Entities, hierarchy: AtomicHierarchy): StructureSequence {
if (!cif.entity_poly_seq._rowCount) return Sequence.fromAtomicHierarchy(entities, hierarchy); if (!cif.entity_poly_seq._rowCount) return StructureSequence.fromAtomicHierarchy(entities, hierarchy);
const { entity_id, num, mon_id } = cif.entity_poly_seq; const { entity_id, num, mon_id } = cif.entity_poly_seq;
const byEntityKey: Sequence['byEntityKey'] = {}; const byEntityKey: StructureSequence['byEntityKey'] = {};
const count = entity_id.rowCount; const count = entity_id.rowCount;
let i = 0; let i = 0;
...@@ -35,7 +36,15 @@ export function getSequence(cif: mmCIF, entities: Entities, hierarchy: AtomicHie ...@@ -35,7 +36,15 @@ export function getSequence(cif: mmCIF, entities: Entities, hierarchy: AtomicHie
i++; i++;
const id = entity_id.value(start); const id = entity_id.value(start);
byEntityKey[entities.getEntityIndex(id)] = { entityId: id, compId: Column.window(mon_id, start, i), num: Column.window(num, start, i) } const _compId = Column.window(mon_id, start, i);
const _num = Column.window(num, start, i);
byEntityKey[entities.getEntityIndex(id)] = {
entityId: id,
compId: _compId,
num: _num,
sequence: Sequence.ofResidueNames(_compId, _num)
};
} }
return { byEntityKey }; return { byEntityKey };
......
...@@ -6,7 +6,7 @@ ...@@ -6,7 +6,7 @@
import UUID from 'mol-util/uuid' import UUID from 'mol-util/uuid'
import Format from './format' import Format from './format'
import Sequence from './properties/sequence' import StructureSequence from './properties/sequence'
import { AtomicHierarchy, AtomicConformation } from './properties/atomic' import { AtomicHierarchy, AtomicConformation } from './properties/atomic'
import { ModelSymmetry } from './properties/symmetry' import { ModelSymmetry } from './properties/symmetry'
import { CoarseHierarchy, CoarseConformation } from './properties/coarse' import { CoarseHierarchy, CoarseConformation } from './properties/coarse'
...@@ -31,7 +31,7 @@ interface Model extends Readonly<{ ...@@ -31,7 +31,7 @@ interface Model extends Readonly<{
symmetry: ModelSymmetry, symmetry: ModelSymmetry,
entities: Entities, entities: Entities,
sequence: Sequence, sequence: StructureSequence,
atomicHierarchy: AtomicHierarchy, atomicHierarchy: AtomicHierarchy,
atomicConformation: AtomicConformation, atomicConformation: AtomicConformation,
......
...@@ -7,59 +7,55 @@ ...@@ -7,59 +7,55 @@
import { Column } from 'mol-data/db' import { Column } from 'mol-data/db'
import { AtomicHierarchy } from './atomic/hierarchy'; import { AtomicHierarchy } from './atomic/hierarchy';
import { Entities } from './common'; import { Entities } from './common';
import { Sequence } from '../../../sequence';
interface Sequence { interface StructureSequence {
readonly byEntityKey: { [key: number]: Sequence.Entity } readonly byEntityKey: { [key: number]: StructureSequence.Entity }
} }
// TODO lift to model/sequence/ folder namespace StructureSequence {
// TODO add one letter code sequence string
// TODO add mapping support to other sequence spaces, e.g. uniprot
// TODO add sequence kind, e.g. protein, dna, rna (alphabets?)
// TODO sequence alignment (take NGL code as starting point)
namespace Sequence {
export interface Entity { export interface Entity {
readonly entityId: string, readonly entityId: string,
readonly num: Column<number> readonly num: Column<number>,
// _entity_poly_seq.mon_id // Corresponds to _entity_poly_seq.mon_id
readonly compId: Column<string> readonly compId: Column<string>,
readonly sequence: Sequence
} }
export function fromAtomicHierarchy(entities: Entities, hierarchy: AtomicHierarchy): Sequence { export function fromAtomicHierarchy(entities: Entities, hierarchy: AtomicHierarchy): StructureSequence {
const { label_entity_id } = hierarchy.chains
const { label_comp_id, label_seq_id } = hierarchy.residues const { label_comp_id, label_seq_id } = hierarchy.residues
const { chainSegments, residueSegments } = hierarchy const { chainSegments, residueSegments } = hierarchy
const byEntityKey: Sequence['byEntityKey'] = {}; const byEntityKey: StructureSequence['byEntityKey'] = { };
// TODO get min/max of label_seq_id to handle missing residues at start and in between for (let cI = 0, _cI = hierarchy.chains._rowCount; cI < _cI; cI++) {
// note that this assumes label_seq_id is monotonically increasing const entityKey = hierarchy.entityKey[cI];
// Only for polymers, trying to mirror _entity_poly_seq
if (byEntityKey[entityKey] !== void 0 || entities.data.type.value(entityKey) !== 'polymer') continue;
const chainCount = hierarchy.chains._rowCount let start = cI;
for (let i = 0; i < chainCount; ++i) { cI++;
const entityId = label_entity_id.value(i) while (cI < _cI && entityKey === hierarchy.entityKey[cI] && entities.data.type.value(entityKey) !== 'polymer') {
const entityIndex = entities.getEntityIndex(entityId) cI++;
// TODO only for polymers, mirroring _entity_poly_seq, ok??? }
if (entities.data.type.value(i) !== 'polymer') continue cI--;
const entityKey = hierarchy.entityKey[entityIndex] const rStart = residueSegments.segmentMap[chainSegments.segments[start]];
if (byEntityKey[entityKey] !== undefined) continue const rEnd = residueSegments.segmentMap[chainSegments.segments[cI + 1]];
const start = residueSegments.segmentMap[chainSegments.segments[i]] const compId = Column.window(label_comp_id, rStart, rEnd);
let end = residueSegments.segmentMap[chainSegments.segments[i + 1]] const num = Column.window(label_seq_id, rStart, rEnd);
// TODO better way to handle end???
if (end === undefined) end = hierarchy.residues._rowCount
byEntityKey[entityKey] = { byEntityKey[entityKey] = {
entityId, entityId: entities.data.id.value(entityKey),
compId: Column.window(label_comp_id, start, end), compId,
num: Column.window(label_seq_id, start, end) num,
} sequence: Sequence.ofResidueNames(compId, num)
};
} }
return { byEntityKey } return { byEntityKey }
} }
} }
export default Sequence export default StructureSequence
\ No newline at end of file \ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment