Skip to content
Snippets Groups Projects
Commit e3f49b61 authored by David Sehnal's avatar David Sehnal
Browse files

Sequence as more 1st class citizen

parent 369ff8e5
No related branches found
No related tags found
No related merge requests found
......@@ -50,7 +50,7 @@ export function residueLabel(model: Model, rI: number) {
}
export function printSecStructure(model: Model) {
console.log('Secondary Structure\n=============');
console.log('\nSecondary Structure\n=============');
const { residues } = model.atomicHierarchy;
const { type, key } = model.properties.secondaryStructure;
......@@ -90,20 +90,18 @@ export function printBonds(structure: Structure) {
}
export function printSequence(model: Model) {
console.log('Sequence\n=============');
console.log('\nSequence\n=============');
const { byEntityKey } = model.sequence;
for (const key of Object.keys(byEntityKey)) {
const seq = byEntityKey[+key];
console.log(`${seq.entityId} (${seq.num.value(0)}, ${seq.num.value(seq.num.rowCount - 1)}) (${seq.compId.value(0)}, ${seq.compId.value(seq.compId.rowCount - 1)})`);
// for (let i = 0; i < seq.compId.rowCount; i++) {
// console.log(`${seq.entityId} ${seq.num.value(i)} ${seq.compId.value(i)}`);
// }
console.log(`${seq.entityId} (${seq.sequence.kind} ${seq.num.value(0)} (offset ${seq.sequence.offset}), ${seq.num.value(seq.num.rowCount - 1)}) (${seq.compId.value(0)}, ${seq.compId.value(seq.compId.rowCount - 1)})`);
console.log(`${seq.sequence.sequence}`);
}
console.log();
}
export function printUnits(structure: Structure) {
console.log('Units\n=============');
console.log('\nUnits\n=============');
const l = Element.Location();
for (const unit of structure.units) {
......@@ -133,10 +131,9 @@ export function printUnits(structure: Structure) {
}
}
export function printIHMModels(model: Model) {
if (!model.coarseHierarchy.isDefined) return false;
console.log('IHM Models\n=============');
console.log('\nIHM Models\n=============');
console.log(Table.formatToString(model.coarseHierarchy.models));
}
......@@ -144,10 +141,10 @@ async function run(mmcif: mmCIF_Database) {
const models = await Model.create({ kind: 'mmCIF', data: mmcif }).run();
const structure = Structure.ofModel(models[0]);
printSequence(models[0]);
printIHMModels(models[0]);
//printIHMModels(models[0]);
printUnits(structure);
printBonds(structure);
printSecStructure(models[0]);
//printBonds(structure);
//printSecStructure(models[0]);
}
async function runDL(pdb: string) {
......
......@@ -103,6 +103,15 @@ namespace Column {
return lambdaColumn(spec);
}
/** values [min, max] (i.e. include both values) */
export function range(min: number, max: number): Column<number> {
return ofLambda({
value: i => i + min,
rowCount: Math.max(max - min + 1, 0),
schema: Schema.int
});
}
export function ofArray<T extends Column.Schema>(spec: Column.ArraySpec<T>): Column<T['T']> {
return arrayColumn(spec);
}
......
/**
* Copyright (c) 2018 mol* contributors, licensed under MIT, See LICENSE file for more info.
*
* @author David Sehnal <david.sehnal@gmail.com>
*/
export * from './sequence/sequence'
\ No newline at end of file
// TODO
\ No newline at end of file
/**
* Copyright (c) 2018 mol* contributors, licensed under MIT, See LICENSE file for more info.
*
* @author David Sehnal <david.sehnal@gmail.com>
*/
export type AminoAlphabet =
| 'H' | 'R' | 'K' | 'I' | 'F' | 'L' | 'W' | 'A' | 'M' | 'P' | 'C' | 'N' | 'V' | 'G' | 'S' | 'Q' | 'Y' | 'D' | 'E' | 'T' | 'U' | 'O'
| 'X' /** = Unknown */
export type NuclecicAlphabet =
| 'A' | 'C' | 'G' | 'T' | 'U'
| 'X' /** = Unknown */
// from NGL
const ProteinOneLetterCodes: { [name: string]: AminoAlphabet } = {
'HIS': 'H',
'ARG': 'R',
'LYS': 'K',
'ILE': 'I',
'PHE': 'F',
'LEU': 'L',
'TRP': 'W',
'ALA': 'A',
'MET': 'M',
'PRO': 'P',
'CYS': 'C',
'ASN': 'N',
'VAL': 'V',
'GLY': 'G',
'SER': 'S',
'GLN': 'Q',
'TYR': 'Y',
'ASP': 'D',
'GLU': 'E',
'THR': 'T',
'SEC': 'U', // as per IUPAC definition
'PYL': 'O', // as per IUPAC definition
}
const DnaOneLetterCodes: { [name: string]: NuclecicAlphabet } = {
'DA': 'A',
'DC': 'C',
'DG': 'G',
'DT': 'T',
'DU': 'U'
}
const RnaOneLetterCodes: { [name: string]: NuclecicAlphabet } = {
'A': 'A',
'C': 'C',
'G': 'G',
'T': 'T',
'U': 'U'
}
export function getProteinOneLetterCode(residueName: string): AminoAlphabet {
const code = ProteinOneLetterCodes[residueName];
return code || 'X';
}
export function getRnaOneLetterCode(residueName: string): NuclecicAlphabet {
const code = RnaOneLetterCodes[residueName];
return code || 'X';
}
export function getDnaOneLetterCode(residueName: string): NuclecicAlphabet {
const code = DnaOneLetterCodes[residueName];
return code || 'X';
}
\ No newline at end of file
......@@ -4,12 +4,13 @@
* @author David Sehnal <david.sehnal@gmail.com>
*/
// import { Column } from 'mol-data/db'
// TODO
import { AminoAlphabet, NuclecicAlphabet, getProteinOneLetterCode, getRnaOneLetterCode, getDnaOneLetterCode } from './constants';
import { Column } from 'mol-data/db'
interface Sequence {
// TODO add mapping support to other sequence spaces, e.g. uniprot
// TODO sequence alignment (take NGL code as starting point)
}
type Sequence = Sequence.Protein | Sequence.DNA | Sequence.RNA | Sequence.Generic
namespace Sequence {
export const enum Kind {
......@@ -19,14 +20,83 @@ namespace Sequence {
Generic = 'generic'
}
export type ProteinAlphabet = 'X'
export type RnaAlphabet = 'X'
export type DnaAlphabet = 'C'
export type GenericAlphabet = 'X'
export interface Base<K extends Kind, Alphabet extends string> {
readonly kind: K,
readonly sequence: ReadonlyArray<Alphabet>
readonly offset: number,
readonly sequence: ArrayLike<Alphabet>
}
export interface Protein extends Base<Kind.Protein, AminoAlphabet> { }
export interface RNA extends Base<Kind.RNA, NuclecicAlphabet> { }
export interface DNA extends Base<Kind.DNA, NuclecicAlphabet> { }
export interface Generic extends Base<Kind.Generic, 'X'> { }
export function create(kind: Kind, sequence: string, offset: number = 0): Sequence {
return { kind: kind as any, sequence: sequence as any, offset };
}
export function getSequenceString(seq: Sequence) {
return seq.sequence as string;
}
function determineKind(names: Column<string>) {
for (let i = 0, _i = Math.min(names.rowCount, 10); i < _i; i++) {
const name = names.value(i) || '';
if (getProteinOneLetterCode(name) !== 'X') return { kind: Kind.Protein, code: getProteinOneLetterCode };
if (getRnaOneLetterCode(name) !== 'X') return { kind: Kind.RNA, code: getRnaOneLetterCode };
if (getDnaOneLetterCode(name) !== 'X') return { kind: Kind.DNA, code: getDnaOneLetterCode };
}
return { kind: Kind.Generic, code: (v: string) => 'X' };
}
export function ofResidueNames(residueName: Column<string>, seqId: Column<number>): Sequence {
if (seqId.rowCount === 0) throw new Error('cannot be empty');
const { kind, code } = determineKind(residueName);
return new Impl(kind, residueName, seqId, code) as Sequence;
}
class Impl implements Base<any, any> {
private _offset = 0;
private _seq: string | undefined = void 0;
get offset() {
if (this._seq !== void 0) return this._offset;
this.create();
return this._offset;
}
get sequence(): any {
if (this._seq !== void 0) return this._seq;
this.create();
return this._seq;
}
private create() {
let maxSeqId = 0, minSeqId = Number.MAX_SAFE_INTEGER;
for (let i = 0, _i = this.seqId.rowCount; i < _i; i++) {
const id = this.seqId.value(i);
if (maxSeqId < id) maxSeqId = id;
if (id < minSeqId) minSeqId = id;
}
const count = maxSeqId - minSeqId + 1;
const sequenceArray = new Array(maxSeqId + 1);
for (let i = 0; i < count; i++) {
sequenceArray[i] = 'X';
}
for (let i = 0, _i = this.seqId.rowCount; i < _i; i++) {
sequenceArray[this.seqId.value(i) - minSeqId] = this.code(this.residueName.value(i) || '');
}
this._seq = sequenceArray.join('');
this._offset = minSeqId - 1;
}
constructor(public kind: Kind, private residueName: Column<string>, private seqId: Column<number>, private code: (name: string) => string) {
}
}
}
......
......@@ -5,10 +5,11 @@
*/
import { mmCIF_Database as mmCIF } from 'mol-io/reader/cif/schema/mmcif'
import Sequence from '../../properties/sequence'
import StructureSequence from '../../properties/sequence'
import { Column } from 'mol-data/db';
import { AtomicHierarchy } from '../../properties/atomic';
import { Entities } from '../../properties/common';
import { Sequence } from '../../../../sequence';
// TODO how to handle microheterogeneity
// see http://mmcif.wwpdb.org/dictionaries/mmcif_pdbx_v50.dic/Categories/entity_poly_seq.html
......@@ -20,12 +21,12 @@ import { Entities } from '../../properties/common';
// corresponding ATOM_SITE entries should reflect this
// heterogeneity.
export function getSequence(cif: mmCIF, entities: Entities, hierarchy: AtomicHierarchy): Sequence {
if (!cif.entity_poly_seq._rowCount) return Sequence.fromAtomicHierarchy(entities, hierarchy);
export function getSequence(cif: mmCIF, entities: Entities, hierarchy: AtomicHierarchy): StructureSequence {
if (!cif.entity_poly_seq._rowCount) return StructureSequence.fromAtomicHierarchy(entities, hierarchy);
const { entity_id, num, mon_id } = cif.entity_poly_seq;
const byEntityKey: Sequence['byEntityKey'] = {};
const byEntityKey: StructureSequence['byEntityKey'] = {};
const count = entity_id.rowCount;
let i = 0;
......@@ -35,7 +36,15 @@ export function getSequence(cif: mmCIF, entities: Entities, hierarchy: AtomicHie
i++;
const id = entity_id.value(start);
byEntityKey[entities.getEntityIndex(id)] = { entityId: id, compId: Column.window(mon_id, start, i), num: Column.window(num, start, i) }
const _compId = Column.window(mon_id, start, i);
const _num = Column.window(num, start, i);
byEntityKey[entities.getEntityIndex(id)] = {
entityId: id,
compId: _compId,
num: _num,
sequence: Sequence.ofResidueNames(_compId, _num)
};
}
return { byEntityKey };
......
......@@ -6,7 +6,7 @@
import UUID from 'mol-util/uuid'
import Format from './format'
import Sequence from './properties/sequence'
import StructureSequence from './properties/sequence'
import { AtomicHierarchy, AtomicConformation } from './properties/atomic'
import { ModelSymmetry } from './properties/symmetry'
import { CoarseHierarchy, CoarseConformation } from './properties/coarse'
......@@ -31,7 +31,7 @@ interface Model extends Readonly<{
symmetry: ModelSymmetry,
entities: Entities,
sequence: Sequence,
sequence: StructureSequence,
atomicHierarchy: AtomicHierarchy,
atomicConformation: AtomicConformation,
......
......@@ -7,59 +7,55 @@
import { Column } from 'mol-data/db'
import { AtomicHierarchy } from './atomic/hierarchy';
import { Entities } from './common';
import { Sequence } from '../../../sequence';
interface Sequence {
readonly byEntityKey: { [key: number]: Sequence.Entity }
interface StructureSequence {
readonly byEntityKey: { [key: number]: StructureSequence.Entity }
}
// TODO lift to model/sequence/ folder
// TODO add one letter code sequence string
// TODO add mapping support to other sequence spaces, e.g. uniprot
// TODO add sequence kind, e.g. protein, dna, rna (alphabets?)
// TODO sequence alignment (take NGL code as starting point)
namespace Sequence {
namespace StructureSequence {
export interface Entity {
readonly entityId: string,
readonly num: Column<number>
// _entity_poly_seq.mon_id
readonly compId: Column<string>
readonly num: Column<number>,
// Corresponds to _entity_poly_seq.mon_id
readonly compId: Column<string>,
readonly sequence: Sequence
}
export function fromAtomicHierarchy(entities: Entities, hierarchy: AtomicHierarchy): Sequence {
const { label_entity_id } = hierarchy.chains
export function fromAtomicHierarchy(entities: Entities, hierarchy: AtomicHierarchy): StructureSequence {
const { label_comp_id, label_seq_id } = hierarchy.residues
const { chainSegments, residueSegments } = hierarchy
const byEntityKey: Sequence['byEntityKey'] = {};
const byEntityKey: StructureSequence['byEntityKey'] = { };
// TODO get min/max of label_seq_id to handle missing residues at start and in between
// note that this assumes label_seq_id is monotonically increasing
for (let cI = 0, _cI = hierarchy.chains._rowCount; cI < _cI; cI++) {
const entityKey = hierarchy.entityKey[cI];
// Only for polymers, trying to mirror _entity_poly_seq
if (byEntityKey[entityKey] !== void 0 || entities.data.type.value(entityKey) !== 'polymer') continue;
const chainCount = hierarchy.chains._rowCount
for (let i = 0; i < chainCount; ++i) {
const entityId = label_entity_id.value(i)
const entityIndex = entities.getEntityIndex(entityId)
// TODO only for polymers, mirroring _entity_poly_seq, ok???
if (entities.data.type.value(i) !== 'polymer') continue
let start = cI;
cI++;
while (cI < _cI && entityKey === hierarchy.entityKey[cI] && entities.data.type.value(entityKey) !== 'polymer') {
cI++;
}
cI--;
const entityKey = hierarchy.entityKey[entityIndex]
if (byEntityKey[entityKey] !== undefined) continue
const rStart = residueSegments.segmentMap[chainSegments.segments[start]];
const rEnd = residueSegments.segmentMap[chainSegments.segments[cI + 1]];
const start = residueSegments.segmentMap[chainSegments.segments[i]]
let end = residueSegments.segmentMap[chainSegments.segments[i + 1]]
// TODO better way to handle end???
if (end === undefined) end = hierarchy.residues._rowCount
const compId = Column.window(label_comp_id, rStart, rEnd);
const num = Column.window(label_seq_id, rStart, rEnd);
byEntityKey[entityKey] = {
entityId,
compId: Column.window(label_comp_id, start, end),
num: Column.window(label_seq_id, start, end)
}
entityId: entities.data.id.value(entityKey),
compId,
num,
sequence: Sequence.ofResidueNames(compId, num)
};
}
return { byEntityKey }
}
}
export default Sequence
\ No newline at end of file
export default StructureSequence
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment