From 6979ae82546ac1030e45f0768651caddd9d755ee Mon Sep 17 00:00:00 2001 From: David Sehnal <david.sehnal@gmail.com> Date: Tue, 3 Oct 2017 14:20:26 +0200 Subject: [PATCH] mmCIF schema --- src/reader/cif/data-model.ts | 22 ++ src/reader/cif/index.ts | 15 ++ src/reader/cif/schema.ts | 50 ++-- src/reader/cif/schema/mmcif.ts | 246 ++++++++++++++++++ src/reader/cif/text-field.ts | 58 ++--- src/reader/cif/text-parser.ts | 10 +- src/reader/common/column.ts | 16 +- src/reader/common/text/column/__token.ts | 114 -------- src/reader/common/text/column/fixed.ts | 32 +-- src/reader/common/text/column/token.ts | 55 ++++ src/reader/common/text/tokenizer.ts | 61 +++-- src/reader/gro/parser.ts | 12 +- src/reader/spec/cif.spec.ts | 4 +- ...xed-column.spec.ts => text-column.spec.ts} | 18 +- src/script.ts | 11 +- 15 files changed, 487 insertions(+), 237 deletions(-) create mode 100644 src/reader/cif/index.ts delete mode 100644 src/reader/common/text/column/__token.ts create mode 100644 src/reader/common/text/column/token.ts rename src/reader/spec/{fixed-column.spec.ts => text-column.spec.ts} (65%) diff --git a/src/reader/cif/data-model.ts b/src/reader/cif/data-model.ts index 642cf5621..4033df03a 100644 --- a/src/reader/cif/data-model.ts +++ b/src/reader/cif/data-model.ts @@ -83,4 +83,26 @@ export function DefaultUndefinedField(rowCount: number): Field { toIntArray: (p) => Column.createArray(rowCount, p).array, toFloatArray: (p) => Column.createArray(rowCount, p).array }; +} + +export function getMatrix(category: Category, field: string, rows: number, cols: number, row: number) { + const ret: number[][] = []; + for (let i = 0; i < rows; i++) { + const r: number[] = []; + for (let j = 0; j < cols; j++) { + const f = category.getField(`${field}[${i + 1}][${j + 1}]`); + r[j] = f ? f.float(row) : 0.0; + } + ret[i] = r; + } + return ret; +} + +export function getVector(category: Category, field: string, rows: number, row: number) { + const ret: number[] = []; + for (let i = 0; i < rows; i++) { + const f = category.getField(`${field}[${i + 1}]`); + ret[i] = f ? f.float(row) : 0.0; + } + return ret; } \ No newline at end of file diff --git a/src/reader/cif/index.ts b/src/reader/cif/index.ts new file mode 100644 index 000000000..c7ba6c3d5 --- /dev/null +++ b/src/reader/cif/index.ts @@ -0,0 +1,15 @@ +/* + * Copyright (c) 2017 molio contributors, licensed under MIT, See LICENSE file for more info. + * + * @author David Sehnal <david.sehnal@gmail.com> + */ + +import parseText from './text-parser' +import { apply as applySchema } from './schema' +import mmCIF from './schema/mmcif' + +export default { + parseText, + applySchema, + schema: { mmCIF } +} \ No newline at end of file diff --git a/src/reader/cif/schema.ts b/src/reader/cif/schema.ts index 1244827e1..aea1576e9 100644 --- a/src/reader/cif/schema.ts +++ b/src/reader/cif/schema.ts @@ -6,6 +6,7 @@ import * as Data from './data-model' import * as Column from '../common/column' +import StringPool from '../../utils/short-string-pool' /** * A schema defines the shape of categories and fields. @@ -47,27 +48,21 @@ export type Category<Fields> = Fields & { } export namespace Category { - export type Schema = { '@alias'?: string } & { [field: string]: Field.Schema<any> } + export type Schema = { [field: string]: Field.Schema<any> } export type Instance<T extends Schema> = Category<{ [F in keyof T]: Column.Column<T[F]['type']> }> } -// export interface Field<T> { -// readonly isDefined: boolean, -// value(row: number): T, -// presence(row: number): Data.ValuePresence, -// areValuesEqual(rowA: number, rowB: number): boolean, -// stringEquals(row: number, value: string | null): boolean, -// /** Converts the selected row range to an array. ctor might or might not be called depedning on the source data format. */ -// toArray(params?: Column.ToArrayParams): ReadonlyArray<T> -// } - export namespace Field { - export interface Schema<T> { type: T, ctor: (field: Data.Field) => Column.Column<T>, undefinedField: (c: number) => Data.Field, alias?: string }; + export interface Schema<T> { type: T, ctor: (field: Data.Field, category: Data.Category, key: string) => Column.Column<T>, undefinedField: (c: number) => Data.Field, alias?: string }; export interface Spec { undefinedField?: (c: number) => Data.Field, alias?: string } + export function alias(name: string): Schema<any> { return { alias: name } as any; } + export function pooledStr(spec?: Spec) { return createSchema(spec, PooledStr); } export function str(spec?: Spec) { return createSchema(spec, Str); } export function int(spec?: Spec) { return createSchema(spec, Int); } export function float(spec?: Spec) { return createSchema(spec, Float); } + export function vector(rows: number, spec?: Spec) { return createSchema(spec, Vector(rows)); } + export function matrix(rows: number, cols: number, spec?: Spec) { return createSchema(spec, Matrix(rows, cols)); } function create<T>(field: Data.Field, value: (row: number) => T, toArray: Column.Column<T>['toArray']): Column.Column<T> { const presence = field.presence; @@ -81,11 +76,31 @@ export namespace Field { }; } + function PooledStr(field: Data.Field) { + const pool = StringPool.create(); + const value = (row: number) => StringPool.get(pool, field.str(row)); + const array = (params?: Column.ToArrayParams) => Column.createAndFillArray(field.rowCount, value, params); + return create<string>(field, value, array); + } function Str(field: Data.Field) { return create(field, field.str, field.toStringArray); } function Int(field: Data.Field) { return create(field, field.int, field.toIntArray); } function Float(field: Data.Field) { return create(field, field.float, field.toFloatArray); } - function createSchema<T>(spec: Spec | undefined, ctor: (field: Data.Field) => Column.Column<T>): Schema<T> { + function Vector(rows: number) { + return function(field: Data.Field, category: Data.Category, key: string) { + const value = (row: number) => Data.getVector(category, key, rows, row); + return create(field, value, params => Column.createAndFillArray(field.rowCount, value, params)); + } + } + + function Matrix(rows: number, cols: number) { + return function(field: Data.Field, category: Data.Category, key: string) { + const value = (row: number) => Data.getMatrix(category, key, rows, cols, row); + return create(field, value, params => Column.createAndFillArray(field.rowCount, value, params)); + } + } + + function createSchema<T>(spec: Spec | undefined, ctor: (field: Data.Field, category: Data.Category, key: string) => Column.Column<T>): Schema<T> { return { type: 0 as any, ctor, undefinedField: (spec && spec.undefinedField) || Data.DefaultUndefinedField, alias: spec && spec.alias }; } } @@ -111,8 +126,9 @@ class _Category implements Category<any> { // tslint:disable-line:class-name Object.defineProperty(this, k, { get: function() { if (cache[k]) return cache[k]; - const field = _category.getField(s.alias || k) || s.undefinedField(_category.rowCount); - cache[k] = s.ctor(field); + const name = s.alias || k; + const field = _category.getField(name) || s.undefinedField(_category.rowCount); + cache[k] = s.ctor(field, _category, name); return cache[k]; }, enumerable: true, @@ -127,6 +143,8 @@ function createBlock(schema: Block.Schema, block: Data.Block): any { } function createCategory(key: string, schema: Category.Schema, block: Data.Block) { - const cat = block.categories[schema['@alias'] || key]; + const alias = (schema['@alias'] && schema['@alias'].alias) || key; + const name = alias[0] === '_' ? alias : '_' + alias; + const cat = block.categories[name]; return new _Category(cat || Data.Category.Empty, schema, !!cat); } \ No newline at end of file diff --git a/src/reader/cif/schema/mmcif.ts b/src/reader/cif/schema/mmcif.ts index e69de29bb..fdca87dc9 100644 --- a/src/reader/cif/schema/mmcif.ts +++ b/src/reader/cif/schema/mmcif.ts @@ -0,0 +1,246 @@ +/* + * Copyright (c) 2017 molio contributors, licensed under MIT, See LICENSE file for more info. + * + * @author David Sehnal <david.sehnal@gmail.com> + */ + +import { Field } from '../schema' + +const pooledStr = Field.pooledStr(); +const str = Field.str(); +const int = Field.int(); +const float = Field.float(); + +const entry = { + id: str +} + +const entity = { + id: str, + type: str as Field.Schema<'polymer' | 'non-polymer' | 'water'>, + src_method: str, + pdbx_description: str, + formula_weight: float, + pdbx_number_of_molecules: int, + details: str, + pdbx_mutation: str, + pdbx_fragment: str, + pdbx_ec: str +} + +const exptl = { + entry_id: str, + method: str +} + +const cell = { + entry_id: str, + length_a: float, + length_b: float, + length_c: float, + angle_alpha: float, + angle_beta: float, + angle_gamma: float, + Z_PDB: int, + pdbx_unique_axis: str +} + +const symmetry = { + entry_id: str, + space_group_name_HM: Field.str({ alias: 'space_group_name_H-M' }), + pdbx_full_space_group_name_HM: Field.str({ alias: 'pdbx_full_space_group_name_H-M' }), + cell_setting: str, + Int_Tables_number: int, + space_group_name_Hall: str +} + +const struct_conf = { + conf_type_id: str, + id: str, + pdbx_PDB_helix_id: int, + beg_label_comp_id: pooledStr, + beg_label_asym_id: pooledStr, + beg_label_seq_id: int, + pdbx_beg_PDB_ins_code: pooledStr, + end_label_comp_id: pooledStr, + end_label_asym_id: pooledStr, + end_label_seq_id: int, + pdbx_end_PDB_ins_code: pooledStr, + beg_auth_comp_id: pooledStr, + beg_auth_asym_id: pooledStr, + beg_auth_seq_id: int, + end_auth_comp_id: pooledStr, + end_auth_asym_id: pooledStr, + end_auth_seq_id: int, + pdbx_PDB_helix_class: int, + details: str, + pdbx_PDB_helix_length: int +} + +const struct_sheet_range = { + sheet_id: pooledStr, + id: int, + beg_label_comp_id: pooledStr, + beg_label_asym_id: pooledStr, + beg_label_seq_id: int, + pdbx_beg_PDB_ins_code: pooledStr, + end_label_comp_id: pooledStr, + end_label_asym_id: pooledStr, + end_label_seq_id: int, + pdbx_end_PDB_ins_code: pooledStr, + beg_auth_comp_id: pooledStr, + beg_auth_asym_id: pooledStr, + beg_auth_seq_id: int, + end_auth_comp_id: pooledStr, + end_auth_asym_id: pooledStr, + end_auth_seq_id: int +} + +type StructConnTypeId = + | 'covale' + | 'covale_base' + | 'covale_phosphate' + | 'covale_sugar' + | 'disulf' + | 'hydrog' + | 'metalc' + | 'mismat' + | 'modres' + | 'saltbr' + +type BondValueOrder = + | 'SING' + | 'DOUB' + | 'TRIP' + | 'QUAD' + +const struct_conn = { + id: str, + conn_type_id: pooledStr as Field.Schema<StructConnTypeId>, + pdbx_PDB_id: str, + ptnr1_label_asym_id: pooledStr, + ptnr1_label_comp_id: pooledStr, + ptnr1_label_seq_id: int, + ptnr1_label_atom_id: pooledStr, + pdbx_ptnr1_label_alt_id: pooledStr, + pdbx_ptnr1_PDB_ins_code: pooledStr, + pdbx_ptnr1_standard_comp_id: pooledStr, + ptnr1_symmetry: pooledStr, + ptnr2_label_asym_id: pooledStr, + ptnr2_label_comp_id: pooledStr, + ptnr2_label_seq_id: int, + ptnr2_label_atom_id: pooledStr, + pdbx_ptnr2_label_alt_id: pooledStr, + pdbx_ptnr2_PDB_ins_code: pooledStr, + ptnr1_auth_asym_id: pooledStr, + ptnr1_auth_comp_id: pooledStr, + ptnr1_auth_seq_id: int, + ptnr2_auth_asym_id: pooledStr, + ptnr2_auth_comp_id: pooledStr, + ptnr2_auth_seq_id: int, + ptnr2_symmetry: pooledStr, + pdbx_ptnr3_label_atom_id: pooledStr, + pdbx_ptnr3_label_seq_id: int, + pdbx_ptnr3_label_comp_id: pooledStr, + pdbx_ptnr3_label_asym_id: pooledStr, + pdbx_ptnr3_label_alt_id: pooledStr, + pdbx_ptnr3_PDB_ins_code: pooledStr, + details: pooledStr, + pdbx_dist_value: float, + pdbx_value_order: pooledStr as Field.Schema<BondValueOrder> +} + +const struct_conn_type = { + id: str as Field.Schema<StructConnTypeId>, + criteria: str, + reference: str +} + +const chem_comp_bond = { + comp_id: pooledStr, + pdbx_stereo_config: pooledStr, + pdbx_ordinal: int, + pdbx_aromatic_flag: pooledStr as Field.Schema<'Y' | 'N'>, + atom_id_1: pooledStr, + atom_id_2: pooledStr, + value_order: pooledStr as Field.Schema<BondValueOrder> +} + +const pdbx_struct_assembly = { + id: str, + details: str, + method_details: str, + oligomeric_details: str, + oligomeric_count: int +} + +const pdbx_struct_assembly_gen = { + assembly_id: str, + oper_expression: str, + asym_id_list: str +} + +const pdbx_struct_oper_list = { + id: str, + type: str, + name: str, + symmetry_operation: str, + matrix: Field.matrix(3, 3), + vector: Field.vector(3) +} + +const pdbx_struct_mod_residue = { + id: int, + label_asym_id: pooledStr, + label_seq_id: int, + label_comp_id: pooledStr, + auth_asym_id: pooledStr, + auth_seq_id: int, + auth_comp_id: pooledStr, + PDB_ins_code: pooledStr, + parent_comp_id: pooledStr, + details: str +} + +const atom_site = { + group_PDB: pooledStr, + id: int, + type_symbol: pooledStr, + label_atom_id: pooledStr, + label_alt_id: pooledStr, + label_comp_id: pooledStr, + label_asym_id: pooledStr, + label_entity_id: pooledStr, + label_seq_id: int, + pdbx_PDB_ins_code: pooledStr, + pdbx_formal_charge: pooledStr, + Cartn_x: float, + Cartn_y: float, + Cartn_z: float, + occupancy: float, + B_iso_or_equiv: float, + auth_atom_id: pooledStr, + auth_comp_id: pooledStr, + auth_asym_id: pooledStr, + auth_seq_id: int, + pdbx_PDB_model_num: int +} + +const schema = { + entry, + entity, + exptl, + cell, + symmetry, + struct_conf, + struct_sheet_range, + struct_conn, + struct_conn_type, + chem_comp_bond, + pdbx_struct_assembly, + pdbx_struct_assembly_gen, + pdbx_struct_oper_list, + pdbx_struct_mod_residue, + atom_site +}; +export default schema; \ No newline at end of file diff --git a/src/reader/cif/text-field.ts b/src/reader/cif/text-field.ts index f09893691..c67cc8cf7 100644 --- a/src/reader/cif/text-field.ts +++ b/src/reader/cif/text-field.ts @@ -5,30 +5,31 @@ */ import * as Column from '../common/column' +import * as TokenColumn from '../common/text/column/token' +import { Tokens } from '../common/text/tokenizer' import * as Data from './data-model' import { parseInt as fastParseInt, parseFloat as fastParseFloat } from '../common/text/number-parser' -import StringPool from '../../utils/short-string-pool' -export default function CifTextField(data: string, tokens: ArrayLike<number>, rowCount: number): Data.Field { - const stringPool = StringPool.create(); +export default function CifTextField(tokens: Tokens, rowCount: number): Data.Field { + const { data, indices } = tokens; const str: Data.Field['str'] = row => { - const ret = StringPool.get(stringPool, data.substring(tokens[2 * row], tokens[2 * row + 1])); + const ret = data.substring(indices[2 * row], indices[2 * row + 1]); if (ret === '.' || ret === '?') return ''; return ret; }; const int: Data.Field['int'] = row => { - return fastParseInt(data, tokens[2 * row], tokens[2 * row + 1]) || 0; + return fastParseInt(data, indices[2 * row], indices[2 * row + 1]) || 0; }; const float: Data.Field['float'] = row => { - return fastParseFloat(data, tokens[2 * row], tokens[2 * row + 1]) || 0; + return fastParseFloat(data, indices[2 * row], indices[2 * row + 1]) || 0; }; const presence: Data.Field['presence'] = row => { - const s = tokens[2 * row]; - if (tokens[2 * row + 1] - s !== 1) return Data.ValuePresence.Present; + const s = indices[2 * row]; + if (indices[2 * row + 1] - s !== 1) return Data.ValuePresence.Present; const v = data.charCodeAt(s); if (v === 46 /* . */) return Data.ValuePresence.NotSpecified; if (v === 63 /* ? */) return Data.ValuePresence.Unknown; @@ -42,43 +43,20 @@ export default function CifTextField(data: string, tokens: ArrayLike<number>, ro int, float, presence, - areValuesEqual(rowA, rowB) { - const aS = tokens[2 * rowA], bS = tokens[2 * rowB]; - const len = tokens[2 * rowA + 1] - aS; - if (len !== tokens[2 * rowB + 1] - bS) return false; - for (let i = 0; i < len; i++) { - if (data.charCodeAt(i + aS) !== data.charCodeAt(i + bS)) { - return false; - } - } - return true; - }, - stringEquals(row, value) { - const s = tokens[2 * row]; - if (!value) return presence(row) !== Data.ValuePresence.Present; + areValuesEqual: TokenColumn.areValuesEqualProvider(tokens), + stringEquals(row, v) { + const s = indices[2 * row]; + const value = v || ''; + if (!value && presence(row) !== Data.ValuePresence.Present) return true; const len = value.length; - if (len !== tokens[2 * row + 1] - s) return false; + if (len !== indices[2 * row + 1] - s) return false; for (let i = 0; i < len; i++) { if (data.charCodeAt(i + s) !== value.charCodeAt(i)) return false; } return true; }, - toStringArray(params) { - const { array, start } = Column.createArray(rowCount, params); - return fillArrayValues(str, array, start); - }, - toIntArray(params) { - const { array, start } = Column.createArray(rowCount, params); - return fillArrayValues(int, array, start); - }, - toFloatArray(params) { - const { array, start } = Column.createArray(rowCount, params); - return fillArrayValues(float, array, start); - } + toStringArray(params) { return Column.createAndFillArray(rowCount, str, params); }, + toIntArray(params) { return Column.createAndFillArray(rowCount, int, params); }, + toFloatArray(params) { return Column.createAndFillArray(rowCount, float, params); } } -} - -function fillArrayValues(value: (row: number) => any, target: any[], start: number) { - for (let i = 0, _e = target.length; i < _e; i++) target[i] = value(start + i); - return target; } \ No newline at end of file diff --git a/src/reader/cif/text-parser.ts b/src/reader/cif/text-parser.ts index 7ea6a0d9b..4e37ddb56 100644 --- a/src/reader/cif/text-parser.ts +++ b/src/reader/cif/text-parser.ts @@ -24,7 +24,7 @@ import * as Data from './data-model' import Field from './text-field' -import { Tokens } from '../common/text/tokenizer' +import { Tokens, TokenBuilder } from '../common/text/tokenizer' import Result from '../result' /** @@ -430,7 +430,7 @@ function handleSingle(tokenizer: TokenizerState, categories: { [name: string]: D errorMessage: 'Expected value.' } } - fields[fieldName] = Field(tokenizer.data, [tokenizer.currentTokenStart, tokenizer.currentTokenEnd], 1); + fields[fieldName] = Field({ data: tokenizer.data, indices: [tokenizer.currentTokenStart, tokenizer.currentTokenEnd], count: 1 }, 1); moveNext(tokenizer); } @@ -461,11 +461,11 @@ function handleLoop(tokenizer: TokenizerState, categories: { [name: string]: Dat const rowCountEstimate = name === '_atom_site' ? (tokenizer.data.length / 100) | 0 : 32; const tokens: Tokens[] = []; const fieldCount = fieldNames.length; - for (let i = 0; i < fieldCount; i++) tokens[i] = Tokens.create(rowCountEstimate); + for (let i = 0; i < fieldCount; i++) tokens[i] = TokenBuilder.create(tokenizer, rowCountEstimate); let tokenCount = 0; while (tokenizer.currentTokenType === CifTokenType.Value) { - Tokens.add(tokens[(tokenCount++) % fieldCount], tokenizer.currentTokenStart, tokenizer.currentTokenEnd); + TokenBuilder.add(tokens[(tokenCount++) % fieldCount], tokenizer.currentTokenStart, tokenizer.currentTokenEnd); moveNext(tokenizer); } @@ -480,7 +480,7 @@ function handleLoop(tokenizer: TokenizerState, categories: { [name: string]: Dat const rowCount = (tokenCount / fieldCount) | 0; const fields = Object.create(null); for (let i = 0; i < fieldCount; i++) { - fields[fieldNames[i]] = Field(tokenizer.data, tokens[i].indices, rowCount); + fields[fieldNames[i]] = Field(tokens[i], rowCount); } categories[name] = Data.Category(rowCount, fields); diff --git a/src/reader/common/column.ts b/src/reader/common/column.ts index dab5ac063..1c27a911e 100644 --- a/src/reader/common/column.ts +++ b/src/reader/common/column.ts @@ -53,4 +53,18 @@ export function createArray(rowCount: number, params?: ToArrayParams) { const s = typeof start !== 'undefined' ? Math.max(Math.min(start, rowCount - 1), 0) : 0; const e = typeof end !== 'undefined' ? Math.min(end, rowCount) : rowCount; return { array: new c(e - s) as any[], start: s, end: e }; -} \ No newline at end of file +} + +/** A helped function for Column.toArray */ +export function fillArrayValues(value: (row: number) => any, target: any[], start: number) { + for (let i = 0, _e = target.length; i < _e; i++) target[i] = value(start + i); + return target; +} + +/** A helped function for Column.toArray */ +export function createAndFillArray(rowCount: number, value: (row: number) => any, params?: ToArrayParams) { + const { array, start } = createArray(rowCount, params); + return fillArrayValues(value, array, start); +} + + diff --git a/src/reader/common/text/column/__token.ts b/src/reader/common/text/column/__token.ts deleted file mode 100644 index 87326c26d..000000000 --- a/src/reader/common/text/column/__token.ts +++ /dev/null @@ -1,114 +0,0 @@ -// /* -// * Copyright (c) 2017 molio contributors, licensed under MIT, See LICENSE file for more info. -// * -// * @author David Sehnal <david.sehnal@gmail.com> -// */ - -// import * as Data from '../../../../data/data' -// import { parseInt as fastParseInt, parseFloat as fastParseFloat } from '../number-parser' -// import { Tokens } from '../tokenizer' -// import ShortStringPool from '../../../../utils/short-string-pool' - -// export function createTokenFields(data: string, fields: string[], tokens: Tokens): { [name: string]: Data.Field } { -// const fi: TokenFieldInfo = { data, fieldCount: fields.length, tokens: tokens.indices }; -// const categoryFields = Object.create(null); -// for (let i = 0; i < fi.fieldCount; ++i) { -// categoryFields[fields[i]] = TokenField(fi, i); -// } -// return categoryFields; -// } - -// export interface TokenFieldInfo { -// data: string, -// tokens: ArrayLike<number>, -// fieldCount: number, -// isCif?: boolean -// } - -// export function TokenField(info: TokenFieldInfo, index: number): Data.Field { -// const { data, tokens, fieldCount, isCif = false } = info; -// const stringPool = ShortStringPool.create(); - -// const str: Data.Field['str'] = isCif ? row => { -// const i = (row * fieldCount + index) * 2; -// const ret = ShortStringPool.get(stringPool, data.substring(tokens[i], tokens[i + 1])); -// if (ret === '.' || ret === '?') return null; -// return ret; -// } : row => { -// const i = (row * fieldCount + index) * 2; -// return ShortStringPool.get(stringPool, data.substring(tokens[i], tokens[i + 1])); -// }; - -// const int: Data.Field['int'] = row => { -// const i = (row * fieldCount + index) * 2; -// return fastParseInt(data, tokens[i], tokens[i + 1]) || 0; -// }; - -// const float: Data.Field['float'] = row => { -// const i = (row * fieldCount + index) * 2; -// return fastParseFloat(data, tokens[i], tokens[i + 1]) || 0; -// }; - -// const presence: Data.Field['presence'] = isCif ? row => { -// const i = 2 * (row * fieldCount + index); -// const s = tokens[i]; -// if (tokens[i + 1] - s !== 1) return Data.ValuePresence.Present; -// const v = data.charCodeAt(s); -// if (v === 46 /* . */) return Data.ValuePresence.NotSpecified; -// if (v === 63 /* ? */) return Data.ValuePresence.Unknown; -// return Data.ValuePresence.Present; -// } : row => { -// const i = 2 * (row * fieldCount + index); -// return tokens[i] === tokens[i + 1] ? Data.ValuePresence.NotSpecified : Data.ValuePresence.Present -// }; - -// return { -// isDefined: true, -// str, -// int, -// float, -// value: str, -// presence, -// areValuesEqual: (rowA, rowB) => { -// const aI = (rowA * fieldCount + index) * 2, aS = tokens[aI]; -// const bI = (rowB * fieldCount + index) * 2, bS = tokens[bI]; -// const len = tokens[aI + 1] - aS; -// if (len !== tokens[bI + 1] - bS) return false; -// for (let i = 0; i < len; i++) { -// if (data.charCodeAt(i + aS) !== data.charCodeAt(i + bS)) { -// return false; -// } -// } -// return true; -// }, -// stringEquals: (row, value) => { -// const aI = (row * fieldCount + index) * 2; -// const s = tokens[aI]; -// if (!value) return presence(row) !== Data.ValuePresence.Present; -// const len = value.length; -// if (len !== tokens[aI + 1] - s) return false; -// for (let i = 0; i < len; i++) { -// if (data.charCodeAt(i + s) !== value.charCodeAt(i)) return false; -// } -// return true; -// }, -// toStringArray: (startRow, endRowExclusive, ctor) => { -// const count = endRowExclusive - startRow; -// const ret = ctor(count) as any; -// for (let i = 0; i < count; i++) { ret[i] = str(startRow + i); } -// return ret; -// }, -// toIntArray: (startRow, endRowExclusive, ctor) => { -// const count = endRowExclusive - startRow; -// const ret = ctor(count) as any; -// for (let i = 0; i < count; i++) { ret[i] = int(startRow + i); } -// return ret; -// }, -// toFloatArray: (startRow, endRowExclusive, ctor) => { -// const count = endRowExclusive - startRow; -// const ret = ctor(count) as any; -// for (let i = 0; i < count; i++) { ret[i] = float(startRow + i); } -// return ret; -// } -// } -// } \ No newline at end of file diff --git a/src/reader/common/text/column/fixed.ts b/src/reader/common/text/column/fixed.ts index bb307f969..7caf97e35 100644 --- a/src/reader/common/text/column/fixed.ts +++ b/src/reader/common/text/column/fixed.ts @@ -4,46 +4,41 @@ * @author David Sehnal <david.sehnal@gmail.com> */ -import { Column, ColumnType, createArray } from '../../column' -import { trimStr, Lines } from '../tokenizer' +import { Column, ColumnType, createAndFillArray } from '../../column' +import { trimStr, Tokens } from '../tokenizer' import { parseIntSkipLeadingWhitespace, parseFloatSkipLeadingWhitespace } from '../number-parser' import StringPool from '../../../../utils/short-string-pool' -export default function FixedColumnProvider(lines: Lines) { +export default function FixedColumnProvider(lines: Tokens) { return function<T extends ColumnType>(offset: number, width: number, type: T) { return FixedColumn(lines, offset, width, type); } } -function fillArrayValues(value: (row: number) => any, target: any[], start: number) { - for (let i = 0, _e = target.length; i < _e; i++) target[i] = value(start + i); - return target; -} - -export function FixedColumn<T extends ColumnType>(lines: Lines, offset: number, width: number, type: T): Column<T['@type']> { - const { data, tokens, count: rowCount } = lines; +export function FixedColumn<T extends ColumnType>(lines: Tokens, offset: number, width: number, type: T): Column<T['@type']> { + const { data, indices, count: rowCount } = lines; const { kind } = type; const pool = kind === 'pooled-str' ? StringPool.create() : void 0; const value: Column<T['@type']>['value'] = kind === 'str' ? row => { - let s = tokens[2 * row] + offset, le = tokens[2 * row + 1]; + let s = indices[2 * row] + offset, le = indices[2 * row + 1]; if (s >= le) return ''; let e = s + width; if (e > le) e = le; return trimStr(data, s, e); } : kind === 'pooled-str' ? row => { - let s = tokens[2 * row] + offset, le = tokens[2 * row + 1]; + let s = indices[2 * row] + offset, le = indices[2 * row + 1]; if (s >= le) return ''; let e = s + width; if (e > le) e = le; return StringPool.get(pool!, trimStr(data, s, e)); } : kind === 'int' ? row => { - const s = tokens[2 * row] + offset; - if (s > tokens[2 * row + 1]) return 0; + const s = indices[2 * row] + offset; + if (s > indices[2 * row + 1]) return 0; return parseIntSkipLeadingWhitespace(data, s, s + width); } : row => { - const s = tokens[2 * row] + offset; - if (s > tokens[2 * row + 1]) return 0; + const s = indices[2 * row] + offset; + if (s > indices[2 * row + 1]) return 0; return parseFloatSkipLeadingWhitespace(data, s, s + width); }; return { @@ -51,10 +46,7 @@ export function FixedColumn<T extends ColumnType>(lines: Lines, offset: number, rowCount, value, isValueDefined(row) { return true; }, - toArray(params) { - const { array, start } = createArray(rowCount, params); - return fillArrayValues(value, array, start); - }, + toArray(params) { return createAndFillArray(rowCount, value, params); }, areValuesEqual(rowA, rowB) { return value(rowA) === value(rowB); } diff --git a/src/reader/common/text/column/token.ts b/src/reader/common/text/column/token.ts new file mode 100644 index 000000000..c78ccc794 --- /dev/null +++ b/src/reader/common/text/column/token.ts @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2017 molio contributors, licensed under MIT, See LICENSE file for more info. + * + * @author David Sehnal <david.sehnal@gmail.com> + */ + +import { Column, ColumnType, createAndFillArray } from '../../column' +import { Tokens } from '../tokenizer' +import { parseInt as fastParseInt, parseFloat as fastParseFloat } from '../number-parser' +import StringPool from '../../../../utils/short-string-pool' + +export default function TokenColumnProvider(tokens: Tokens) { + return function<T extends ColumnType>(type: T) { + return TokenColumn(tokens, type); + } +} + +export function TokenColumn<T extends ColumnType>(tokens: Tokens, type: T): Column<T['@type']> { + const { data, indices, count: rowCount } = tokens; + const { kind } = type; + const pool = kind === 'pooled-str' ? StringPool.create() : void 0; + + const value: Column<T['@type']>['value'] = + kind === 'str' + ? row => data.substring(indices[2 * row], indices[2 * row + 1]) + : kind === 'pooled-str' + ? row => StringPool.get(pool!, data.substring(indices[2 * row], indices[2 * row + 1])) + : kind === 'int' + ? row => fastParseInt(data, indices[2 * row], indices[2 * row + 1]) || 0 + : row => fastParseFloat(data, indices[2 * row], indices[2 * row + 1]) || 0; + + return { + isDefined: true, + rowCount, + value, + isValueDefined(row) { return true; }, + toArray(params) { return createAndFillArray(rowCount, value, params); }, + areValuesEqual: areValuesEqualProvider(tokens) + }; +} + +export function areValuesEqualProvider(tokens: Tokens) { + const { data, indices } = tokens; + return function(rowA: number, rowB: number) { + const aS = indices[2 * rowA], bS = indices[2 * rowB]; + const len = indices[2 * rowA + 1] - aS; + if (len !== indices[2 * rowB + 1] - bS) return false; + for (let i = 0; i < len; i++) { + if (data.charCodeAt(i + aS) !== data.charCodeAt(i + bS)) { + return false; + } + } + return true; + } +} \ No newline at end of file diff --git a/src/reader/common/text/tokenizer.ts b/src/reader/common/text/tokenizer.ts index 97fb00ef7..326497591 100644 --- a/src/reader/common/text/tokenizer.ts +++ b/src/reader/common/text/tokenizer.ts @@ -17,10 +17,10 @@ export interface Tokenizer { currentTokenEnd: number } -export interface Lines { +export interface Tokens { data: string, count: number, - tokens: ArrayLike<number> + indices: ArrayLike<number> } export function Tokenizer(data: string): Tokenizer { @@ -80,15 +80,21 @@ export namespace Tokenizer { } /** Advance the state by the given number of lines and return line starts/ends as tokens. */ - export function readLines(state: Tokenizer, count: number): Lines { - const lineTokens = Tokens.create(count * 2); + export function readLine(state: Tokenizer): string { + markLine(state); + return getTokenString(state); + } + + /** Advance the state by the given number of lines and return line starts/ends as tokens. */ + export function readLines(state: Tokenizer, count: number): Tokens { + const lineTokens = TokenBuilder.create(state, count * 2); for (let i = 0; i < count; i++) { markLine(state); - Tokens.addUnchecked(lineTokens, state.currentTokenStart, state.currentTokenEnd); + TokenBuilder.addUnchecked(lineTokens, state.currentTokenStart, state.currentTokenEnd); } - return { data: state.data, count, tokens: lineTokens.indices }; + return { data: state.data, count, indices: lineTokens.indices }; } /** @@ -170,38 +176,43 @@ export function trimStr(data: string, start: number, end: number) { return data.substring(s, e + 1); } -export interface Tokens { - indicesLenMinus2: number, - count: number, - indices: Uint32Array -} +export namespace TokenBuilder { + interface Builder extends Tokens { + offset: number, + indices: Uint32Array, + indicesLenMinus2: number + } -export namespace Tokens { - function resize(tokens: Tokens) { + function resize(builder: Builder) { // scale the size using golden ratio, because why not. - const newBuffer = new Uint32Array((1.61 * tokens.indices.length) | 0); - newBuffer.set(tokens.indices); - tokens.indices = newBuffer; - tokens.indicesLenMinus2 = (newBuffer.length - 2) | 0; + const newBuffer = new Uint32Array((1.61 * builder.indices.length) | 0); + newBuffer.set(builder.indices); + builder.indices = newBuffer; + builder.indicesLenMinus2 = (newBuffer.length - 2) | 0; } export function add(tokens: Tokens, start: number, end: number) { - if (tokens.count > tokens.indicesLenMinus2) { - resize(tokens); + const builder = tokens as Builder; + if (builder.offset > builder.indicesLenMinus2) { + resize(builder); } - tokens.indices[tokens.count++] = start; - tokens.indices[tokens.count++] = end; + builder.indices[builder.offset++] = start; + builder.indices[builder.offset++] = end; + tokens.count++; } export function addUnchecked(tokens: Tokens, start: number, end: number) { - tokens.indices[tokens.count++] = start; - tokens.indices[tokens.count++] = end; + (tokens as Builder).indices[(tokens as Builder).offset++] = start; + (tokens as Builder).indices[(tokens as Builder).offset++] = end; + tokens.count++; } - export function create(size: number): Tokens { - return { + export function create(tokenizer: Tokenizer, size: number): Tokens { + return <Builder>{ + data: tokenizer.data, indicesLenMinus2: (size - 2) | 0, count: 0, + offset: 0, indices: new Uint32Array(size) } } diff --git a/src/reader/gro/parser.ts b/src/reader/gro/parser.ts index bca4df98a..28a1e26ca 100644 --- a/src/reader/gro/parser.ts +++ b/src/reader/gro/parser.ts @@ -40,14 +40,11 @@ function State(tokenizer: Tokenizer): State { */ function handleTitleString(state: State) { const { tokenizer, header } = state; - Tokenizer.markLine(tokenizer); - - let line = Tokenizer.getTokenString(tokenizer); + let line = Tokenizer.readLine(tokenizer); // skip potential empty lines... if (line.trim().length === 0) { - Tokenizer.markLine(tokenizer); - line = Tokenizer.getTokenString(tokenizer); + line = Tokenizer.readLine(tokenizer); } const timeOffset = line.lastIndexOf('t='); @@ -92,7 +89,7 @@ function handleAtoms(state: State): Schema.Atoms { const { tokenizer, numberOfAtoms } = state; const lines = Tokenizer.readLines(tokenizer, numberOfAtoms); - const positionSample = tokenizer.data.substring(lines.tokens[0], lines.tokens[1]).substring(20); + const positionSample = tokenizer.data.substring(lines.indices[0], lines.indices[1]).substring(20); const precisions = positionSample.match(/\.\d+/g)!; const hasVelocities = precisions.length === 6; @@ -133,8 +130,7 @@ function handleAtoms(state: State): Schema.Atoms { */ function handleBoxVectors(state: State) { const { tokenizer } = state; - Tokenizer.markLine(tokenizer); - const values = Tokenizer.getTokenString(tokenizer).trim().split(/\s+/g); + const values = Tokenizer.readLine(tokenizer).trim().split(/\s+/g); state.header.box = [+values[0], +values[1], +values[2]]; } diff --git a/src/reader/spec/cif.spec.ts b/src/reader/spec/cif.spec.ts index 80b4c1388..87950cf64 100644 --- a/src/reader/spec/cif.spec.ts +++ b/src/reader/spec/cif.spec.ts @@ -10,8 +10,8 @@ import * as Schema from '../cif/schema' const columnData = `123abc`; -const intField = TextField(columnData, [0, 1, 1, 2, 2, 3], 3); -const strField = TextField(columnData, [3, 4, 4, 5, 5, 6], 3); +const intField = TextField({ data: columnData, indices: [0, 1, 1, 2, 2, 3], count: 3 }, 3); +const strField = TextField({ data: columnData, indices: [3, 4, 4, 5, 5, 6], count: 3 }, 3); const testBlock = Data.Block({ 'atoms': Data.Category(3, { diff --git a/src/reader/spec/fixed-column.spec.ts b/src/reader/spec/text-column.spec.ts similarity index 65% rename from src/reader/spec/fixed-column.spec.ts rename to src/reader/spec/text-column.spec.ts index c91bfb742..e37d18d6a 100644 --- a/src/reader/spec/fixed-column.spec.ts +++ b/src/reader/spec/text-column.spec.ts @@ -6,6 +6,7 @@ */ import FixedColumn from '../common/text/column/fixed' +import TokenColumn from '../common/text/column/token' import { ColumnType } from '../common/column' const lines = [ @@ -16,7 +17,7 @@ const lines = [ ' 5' ] -const data = lines.join('\n'); +const linesData = lines.join('\n'); const linesTokens = (function () { const tokens: number[] = []; @@ -25,12 +26,12 @@ const linesTokens = (function () { tokens.push(last, last + l.length); last += l.length + 1; } - if (tokens[tokens.length - 1] > data.length) tokens[tokens.length - 1] = data.length; + if (tokens[tokens.length - 1] > linesData.length) tokens[tokens.length - 1] = linesData.length; return tokens; }()); describe('fixed text column', () => { - const col = FixedColumn({ data, tokens: linesTokens, count: lines.length }); + const col = FixedColumn({ data: linesData, indices: linesTokens, count: lines.length }); const col1 = col(0, 5, ColumnType.float); const col2 = col(5, 4, ColumnType.str); it('number', () => { @@ -48,3 +49,14 @@ describe('fixed text column', () => { expect(col2.value(4)).toBe(''); }) }); + +describe('token text column', () => { + const tokensData = '321'; + const col = TokenColumn({ data: tokensData, indices: [0, 1, 1, 2, 2, 3], count: 3 }); + const col1 = col(ColumnType.int); + it('number', () => { + expect(col1.value(0)).toBe(3); + expect(col1.value(1)).toBe(2); + expect(col1.value(2)).toBe(1); + }) +}); diff --git a/src/script.ts b/src/script.ts index ea36fe76a..06615a362 100644 --- a/src/script.ts +++ b/src/script.ts @@ -8,7 +8,7 @@ import * as fs from 'fs' import Gro from './reader/gro/parser' -import CIF from './reader/cif/text-parser' +import CIF from './reader/cif/index' // const file = '1crn.gro' // const file = 'water.gro' @@ -81,7 +81,7 @@ export function _cif() { } console.time('parseCIF'); - const parsed = CIF(input); + const parsed = CIF.parseText(input); console.timeEnd('parseCIF'); if (parsed.isError) { console.log(parsed); @@ -92,7 +92,12 @@ export function _cif() { const atom_site = data.categories._atom_site; console.log(atom_site.getField('Cartn_x')!.float(0)); - console.log(atom_site.getField('label_atom_id')!.toStringArray()); + //console.log(atom_site.getField('label_atom_id')!.toStringArray()); + + const mmcif = CIF.applySchema(CIF.schema.mmCIF, data); + console.log(mmcif.atom_site.Cartn_x.value(0)); + console.log(mmcif.entity.type.toArray()); + console.log(mmcif.pdbx_struct_oper_list.matrix.value(0)); }); } -- GitLab