Skip to content
Snippets Groups Projects
Commit c5b1b84e authored by David Sehnal's avatar David Sehnal
Browse files

Data model improvements

parent c236b5d0
No related branches found
No related tags found
No related merge requests found
......@@ -60,9 +60,27 @@ export interface Field {
presence(row: number): ValuePresence,
areValuesEqual(rowA: number, rowB: number): boolean,
stringEquals(row: number, value: string | null): boolean,
stringEquals(row: number, value: string): boolean,
toStringArray(ctor?: (size: number) => Column.ArrayType, startRow?: number, endRowExclusive?: number): ReadonlyArray<string>,
toIntArray(ctor?: (size: number) => Column.ArrayType, startRow?: number, endRowExclusive?: number): ReadonlyArray<number>,
toFloatArray(ctor?: (size: number) => Column.ArrayType, startRow?: number, endRowExclusive?: number): ReadonlyArray<number>
toStringArray(params?: Column.ToArrayParams): ReadonlyArray<string>,
toIntArray(params?: Column.ToArrayParams): ReadonlyArray<number>,
toFloatArray(params?: Column.ToArrayParams): ReadonlyArray<number>
}
/**
 * Creates the stand-in `Field` for a column that carries no data.
 *
 * Every row reads as an empty string / zero, reports
 * `ValuePresence.NotSpecified`, and compares equal to every other row.
 * `stringEquals` matches only when the probed value is `null`.
 * The `to*Array` methods hand back the array allocated by
 * `Column.createArray` for the requested range without writing into it.
 *
 * @param rowCount Number of rows the field reports and sizes its arrays by.
 * @returns A field whose `isDefined` flag is `false`.
 */
export function DefaultUndefinedField(rowCount: number): Field {
    return {
        isDefined: false,
        rowCount,
        // Typed accessors: constant defaults regardless of the row.
        str: (_row) => { return ''; },
        int: (_row) => { return 0; },
        float: (_row) => { return 0; },
        presence: (_row) => { return ValuePresence.NotSpecified; },
        // All rows are treated as holding the same value.
        areValuesEqual: (_rowA, _rowB) => { return true; },
        stringEquals: (_row, value) => { return value === null; },
        toStringArray: (params) => { return Column.createArray(rowCount, params).array; },
        toIntArray: (params) => { return Column.createArray(rowCount, params).array; },
        toFloatArray: (params) => { return Column.createArray(rowCount, params).array; }
    };
}
\ No newline at end of file
......@@ -48,55 +48,45 @@ export type Category<Fields> = Fields & {
export namespace Category {
export type Schema = { '@alias'?: string } & { [field: string]: Field.Schema<any> }
export type Instance<T extends Schema> = Category<{ [F in keyof T]: Field<T[F]['type']> }>
export type Instance<T extends Schema> = Category<{ [F in keyof T]: Column.Column<T[F]['type']> }>
}
export interface Field<T> {
readonly isDefined: boolean,
value(row: number): T,
presence(row: number): Data.ValuePresence,
areValuesEqual(rowA: number, rowB: number): boolean,
stringEquals(row: number, value: string | null): boolean,
/** Converts the selected row range to an array. ctor might or might not be called depending on the source data format. */
toArray(ctor?: (size: number) => Column.ArrayType, startRow?: number, endRowExclusive?: number): ReadonlyArray<T> | undefined
}
// export interface Field<T> {
// readonly isDefined: boolean,
// value(row: number): T,
// presence(row: number): Data.ValuePresence,
// areValuesEqual(rowA: number, rowB: number): boolean,
// stringEquals(row: number, value: string | null): boolean,
// /** Converts the selected row range to an array. ctor might or might not be called depending on the source data format. */
// toArray(params?: Column.ToArrayParams): ReadonlyArray<T>
// }
export namespace Field {
export interface Schema<T> { type: T, ctor: (field: Data.Field) => Field<T>, undefinedField: (c: number) => Data.Field, alias?: string };
export interface Schema<T> { type: T, ctor: (field: Data.Field) => Column.Column<T>, undefinedField: (c: number) => Data.Field, alias?: string };
export interface Spec { undefinedField?: (c: number) => Data.Field, alias?: string }
export function str(spec?: Spec) { return createSchema(spec, Str); }
export function int(spec?: Spec) { return createSchema(spec, Int); }
export function float(spec?: Spec) { return createSchema(spec, Float); }
function create<T>(field: Data.Field, value: (row: number) => T, toArray: Field<T>['toArray']): Field<T> {
return { isDefined: field.isDefined, value, presence: field.presence, areValuesEqual: field.areValuesEqual, stringEquals: field.stringEquals, toArray };
function create<T>(field: Data.Field, value: (row: number) => T, toArray: Column.Column<T>['toArray']): Column.Column<T> {
const presence = field.presence;
return {
isDefined: field.isDefined,
rowCount: field.rowCount,
value,
isValueDefined: row => presence(row) === Data.ValuePresence.Present,
areValuesEqual: field.areValuesEqual,
toArray
};
}
function Str(field: Data.Field) { return create(field, field.str, field.toStringArray); }
function Int(field: Data.Field) { return create(field, field.int, field.toIntArray); }
function Float(field: Data.Field) { return create(field, field.float, field.toFloatArray); }
function defaultUndefined(rowCount: number): Data.Field {
return {
isDefined: false,
rowCount,
str: row => '',
int: row => 0,
float: row => 0,
presence: row => Data.ValuePresence.NotSpecified,
areValuesEqual: (rowA, rowB) => true,
stringEquals: (row, value) => value === null,
toStringArray: (ctor, s, e) => Column.createArray(rowCount, ctor, s, e).array,
toIntArray: (ctor, s, e) => Column.createArray(rowCount, ctor, s, e).array,
toFloatArray: (ctor, s, e) => Column.createArray(rowCount, ctor, s, e).array
};
}
function createSchema<T>(spec: Spec | undefined, ctor: (field: Data.Field) => Field<T>): Schema<T> {
return { type: 0 as any, ctor, undefinedField: (spec && spec.undefinedField) || defaultUndefined, alias: spec && spec.alias };
function createSchema<T>(spec: Spec | undefined, ctor: (field: Data.Field) => Column.Column<T>): Schema<T> {
return { type: 0 as any, ctor, undefinedField: (spec && spec.undefinedField) || Data.DefaultUndefinedField, alias: spec && spec.alias };
}
}
......
......@@ -42,7 +42,7 @@ export default function CifTextField(data: string, tokens: ArrayLike<number>, ro
int,
float,
presence,
areValuesEqual: (rowA, rowB) => {
areValuesEqual(rowA, rowB) {
const aS = tokens[2 * rowA], bS = tokens[2 * rowB];
const len = tokens[2 * rowA + 1] - aS;
if (len !== tokens[2 * rowB + 1] - bS) return false;
......@@ -53,7 +53,7 @@ export default function CifTextField(data: string, tokens: ArrayLike<number>, ro
}
return true;
},
stringEquals: (row, value) => {
stringEquals(row, value) {
const s = tokens[2 * row];
if (!value) return presence(row) !== Data.ValuePresence.Present;
const len = value.length;
......@@ -63,16 +63,16 @@ export default function CifTextField(data: string, tokens: ArrayLike<number>, ro
}
return true;
},
toStringArray: (ctor, s, e) => {
const { array, start } = Column.createArray(rowCount, ctor, s, e);
toStringArray(params) {
const { array, start } = Column.createArray(rowCount, params);
return fillArrayValues(str, array, start);
},
toIntArray: (ctor, s, e) => {
const { array, start } = Column.createArray(rowCount, ctor, s, e);
toIntArray(params) {
const { array, start } = Column.createArray(rowCount, params);
return fillArrayValues(int, array, start);
},
toFloatArray: (ctor, s, e) => {
const { array, start } = Column.createArray(rowCount, ctor, s, e);
toFloatArray(params) {
const { array, start } = Column.createArray(rowCount, params);
return fillArrayValues(float, array, start);
}
}
......
......@@ -4,7 +4,6 @@
* @author David Sehnal <david.sehnal@gmail.com>
*/
export type ArrayType = string[] | number[] | Float32Array | Float64Array | Int8Array | Int16Array | Int32Array | Uint8Array | Uint16Array | Uint32Array
export type ColumnType = typeof ColumnType.str | typeof ColumnType.pooledStr | typeof ColumnType.int | typeof ColumnType.float
export namespace ColumnType {
......@@ -14,11 +13,21 @@ export namespace ColumnType {
export const float = { '@type': 0 as number, kind: 'float' as 'float' };
}
/**
 * Options accepted by `Column.toArray` and the `createArray` helper.
 * All members are optional.
 */
export interface ToArrayParams {
/** Constructor invoked with `new` and the requested length to allocate the result; `createArray` falls back to `Array` when omitted. */
array?: { new(size: number): ArrayLike<number> },
/** First row to include; `createArray` uses 0 when omitted. */
start?: number,
/** Row at which to stop (exclusive); `createArray` uses the row count when omitted. */
end?: number
}
export interface Column<T> {
readonly isDefined: boolean,
readonly rowCount: number,
value(row: number): T,
toArray(ctor?: (size: number) => ArrayType, startRow?: number, endRowExclusive?: number): ReadonlyArray<T>
isValueDefined(row: number): boolean,
toArray(params?: ToArrayParams): ReadonlyArray<T>,
areValuesEqual(rowA: number, rowB: number): boolean
}
export function UndefinedColumn<T extends ColumnType>(rowCount: number, type: T): Column<T['@type']> {
......@@ -27,18 +36,21 @@ export function UndefinedColumn<T extends ColumnType>(rowCount: number, type: T)
isDefined: false,
rowCount,
value,
toArray(ctor, s, e) {
const { array } = createArray(rowCount, ctor, s, e);
isValueDefined(row) { return false; },
toArray(params) {
const { array } = createArray(rowCount, params);
for (let i = 0, _i = array.length; i < _i; i++) array[i] = value(0)
return array;
}
},
areValuesEqual() { return true; }
}
}
/** A helper function for Column.toArray */
export function createArray(rowCount: number, ctor?: (size: number) => ArrayType, start?: number, end?: number) {
const c = typeof ctor !== 'undefined' ? ctor : (s: number) => new Array(s);
export function createArray(rowCount: number, params?: ToArrayParams) {
const { array, start, end } = params || ({} as ToArrayParams);
const c = typeof array !== 'undefined' ? array : Array;
const s = typeof start !== 'undefined' ? Math.max(Math.min(start, rowCount - 1), 0) : 0;
const e = typeof end !== 'undefined' ? Math.min(end, rowCount) : rowCount;
return { array: c(e - s) as any[], start: s, end: e };
return { array: new c(e - s) as any[], start: s, end: e };
}
\ No newline at end of file
......@@ -5,19 +5,13 @@
*/
import { Column, ColumnType, createArray } from '../../column'
import { trimStr } from '../tokenizer'
import { trimStr, Lines } from '../tokenizer'
import { parseIntSkipLeadingWhitespace, parseFloatSkipLeadingWhitespace } from '../number-parser'
import StringPool from '../../../../utils/short-string-pool'
export interface FixedColumnInfo {
data: string,
lines: ArrayLike<number>,
rowCount: number
}
export default function FixedColumnProvider(info: FixedColumnInfo) {
export default function FixedColumnProvider(lines: Lines) {
return function<T extends ColumnType>(offset: number, width: number, type: T) {
return FixedColumn(info, offset, width, type);
return FixedColumn(lines, offset, width, type);
}
}
......@@ -26,39 +20,43 @@ function fillArrayValues(value: (row: number) => any, target: any[], start: numb
return target;
}
export function FixedColumn<T extends ColumnType>(info: FixedColumnInfo, offset: number, width: number, type: T): Column<T['@type']> {
const { data, lines, rowCount } = info;
export function FixedColumn<T extends ColumnType>(lines: Lines, offset: number, width: number, type: T): Column<T['@type']> {
const { data, tokens, count: rowCount } = lines;
const { kind } = type;
const pool = kind === 'pooled-str' ? StringPool.create() : void 0;
const value: Column<T['@type']>['value'] = kind === 'str' ? row => {
let s = lines[2 * row] + offset, le = lines[2 * row + 1];
let s = tokens[2 * row] + offset, le = tokens[2 * row + 1];
if (s >= le) return '';
let e = s + width;
if (e > le) e = le;
return trimStr(data, s, e);
} : kind === 'pooled-str' ? row => {
let s = lines[2 * row] + offset, le = lines[2 * row + 1];
let s = tokens[2 * row] + offset, le = tokens[2 * row + 1];
if (s >= le) return '';
let e = s + width;
if (e > le) e = le;
return StringPool.get(pool!, trimStr(data, s, e));
} : kind === 'int' ? row => {
const s = lines[2 * row] + offset;
if (s > lines[2 * row + 1]) return 0;
const s = tokens[2 * row] + offset;
if (s > tokens[2 * row + 1]) return 0;
return parseIntSkipLeadingWhitespace(data, s, s + width);
} : row => {
const s = lines[2 * row] + offset;
if (s > lines[2 * row + 1]) return 0;
const s = tokens[2 * row] + offset;
if (s > tokens[2 * row + 1]) return 0;
return parseFloatSkipLeadingWhitespace(data, s, s + width);
};
return {
isDefined: true,
rowCount,
value,
toArray(ctor, s, e) {
const { array, start } = createArray(rowCount, ctor, s, e);
isValueDefined(row) { return true; },
toArray(params) {
const { array, start } = createArray(rowCount, params);
return fillArrayValues(value, array, start);
},
areValuesEqual(rowA, rowB) {
return value(rowA) === value(rowB);
}
};
}
\ No newline at end of file
......@@ -6,7 +6,7 @@
* @author Alexander Rose <alexander.rose@weirdbyte.de>
*/
export interface State<TokenType = any> {
export interface Tokenizer {
data: string
position: number
......@@ -15,30 +15,35 @@ export interface State<TokenType = any> {
currentLineNumber: number
currentTokenStart: number
currentTokenEnd: number
}
currentTokenType: TokenType
/**
 * A run of text lines described by offsets into the source string,
 * as produced by `readLines`.
 */
export interface Lines {
/** The source text that the token offsets refer to. */
data: string,
/** Number of lines described. */
count: number,
/** Flat offset pairs: entry `2 * row` is the start of line `row` and entry `2 * row + 1` is its end. */
tokens: ArrayLike<number>
}
export function State<TokenType>(data: string, initialTokenType?: TokenType): State<TokenType> {
export function Tokenizer(data: string): Tokenizer {
return {
data,
position: 0,
length: data.length,
currentLineNumber: 1,
currentTokenStart: 0,
currentTokenEnd: 0,
currentTokenType: initialTokenType!
currentTokenEnd: 0
};
}
export function getTokenString(state: State) {
export namespace Tokenizer {
export function getTokenString(state: Tokenizer) {
return state.data.substring(state.currentTokenStart, state.currentTokenEnd);
}
/**
* Eat everything until a newline occurs.
*/
export function eatLine(state: State) {
export function eatLine(state: Tokenizer) {
const { data } = state;
while (state.position < state.length) {
switch (data.charCodeAt(state.position)) {
......@@ -64,20 +69,32 @@ export function eatLine(state: State) {
}
/** Sets the current token start to the current position */
export function markStart(state: State) {
export function markStart(state: Tokenizer) {
state.currentTokenStart = state.position;
}
/** Sets the current token start to current position and moves to the next line. */
export function markLine(state: State) {
export function markLine(state: Tokenizer) {
state.currentTokenStart = state.position;
eatLine(state);
}
/**
 * Consumes the next `count` lines from the tokenizer and records where each
 * one starts and ends.
 *
 * @param state Tokenizer to advance; it is mutated by each `markLine` call.
 * @param count Number of lines to read.
 * @returns The tokenizer's source string, the requested line count, and the
 *          collected start/end offsets (two entries per line).
 */
export function readLines(state: Tokenizer, count: number): Lines {
    const builder = Tokens.create(count * 2);
    let line = 0;
    while (line < count) {
        // markLine sets the token start to the current position and moves past the line.
        markLine(state);
        const { currentTokenStart, currentTokenEnd } = state;
        Tokens.addUnchecked(builder, currentTokenStart, currentTokenEnd);
        line++;
    }
    const { data } = state;
    return { data, count, tokens: builder.indices };
}
/**
* Eat everything until a whitespace/newline occurs.
*/
export function eatValue(state: State) {
export function eatValue(state: Tokenizer) {
while (state.position < state.length) {
switch (state.data.charCodeAt(state.position)) {
case 9: // \t
......@@ -98,7 +115,7 @@ export function eatValue(state: State) {
* Skips all the whitespace - space, tab, newline, CR
* Handles incrementing line count.
*/
export function skipWhitespace(state: State): number {
export function skipWhitespace(state: Tokenizer): number {
let prev = 10;
while (state.position < state.length) {
let c = state.data.charCodeAt(state.position);
......@@ -129,7 +146,7 @@ export function skipWhitespace(state: State): number {
}
/** Trims spaces and tabs */
export function trim(state: State, start: number, end: number) {
export function trim(state: Tokenizer, start: number, end: number) {
const { data } = state;
let s = start, e = end - 1;
......@@ -142,6 +159,7 @@ export function trim(state: State, start: number, end: number) {
state.currentTokenEnd = e + 1;
state.position = end;
}
}
export function trimStr(data: string, start: number, end: number) {
let s = start, e = end - 1;
......@@ -189,8 +207,4 @@ export namespace Tokens {
}
}
/**
* A helper for building a typed array of token indices.
*/
export default Tokens
\ No newline at end of file
export default Tokenizer
\ No newline at end of file
......@@ -5,14 +5,14 @@
* @author David Sehnal <david.sehnal@gmail.com>
*/
import { State as TokenizerState, Tokens, markLine, getTokenString } from '../common/text/tokenizer'
import Tokenizer from '../common/text/tokenizer'
import FixedColumn from '../common/text/column/fixed'
import { ColumnType, UndefinedColumn } from '../common/column'
import * as Schema from './schema'
import Result from '../result'
interface State {
tokenizer: TokenizerState,
tokenizer: Tokenizer,
header: Schema.Header,
numberOfAtoms: number,
}
......@@ -27,7 +27,7 @@ function createEmptyHeader(): Schema.Header {
};
}
function createState(tokenizer: TokenizerState): State {
function State(tokenizer: Tokenizer): State {
return {
tokenizer,
header: createEmptyHeader(),
......@@ -40,14 +40,14 @@ function createState(tokenizer: TokenizerState): State {
*/
function handleTitleString(state: State) {
const { tokenizer, header } = state;
markLine(tokenizer);
Tokenizer.markLine(tokenizer);
let line = getTokenString(tokenizer);
let line = Tokenizer.getTokenString(tokenizer);
// skip potential empty lines...
if (line.trim().length === 0) {
markLine(tokenizer);
line = getTokenString(tokenizer);
Tokenizer.markLine(tokenizer);
line = Tokenizer.getTokenString(tokenizer);
}
const timeOffset = line.lastIndexOf('t=');
......@@ -67,8 +67,8 @@ function handleTitleString(state: State) {
*/
function handleNumberOfAtoms(state: State) {
const { tokenizer } = state;
markLine(tokenizer);
const line = getTokenString(tokenizer);
Tokenizer.markLine(tokenizer);
const line = Tokenizer.getTokenString(tokenizer);
state.numberOfAtoms = parseInt(line);
}
......@@ -90,17 +90,12 @@ function handleNumberOfAtoms(state: State) {
*/
function handleAtoms(state: State): Schema.Atoms {
const { tokenizer, numberOfAtoms } = state;
const lineTokens = Tokens.create(numberOfAtoms * 2);
const lines = Tokenizer.readLines(tokenizer, numberOfAtoms);
for (let i = 0; i < numberOfAtoms; i++) {
markLine(tokenizer);
Tokens.addUnchecked(lineTokens, tokenizer.currentTokenStart, tokenizer.currentTokenEnd);
}
const lines = lineTokens.indices;
const positionSample = tokenizer.data.substring(lines[0], lines[1]).substring(20);
const positionSample = tokenizer.data.substring(lines.tokens[0], lines.tokens[1]).substring(20);
const precisions = positionSample.match(/\.\d+/g)!;
const hasVelocities = precisions.length === 6;
state.header.hasVelocities = hasVelocities;
state.header.precision.position = precisions[0].length - 1;
state.header.precision.velocity = hasVelocities ? precisions[3].length - 1 : 0;
......@@ -110,7 +105,7 @@ function handleAtoms(state: State): Schema.Atoms {
const vO = pO + 3 * pW;
const vW = state.header.precision.velocity + 4;
const col = FixedColumn({ data: tokenizer.data, lines, rowCount: state.numberOfAtoms });
const col = FixedColumn(lines);
const undef = UndefinedColumn(state.numberOfAtoms, ColumnType.float);
const ret = {
......@@ -138,17 +133,17 @@ function handleAtoms(state: State): Schema.Atoms {
*/
function handleBoxVectors(state: State) {
const { tokenizer } = state;
markLine(tokenizer);
const values = getTokenString(tokenizer).trim().split(/\s+/g);
Tokenizer.markLine(tokenizer);
const values = Tokenizer.getTokenString(tokenizer).trim().split(/\s+/g);
state.header.box = [+values[0], +values[1], +values[2]];
}
function parseInternal(data: string): Result<Schema.File> {
const tokenizer = TokenizerState(data);
const tokenizer = Tokenizer(data);
const structures: Schema.Structure[] = [];
while (tokenizer.position < data.length) {
const state = createState(tokenizer);
const state = State(tokenizer);
handleTitleString(state);
handleNumberOfAtoms(state);
const atoms = handleAtoms(state);
......
......@@ -34,7 +34,7 @@ describe('schema', () => {
});
it('toArray', () => {
const ret = data.atoms.x.toArray(s => new Int32Array(s))!;
const ret = data.atoms.x.toArray({ array: Int32Array });
expect(ret.length).toBe(3);
expect(ret[0]).toBe(1);
expect(ret[1]).toBe(2);
......
......@@ -30,7 +30,7 @@ const linesTokens = (function () {
}());
describe('fixed text column', () => {
const col = FixedColumn({ data, lines: linesTokens, rowCount: lines.length });
const col = FixedColumn({ data, tokens: linesTokens, count: lines.length });
const col1 = col(0, 5, ColumnType.float);
const col2 = col(5, 4, ColumnType.str);
it('number', () => {
......
......@@ -51,22 +51,22 @@ export function _gro() {
console.log('rowCount', n)
console.time('getFloatArray x')
const x = atoms.x.toArray(x => new Float32Array(x))!
const x = atoms.x.toArray({ array: Float32Array })
console.timeEnd('getFloatArray x')
console.log(x.length, x[0], x[x.length - 1])
console.time('getFloatArray y')
const y = atoms.y.toArray(x => new Float32Array(x))!
const y = atoms.y.toArray({ array: Float32Array })
console.timeEnd('getFloatArray y')
console.log(y.length, y[0], y[y.length - 1])
console.time('getFloatArray z')
const z = atoms.z.toArray(x => new Float32Array(x))!
const z = atoms.z.toArray({ array: Float32Array })
console.timeEnd('getFloatArray z')
console.log(z.length, z[0], z[z.length - 1])
console.time('getIntArray residueNumber')
const residueNumber = atoms.residueNumber.toArray(x => new Int32Array(x))!
const residueNumber = atoms.residueNumber.toArray({ array: Int32Array })
console.timeEnd('getIntArray residueNumber')
console.log(residueNumber.length, residueNumber[0], residueNumber[residueNumber.length - 1])
});
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment