diff --git a/src/mol-data/db/column.ts b/src/mol-data/db/column.ts index adcd28cc2b69648eb4481a5478534052bf6b27b7..fa0e0920e905952132821dfca929964bc6869b75 100644 --- a/src/mol-data/db/column.ts +++ b/src/mol-data/db/column.ts @@ -139,7 +139,7 @@ namespace Column { return columnIndicesOf(c, test); } - /** Makes the column backned by an array. Useful for columns that accessed often. */ + /** Makes the column backed by an array. Useful for columns that are accessed often. */ export function asArrayColumn<T>(c: Column<T>, array?: ArrayCtor<T>): Column<T> { if (c['@array']) return c; if (!c.isDefined) return Undefined(c.rowCount, c.schema) as any as Column<T>; diff --git a/src/mol-io/reader/_spec/csv.spec.ts b/src/mol-io/reader/_spec/csv.spec.ts new file mode 100644 index 0000000000000000000000000000000000000000..cd14e30e425101170e3a4d3c48a5d530fd6c9e5c --- /dev/null +++ b/src/mol-io/reader/_spec/csv.spec.ts @@ -0,0 +1,80 @@ +/** + * Copyright (c) 2017 mol* contributors, licensed under MIT, See LICENSE file for more info. + * + * @author Alexander Rose <alexander.rose@weirdbyte.de> + */ + +import Csv from '../csv/parser' + +const csvStringBasic = `StrCol,IntCol,FloatCol +# comment +string1,-1,-0.34e3 +string2,42,2.44` + +const csvStringAdvanced = `StrCol,"Int Col",FloatCol + string1 \t , -1, -0.34e3 + # comment + " stri +ng2" ,42, 2.44 ` + +const tabString = `StrCol\tIntCol\tFloatCol +string1\t-1\t-0.34e3 +string2\t42\t2.44` + +describe('csv reader', () => { + it('basic', async () => { + const parsed = await Csv(csvStringBasic)(); + if (parsed.isError) return; + const csvFile = parsed.result; + + // csvFile.table.columnNames.forEach(name => { + // const col = csvFile.table.getColumn(name) + // if (col) console.log(name, col.toStringArray()) + // }) + + const strCol = csvFile.table.getColumn('StrCol') + if (strCol) expect(strCol.toStringArray()).toEqual(['string1', 'string2']) + + const intCol = csvFile.table.getColumn('IntCol') + if (intCol) expect(intCol.toIntArray()).toEqual([-1, 42]) + + const floatCol = csvFile.table.getColumn('FloatCol') + if (floatCol) expect(floatCol.toFloatArray()).toEqual([-340.0, 2.44]) + + expect.assertions(3) + }); + + it('advanced', async () => { + const parsed = await Csv(csvStringAdvanced)(); + if (parsed.isError) return; + const csvFile = parsed.result; + + const strCol = csvFile.table.getColumn('StrCol') + if (strCol) expect(strCol.toStringArray()).toEqual(['string1', ' stri\nng2']) + + const intCol = csvFile.table.getColumn('Int Col') + if (intCol) expect(intCol.toIntArray()).toEqual([-1, 42]) + + const floatCol = csvFile.table.getColumn('FloatCol') + if (floatCol) expect(floatCol.toFloatArray()).toEqual([-340.0, 2.44]) + + expect.assertions(3) + }); + + it('tabs', async () => { + const parsed = await Csv(tabString, { delimiter: '\t' })(); + if (parsed.isError) return; + const csvFile = parsed.result; + + const strCol = csvFile.table.getColumn('StrCol') + if (strCol) expect(strCol.toStringArray()).toEqual(['string1', 'string2']) + + const intCol = csvFile.table.getColumn('IntCol') + if (intCol) expect(intCol.toIntArray()).toEqual([-1, 42]) + + const floatCol = csvFile.table.getColumn('FloatCol') + if (floatCol) expect(floatCol.toFloatArray()).toEqual([-340.0, 2.44]) + + expect.assertions(3) + }); +}); \ No newline at end of file diff --git a/src/mol-io/reader/common/text/tokenizer.ts b/src/mol-io/reader/common/text/tokenizer.ts index 55b310e5d825008fa16a7b91dd443e08d3e8d768..15a8492c74ffd939be708e62e4858a8129291106 100644 --- a/src/mol-io/reader/common/text/tokenizer.ts +++ b/src/mol-io/reader/common/text/tokenizer.ts @@ -41,6 +41,14 @@ export namespace Tokenizer { return state.data.substring(state.tokenStart, state.tokenEnd); } + /** Resets the state */ + export function reset (state: Tokenizer) { + state.position = 0 + state.lineNumber = 1 + state.tokenStart = 0 + state.tokenEnd = 0 + } + /** * Eat everything until a newline occurs. */ @@ -227,6 +235,7 @@ export namespace TokenBuilder { } export function create(tokenizer: Tokenizer, size: number): Tokens { + size = Math.max(10, size) return <Builder>{ data: tokenizer.data, indicesLenMinus2: (size - 2) | 0, diff --git a/src/mol-io/reader/csv/data-model.ts b/src/mol-io/reader/csv/data-model.ts new file mode 100644 index 0000000000000000000000000000000000000000..86f50f077574d8fd4e7fa5b511bdb6773821ccae --- /dev/null +++ b/src/mol-io/reader/csv/data-model.ts @@ -0,0 +1,36 @@ +/** + * Copyright (c) 2017 mol* contributors, licensed under MIT, See LICENSE file for more info. + * + * @author Alexander Rose <alexander.rose@weirdbyte.de> + */ + +import { Field as Column } from '../cif/data-model' + +export { Column } + +export interface File { + readonly name?: string, + readonly table: Table +} + +export function File(table: Table, name?: string): File { + return { name, table }; +} + +export interface Table { + readonly rowCount: number, + readonly columnNames: ReadonlyArray<string>, + getColumn(name: string): Column | undefined +} + +export function Table(rowCount: number, columnNames: string[], columns: Columns): Table { + return { rowCount, columnNames: [...columnNames], getColumn(name) { return columns[name]; } }; +} + +export type Columns = { [name: string]: Column } + +// export namespace Table { +// export function empty(name: string): Table { +// return { rowCount: 0, name, fieldNames: [], getColumn(name: string) { return void 0; } }; +// }; +// } \ No newline at end of file diff --git a/src/mol-io/reader/csv/field.ts b/src/mol-io/reader/csv/field.ts new file mode 100644 index 0000000000000000000000000000000000000000..fdc4c5135d4037d72dbd06385accae9bd805bbfa --- /dev/null +++ b/src/mol-io/reader/csv/field.ts @@ -0,0 +1,9 @@ +/** + * Copyright (c) 2017 mol* contributors, licensed under MIT, See LICENSE file for more info. + * + * @author Alexander Rose <alexander.rose@weirdbyte.de> + */ + +import Field from '../cif/text/field' + +export default Field \ No newline at end of file diff --git a/src/mol-io/reader/csv/parser.ts b/src/mol-io/reader/csv/parser.ts new file mode 100644 index 0000000000000000000000000000000000000000..3d80e906e9c57eb219614cc171a243a0949a746a --- /dev/null +++ b/src/mol-io/reader/csv/parser.ts @@ -0,0 +1,287 @@ +/** + * Copyright (c) 2017 mol* contributors, licensed under MIT, See LICENSE file for more info. + * + * @author Alexander Rose <alexander.rose@weirdbyte.de> + */ + +// import { Column } from 'mol-data/db' +import { Tokens, TokenBuilder, Tokenizer } from '../common/text/tokenizer' +import * as Data from './data-model' +import Field from './field' +import Result from '../result' +import Computation from 'mol-util/computation' + +const enum CsvTokenType { + Value = 0, + Comment = 1, + End = 2 +} + +interface State { + data: string; + tokenizer: Tokenizer, + + tokenType: CsvTokenType; + chunker: Computation.Chunker, + tokens: Tokens[], + + fieldCount: number, + recordCount: number, + + columnCount: number, + columnNames: string[], + + quoteCharCode: number, + commentCharCode: number, + delimiterCharCode: number, + + noColumnNamesRecord: boolean +} + +function State(data: string, ctx: Computation.Context, opts: CsvOptions): State { + + const tokenizer = Tokenizer(data) + return { + data, + tokenizer, + + tokenType: CsvTokenType.End, + chunker: Computation.chunker(ctx, 100000), + tokens: [], + + fieldCount: 0, + recordCount: 0, + + columnCount: 0, + columnNames: [], + + quoteCharCode: opts.quote.charCodeAt(0), + commentCharCode: opts.comment.charCodeAt(0), + delimiterCharCode: opts.delimiter.charCodeAt(0), + noColumnNamesRecord: opts.noColumnNames + }; +} + +/** + * Eat everything until a delimiter or newline occurs. + * Ignores whitespace at the end of the value, i.e. trim right. + * Returns true when a newline occurs after the value. + */ +function eatValue(state: Tokenizer, delimiterCharCode: number) { + while (state.position < state.length) { + const c = state.data.charCodeAt(state.position); + ++state.position + switch (c) { + case 10: // \n + case 13: // \r + return true; + case delimiterCharCode: + return; + case 9: // \t + case 32: // ' ' + break; + default: + ++state.tokenEnd; + break; + } + } +} + +/** + * Eats a quoted value. Can contain a newline. + * Returns true when a newline occurs after the quoted value. + * + * Embedded quotes are represented by a pair of double quotes: + * - ""xx"" => "xx" + */ +function eatQuoted(state: Tokenizer, quoteCharCode: number, delimiterCharCode: number) { + ++state.position; + while (state.position < state.length) { + const c = state.data.charCodeAt(state.position); + if (c === quoteCharCode) { + const next = state.data.charCodeAt(state.position + 1); + if (next !== quoteCharCode) { + // get rid of the quotes. + state.tokenStart++; + state.tokenEnd = state.position; + ++state.position; + return skipEmpty(state, delimiterCharCode) + } + } + ++state.position; + } + state.tokenEnd = state.position; +} + +/** + * Skips empty chars. + * Returns true when the current char is a newline. + */ +function skipEmpty(state: Tokenizer, delimiterCharCode: number) { + while (state.position < state.length) { + const c = state.data.charCodeAt(state.position); + if (c !== 9 && c !== 32 && c !== delimiterCharCode) { // \t or ' ' + return c === 10 || c === 13; // \n or \r + } + ++state.position + } +} + +function skipWhitespace(state: Tokenizer) { + let prev = -1; + while (state.position < state.length) { + const c = state.data.charCodeAt(state.position); + switch (c) { + case 9: // '\t' + case 32: // ' ' + prev = c; + ++state.position; + break; + case 10: // \n + // handle \r\n + if (prev !== 13) { + ++state.lineNumber; + } + prev = c; + ++state.position; + break; + case 13: // \r + prev = c; + ++state.position; + ++state.lineNumber; + break; + default: + return; + } + } +} + +function skipLine(state: Tokenizer) { + while (state.position < state.length) { + const c = state.data.charCodeAt(state.position); + if (c === 10 || c === 13) return // \n or \r + ++state.position + } +} + +/** + * Move to the next token. + * Returns true when the current char is a newline, i.e. indicating a full record. + */ +function moveNextInternal(state: State) { + const tokenizer = state.tokenizer + skipWhitespace(tokenizer); + + if (tokenizer.position >= tokenizer.length) { + state.tokenType = CsvTokenType.End; + return true; + } + + tokenizer.tokenStart = tokenizer.position; + tokenizer.tokenEnd = tokenizer.position; + const c = state.data.charCodeAt(tokenizer.position); + switch (c) { + case state.commentCharCode: + state.tokenType = CsvTokenType.Comment; + skipLine(tokenizer); + break; + case state.quoteCharCode: + state.tokenType = CsvTokenType.Value; + return eatQuoted(tokenizer, state.quoteCharCode, state.delimiterCharCode); + default: + state.tokenType = CsvTokenType.Value; + return eatValue(tokenizer, state.delimiterCharCode); + } +} + +/** + * Moves to the next non-comment token/line. + * Returns true when the current char is a newline, i.e. indicating a full record. + */ +function moveNext(state: State) { + let newRecord = moveNextInternal(state); + while (state.tokenType === CsvTokenType.Comment) { + newRecord = moveNextInternal(state); + } + return newRecord +} + +function readRecordsChunk(state: State, chunkSize: number) { + if (state.tokenType === CsvTokenType.End) return 0 + + let newRecord = moveNext(state); + if (newRecord) ++state.recordCount + + const { tokens, tokenizer } = state; + let counter = 0; + while (state.tokenType === CsvTokenType.Value && counter < chunkSize) { + TokenBuilder.add(tokens[state.fieldCount % state.columnCount], tokenizer.tokenStart, tokenizer.tokenEnd); + ++state.fieldCount + newRecord = moveNext(state); + if (newRecord) ++state.recordCount + ++counter; + } + return counter; +} + +function readRecordsChunks(state: State) { + return state.chunker.process( + chunkSize => readRecordsChunk(state, chunkSize), + update => update({ message: 'Parsing...', current: state.tokenizer.position, max: state.data.length })); +} + +function addColumn (state: State) { + state.columnNames.push(Tokenizer.getTokenString(state.tokenizer)) + state.tokens.push(TokenBuilder.create(state.tokenizer, state.data.length / 80)) +} + +function init(state: State) { + let newRecord = moveNext(state) + while (!newRecord) { + addColumn(state) + newRecord = moveNext(state); + } + addColumn(state) + state.columnCount = state.columnNames.length + if (state.noColumnNamesRecord) { + state.columnNames.forEach((x, i, arr) => arr[i] = i+'') + Tokenizer.reset(state.tokenizer) + } +} + +async function handleRecords(state: State): Promise<Data.Table> { + init(state) + await readRecordsChunks(state) + + const columns: Data.Columns = Object.create(null); + for (let i = 0; i < state.columnCount; ++i) { + columns[state.columnNames[i]] = Field(state.tokens[i], state.recordCount); + } + + return Data.Table(state.recordCount, state.columnNames, columns) +} + +async function parseInternal(data: string, ctx: Computation.Context, opts: CsvOptions): Promise<Result<Data.File>> { + const state = State(data, ctx, opts); + + ctx.update({ message: 'Parsing...', current: 0, max: data.length }); + const table = await handleRecords(state) + const result = Data.File(table) + return Result.success(result); +} + +interface CsvOptions { + quote: string; + comment: string; + delimiter: string; + noColumnNames: boolean; +} + +export function parse(data: string, opts?: Partial<CsvOptions>) { + const completeOpts = Object.assign({}, { quote: '"', comment: '#', delimiter: ',', noColumnNames: false }, opts) + return Computation.create<Result<Data.File>>(async ctx => { + return await parseInternal(data, ctx, completeOpts); + }); +} + +export default parse; \ No newline at end of file diff --git a/src/mol-io/reader/csv/schema.ts b/src/mol-io/reader/csv/schema.ts new file mode 100644 index 0000000000000000000000000000000000000000..da4529753d64a9d41c3cfab71213c7523dae4010 --- /dev/null +++ b/src/mol-io/reader/csv/schema.ts @@ -0,0 +1,7 @@ +/** + * Copyright (c) 2017 mol* contributors, licensed under MIT, See LICENSE file for more info. + * + * @author Alexander Rose <alexander.rose@weirdbyte.de> + */ + +export { toTable } from '../cif/schema' \ No newline at end of file diff --git a/src/script.ts b/src/script.ts index de1047e64d48f55c7e5f77161d7cc068df61bbcc..5eee1e26d2f092072357a59efe72b32a80d831ae 100644 --- a/src/script.ts +++ b/src/script.ts @@ -10,18 +10,15 @@ import * as fs from 'fs' require('util.promisify').shim(); const readFileAsync = util.promisify(fs.readFile); -const writeFileAsync = util.promisify(fs.writeFile); import Gro from 'mol-io/reader/gro/parser' +import Csv from 'mol-io/reader/csv/parser' import CIF from 'mol-io/reader/cif' import Computation from 'mol-util/computation' import { Model } from 'mol-model/structure' -// import { toTypedFrame as applySchema } from './reader/cif/schema' -import { generateSchema } from 'mol-io/reader/cif/schema/utils' - const file = '1crn.gro' // const file = 'water.gro' // const file = 'test.gro' @@ -155,7 +152,7 @@ export async function _cif() { runCIF(input); path = `./examples/1cbs_full.bcif`; - + const input2 = await readFileAsync(path) console.log('------------------'); console.log('BinaryCIF:'); @@ -164,38 +161,7 @@ export async function _cif() { runCIF(input2); } -_cif(); - -async function runDic(input: string | Uint8Array) { - console.time('parseDic'); - const comp = typeof input === 'string' ? CIF.parseText(input) : CIF.parseBinary(input); - - const ctx = Computation.observable({ updateRateMs: 250, observer: p => showProgress('DIC', p) }); - const parsed = await comp(ctx); - console.timeEnd('parseDic'); - if (parsed.isError) { - console.log(parsed); - return; - } - - const schema = generateSchema(parsed.result.blocks[0]) - // console.log(schema) - // console.log(util.inspect(Object.keys(schema).length, {showHidden: false, depth: 1})) - - await writeFileAsync('./src/reader/cif/schema/mmcif-gen.ts', schema, 'utf8') - - return schema -} - -export async function _dic() { - let path = './build/dics/mmcif_pdbx_v50.dic' - const input = await readFileAsync(path, 'utf8') - console.log('------------------'); - console.log('Text DIC:'); - return runDic(input); -} - -_dic(); +// _cif(); const comp = Computation.create(async ctx => { for (let i = 0; i < 0; i++) { @@ -204,9 +170,38 @@ const comp = Computation.create(async ctx => { } return 42; }); -async function testComp() { +export async function testComp() { const ctx = Computation.observable({ observer: p => showProgress('test', p) }); const ret = await comp(ctx); console.log('computation returned', ret); } -testComp(); \ No newline at end of file +// testComp(); + + +const csvString = ` Year,Make,Model,Length +1997,Ford,"E350 + +MOIN",2.34 +2000,Mercury, Cougar,2.38` + +export async function testCsv () { + const parsed = await Csv(csvString)(); + + if (parsed.isError) { + console.log(parsed) + return; + } + + const csvFile = parsed.result; + csvFile.table.columnNames.forEach(name => { + const col = csvFile.table.getColumn(name) + if (col) console.log(name, col.toStringArray()) + }) + + const year = csvFile.table.getColumn('Year') + if (year) console.log('(int)Year', year.toIntArray()) + + const length = csvFile.table.getColumn('Length') + if (length) console.log('(float)Length', length.toFloatArray()) +} +testCsv() \ No newline at end of file