// parser.ts

/**
 * Copyright (c) 2017 mol* contributors, licensed under MIT, See LICENSE file for more info.
 *
 * @author Alexander Rose <alexander.rose@weirdbyte.de>
 */

// import { Column } from 'mol-data/db'
import { Tokens, TokenBuilder, Tokenizer } from '../common/text/tokenizer'
import * as Data from './data-model'
import { Field } from './field'
import { ReaderResult as Result } from '../result'
import { Task, RuntimeContext, chunkedSubtask, } from '../../../mol-task'

/** Kind of the token the tokenizer is currently positioned on. */
const enum CsvTokenType {
    /** A field value, quoted or unquoted. */
    Value,
    /** A comment; the remainder of its line is skipped. */
    Comment,
    /** The end of the input has been reached. */
    End
}

/** Mutable state shared by all parsing steps of one CSV input. */
interface State {
    /** The complete input text. */
    data: string;
    /** Cursor over `data`; holds the current position and token bounds. */
    tokenizer: Tokenizer;

    /** Type of the token produced by the most recent move. */
    tokenType: CsvTokenType;
    /** Runtime context used for progress reporting. */
    runtimeCtx: RuntimeContext;
    /** One token collection per column, in column order. */
    tokens: Tokens[];

    /** Number of field values stored so far. */
    fieldCount: number;
    /** Number of records counted so far. */
    recordCount: number;

    /** Number of columns, fixed once the first record has been read. */
    columnCount: number;
    /** Column names, in column order. */
    columnNames: string[];

    /** Char code that opens and closes a quoted value. */
    quoteCharCode: number;
    /** Char code that starts a comment. */
    commentCharCode: number;
    /** Char code that separates fields. */
    delimiterCharCode: number;

    /** True when the input has no header record with column names. */
    noColumnNamesRecord: boolean;
}

/**
 * Builds the initial parser state for `data`.
 *
 * Counters start at zero, no columns are known yet and the token type starts
 * out as `End`. Only the first character of each of the `quote`, `comment`
 * and `delimiter` options is used.
 */
function State(data: string, runtimeCtx: RuntimeContext, opts: CsvOptions): State {
    const { quote, comment, delimiter, noColumnNames } = opts

    const initial: State = {
        data,
        tokenizer: Tokenizer(data),

        tokenType: CsvTokenType.End,
        runtimeCtx,
        tokens: [],

        fieldCount: 0,
        recordCount: 0,

        columnCount: 0,
        columnNames: [],

        quoteCharCode: quote.charCodeAt(0),
        commentCharCode: comment.charCodeAt(0),
        delimiterCharCode: delimiter.charCodeAt(0),
        noColumnNamesRecord: noColumnNames
    }
    return initial
}

/**
 * Eat everything until a delimiter or newline occurs.
 *
 * The terminating delimiter or newline character is consumed as well.
 * Whitespace (tab, space) at the end of the value is not part of the token,
 * i.e. trim right; whitespace inside the value is kept.
 *
 * Returns true when a newline occurs after the value, and undefined when the
 * value ends at a delimiter or at the end of the input.
 */
function eatValue(state: Tokenizer, delimiterCharCode: number) {
    while (state.position < state.length) {
        const c = state.data.charCodeAt(state.position);
        ++state.position
        switch (c) {
            case 10:  // \n
            case 13:  // \r
                return true;
            case delimiterCharCode:
                return;
            case 9:  // \t
            case 32:  // ' '
                // possibly trailing whitespace: do not extend the token yet
                break;
            default:
                // The token ends right after the last non-whitespace char.
                // Using the position (rather than counting chars) keeps any
                // whitespace that sits between two non-whitespace chars.
                state.tokenEnd = state.position;
                break;
        }
    }
}

/**
 * Eats a quoted value. Can contain a newline.
 * Returns true when a newline occurs after the quoted value.
 *
 * Embedded quotes are represented by a pair of quote characters, e.g.
 * `"a""b"` is the value `a"b`. The token only marks a span of the raw input,
 * so the pair is still doubled inside the span (`a""b`); nothing is unescaped
 * here.
 *
 * On a closing quote the token is set to the text between the quotes and any
 * whitespace plus the following delimiter are skipped via `skipEmpty`.
 * When the input ends before a closing quote is found, the token runs to the
 * end of the input (still including the opening quote) and undefined is
 * returned.
 */
function eatQuoted(state: Tokenizer, quoteCharCode: number, delimiterCharCode: number) {
    // step over the opening quote
    ++state.position;
    while (state.position < state.length) {
        const c = state.data.charCodeAt(state.position);
        if (c === quoteCharCode) {
            const next = state.data.charCodeAt(state.position + 1);
            if (next !== quoteCharCode) {
                // closing quote: get rid of the quotes.
                state.tokenStart++;
                state.tokenEnd = state.position;
                ++state.position;
                return skipEmpty(state, delimiterCharCode)
            }
            // Escaped quote: step over the first quote of the pair here and
            // over the second one below, so that the second quote is not
            // mistaken for the closing quote on the next iteration.
            ++state.position;
        }
        ++state.position;
    }
    state.tokenEnd = state.position;
}

/**
 * Skips the empty chars that follow a quoted value: whitespace (tab, space)
 * and at most one delimiter.
 *
 * Only a single delimiter is consumed so that an empty field right after a
 * quoted value (e.g. the middle field of `"a",,b`) is not swallowed; the
 * position is left on the second delimiter in that case.
 *
 * Returns true when the current char is a newline (which is not consumed),
 * false when it is any other char, and undefined when the end of the input
 * is reached.
 */
function skipEmpty(state: Tokenizer, delimiterCharCode: number) {
    let delimiterSeen = false;
    while (state.position < state.length) {
        const c = state.data.charCodeAt(state.position);
        if (c === delimiterCharCode) {
            // a second delimiter starts the next (empty) field
            if (delimiterSeen) return false;
            delimiterSeen = true;
        } else if (c !== 9 && c !== 32) {  // neither \t nor ' '
            return c === 10 || c === 13;  // \n or \r
        }
        ++state.position
    }
}

/**
 * Advances past tabs, spaces and line breaks, stopping on the first other
 * char or at the end of the input.
 *
 * The line number is incremented for every `\r` and for every `\n` that does
 * not directly follow a `\r`, so `\r\n` counts as a single line break.
 */
function skipWhitespace(state: Tokenizer) {
    let last = -1;
    for (; state.position < state.length; ++state.position) {
        const code = state.data.charCodeAt(state.position);
        if (code === 10) {  // \n
            // the \n of a \r\n pair was already counted at the \r
            if (last !== 13) ++state.lineNumber;
        } else if (code === 13) {  // \r
            ++state.lineNumber;
        } else if (code !== 9 && code !== 32) {  // neither \t nor ' '
            return;
        }
        last = code;
    }
}

/**
 * Advances to the next `\n` or `\r`, or to the end of the input when there
 * is none. The line break itself is not consumed.
 */
function skipLine(state: Tokenizer) {
    for (; state.position < state.length; ++state.position) {
        const code = state.data.charCodeAt(state.position);
        if (code === 10 || code === 13) break;  // \n or \r
    }
}

/**
 * Move to the next token.
 *
 * Leading whitespace and line breaks are skipped first. The token type is
 * then chosen from the first char: the comment char starts a comment (the
 * rest of the line is skipped), the quote char starts a quoted value and
 * anything else starts a plain value.
 *
 * Returns true when the value just read completes a record, i.e. when it is
 * followed by a newline or when reading it consumed the rest of the input.
 * The latter makes sure the last record of an input without a trailing
 * newline is reported as well. Returns false for comments and at the end of
 * the input.
 */
function moveNextInternal(state: State) {
    const tokenizer = state.tokenizer
    skipWhitespace(tokenizer);

    if (tokenizer.position >= tokenizer.length) {
        state.tokenType = CsvTokenType.End;
        return false;
    }

    tokenizer.tokenStart = tokenizer.position;
    tokenizer.tokenEnd = tokenizer.position;

    let newline: boolean | undefined
    const c = state.data.charCodeAt(tokenizer.position);
    switch (c) {
        case state.commentCharCode:
            state.tokenType = CsvTokenType.Comment;
            skipLine(tokenizer);
            // a comment never completes a record
            return false;
        case state.quoteCharCode:
            state.tokenType = CsvTokenType.Value;
            newline = eatQuoted(tokenizer, state.quoteCharCode, state.delimiterCharCode);
            break;
        default:
            state.tokenType = CsvTokenType.Value;
            newline = eatValue(tokenizer, state.delimiterCharCode);
            break;
    }

    // The end of the input terminates the current record just like a newline.
    return newline === true || tokenizer.position >= tokenizer.length;
}

/**
 * Moves to the next token that is not a comment.
 *
 * Comment tokens are stepped over one after another. The value returned is
 * the result of the last internal move, which indicates whether the token
 * now current completes a record.
 */
function moveNext(state: State) {
    let recordDone: boolean | undefined
    do {
        recordDone = moveNextInternal(state);
    } while (state.tokenType === CsvTokenType.Comment)
    return recordDone
}

/**
 * Stores value tokens until `chunkSize` records have been completed or no
 * value token is left.
 *
 * Each token is appended to the token collection of its column, the column
 * being the running field count modulo the column count. After storing a
 * token the parser moves to the next one; when that move reports a complete
 * record, the record counters are incremented.
 *
 * Returns the number of records completed by this call, 0 when the input was
 * already exhausted.
 */
function readRecordsChunk(chunkSize: number, state: State) {
    if (state.tokenType === CsvTokenType.End) return 0

    const { tokens, tokenizer } = state;
    let completed = 0;

    while (completed < chunkSize && state.tokenType === CsvTokenType.Value) {
        const column = tokens[state.fieldCount % state.columnCount];
        TokenBuilder.add(column, tokenizer.tokenStart, tokenizer.tokenEnd);
        state.fieldCount += 1;

        if (moveNext(state)) {
            state.recordCount += 1;
            completed += 1;
        }
    }
    return completed;
}

/**
 * Reads all data records in chunks of 100000 records, reporting the tokenizer
 * position as progress.
 *
 * Moves to the first data token before the chunked read starts; a record is
 * counted right away when that first move already reports a complete record.
 */
function readRecordsChunks(state: State) {
    if (moveNext(state)) {
        state.recordCount += 1
    }
    return chunkedSubtask(
        state.runtimeCtx,
        100000,
        state,
        readRecordsChunk,
        (ctx, current) => ctx.update({ message: 'Parsing...', current: current.tokenizer.position, max: current.data.length })
    );
}

/**
 * Registers a new column: the current token becomes its name and a fresh
 * token collection is created for its values.
 */
function addColumn(state: State) {
    const { tokenizer, columnNames, tokens } = state
    const name = Tokenizer.getTokenString(tokenizer)
    columnNames.push(name)
    // second argument is derived from the input length; presumably an initial
    // size hint for the builder - confirm against TokenBuilder.create
    const columnTokens = TokenBuilder.create(tokenizer.data, state.data.length / 80)
    tokens.push(columnTokens)
}

/**
 * Reads the first record to determine the columns.
 *
 * Every value of the first record adds one column named after that value.
 * Reading stops when a value completes the record or when the input ends,
 * so an empty input or a single line without a trailing newline terminates
 * as well (an empty input yields zero columns).
 *
 * When the input has no column names record, the columns are renamed to
 * their index ('0', '1', ...) and the tokenizer is reset so that the first
 * record is read again as data.
 */
function init(state: State) {
    let newRecord = moveNext(state)
    // Stop at the end of the input too: `moveNext` keeps returning false
    // there, which would otherwise add columns forever.
    while (!newRecord && state.tokenType !== CsvTokenType.End) {
        addColumn(state)
        newRecord = moveNext(state);
    }
    // The value that completed the record is still pending; at the end of
    // the input there is no pending value to add.
    if (state.tokenType !== CsvTokenType.End) {
        addColumn(state)
    }
    state.columnCount = state.columnNames.length
    if (state.noColumnNamesRecord) {
        state.columnNames.forEach((x, i, arr) => arr[i] = i+'')
        Tokenizer.reset(state.tokenizer)
    }
}

/**
 * Parses the whole input and assembles the resulting table.
 *
 * Determines the columns, reads all records and then wraps the tokens of
 * every column in a `Field`, keyed by column name in a prototype-less object.
 */
async function handleRecords(state: State): Promise<Data.CsvTable> {
    init(state)
    await readRecordsChunks(state)

    // read the results only now, after parsing has filled them in
    const { columnNames, columnCount, tokens, recordCount } = state

    const columns: Data.CsvColumns = Object.create(null);
    let index = 0
    while (index < columnCount) {
        columns[columnNames[index]] = Field(tokens[index]);
        ++index
    }

    return Data.CsvTable(recordCount, columnNames, columns)
}

/**
 * Parses `data` with the given options and wraps the resulting table in a
 * successful reader result. An initial progress update is sent before
 * parsing starts.
 */
async function parseInternal(data: string, ctx: RuntimeContext, opts: CsvOptions): Promise<Result<Data.CsvFile>> {
    const state = State(data, ctx, opts);
    ctx.update({ message: 'Parsing...', current: 0, max: data.length });

    const file = Data.CsvFile(await handleRecords(state))
    return Result.success(file);
}

/**
 * Options controlling how the CSV text is tokenized.
 * Only the first character of `delimiter`, `quote` and `comment` is used.
 */
interface CsvOptions {
    /** Separates the fields of a record. */
    delimiter: string;
    /** Opens and closes a quoted value. */
    quote: string;
    /** Starts a comment that runs to the end of the line. */
    comment: string;
    /** True when the first record holds data instead of column names; columns are then named by index. */
    noColumnNames: boolean;
}

/**
 * Creates a task that parses CSV text.
 *
 * @param data the CSV text to parse
 * @param opts overrides for the default options, which are `"` as quote,
 * `#` as comment, `,` as delimiter and a first record holding column names
 * @returns a task named 'Parse CSV' that resolves to the reader result
 */
export function parseCsv(data: string, opts?: Partial<CsvOptions>) {
    const defaults = { quote: '"', comment: '#', delimiter: ',', noColumnNames: false }
    const completeOpts = Object.assign({}, defaults, opts)

    return Task.create<Result<Data.CsvFile>>('Parse CSV', async ctx => {
        const result = await parseInternal(data, ctx, completeOpts);
        return result;
    });
}