-
David Sehnal authoredDavid Sehnal authored
parser.ts 19.83 KiB
/*
* Copyright (c) 2017 molio contributors, licensed under MIT, See LICENSE file for more info.
*
* @author David Sehnal <david.sehnal@gmail.com>
*/
/**
* mmCIF parser.
*
* Trying to be as close to the specification http://www.iucr.org/resources/cif/spec/version1.1/cifsyntax
*
* Differences I'm aware of:
* - Except keywords (data_, loop_, save_) everything is case sensitive.
* - The tokens . and ? are treated the same as the values '.' and '?'.
* - Ignores \ in the multiline values:
* ;abc\
* efg
* ;
* should have the value 'abcefg' but will have the value 'abc\\nefg' instead.
* Post processing of this is left to the consumer of the data.
* - Similarly, things like punctuation (\', ..) are left to be processed by the user if needed.
*
*/
import * as Data from '../data-model'
import Field from './field'
import { Tokens, TokenBuilder } from '../../common/text/tokenizer'
import Result from '../../result'
import Computation from '../../../utils/computation'
/**
* Types of supported mmCIF tokens.
*/
const enum CifTokenType {
Data = 0,
Save = 1,
Loop = 2,
Value = 3,
ColumnName = 4,
Comment = 5,
End = 6
}
interface TokenizerState {
data: string;
position: number;
length: number;
isEscaped: boolean;
currentLineNumber: number;
currentTokenType: CifTokenType;
currentTokenStart: number;
currentTokenEnd: number;
chunker: Computation.Chunker
}
/**
* Eat everything until a whitespace/newline occurs.
*/
function eatValue(state: TokenizerState) {
while (state.position < state.length) {
switch (state.data.charCodeAt(state.position)) {
case 9: // \t
case 10: // \n
case 13: // \r
case 32: // ' '
state.currentTokenEnd = state.position;
return;
default:
++state.position;
break;
}
}
state.currentTokenEnd = state.position;
}
/**
* Eats an escaped values. Handles the "degenerate" cases as well.
*
* "Degenerate" cases:
* - 'xx'x' => xx'x
* - 'xxxNEWLINE => 'xxx
*
*/
function eatEscaped(state: TokenizerState, esc: number) {
let next: number, c: number;
++state.position;
while (state.position < state.length) {
c = state.data.charCodeAt(state.position);
if (c === esc) {
next = state.data.charCodeAt(state.position + 1);
switch (next) {
case 9: // \t
case 10: // \n
case 13: // \r
case 32: // ' '
// get rid of the quotes.
state.currentTokenStart++;
state.currentTokenEnd = state.position;
state.isEscaped = true;
++state.position;
return;
default:
if (next === void 0) { // = "end of stream"
// get rid of the quotes.
state.currentTokenStart++;
state.currentTokenEnd = state.position;
state.isEscaped = true;
++state.position;
return;
}
++state.position;
break;
}
} else {
// handle 'xxxNEWLINE => 'xxx
if (c === 10 || c === 13) {
state.currentTokenEnd = state.position;
return;
}
++state.position;
}
}
state.currentTokenEnd = state.position;
}
/**
* Eats a multiline token of the form NL;....NL;
*/
function eatMultiline(state: TokenizerState) {
let prev = 59, pos = state.position + 1, c: number;
while (pos < state.length) {
c = state.data.charCodeAt(pos);
if (c === 59 && (prev === 10 || prev === 13)) { // ;, \n \r
state.position = pos + 1;
// get rid of the ;
state.currentTokenStart++;
// remove trailing newlines
pos--;
c = state.data.charCodeAt(pos);
while (c === 10 || c === 13) {
pos--;
c = state.data.charCodeAt(pos);
}
state.currentTokenEnd = pos + 1;
state.isEscaped = true;
return;
} else {
// handle line numbers
if (c === 13) { // \r
state.currentLineNumber++;
} else if (c === 10 && prev !== 13) { // \r\n
state.currentLineNumber++;
}
prev = c;
++pos;
}
}
state.position = pos;
return prev;
}
/**
* Skips until \n or \r occurs -- therefore the newlines get handled by the "skipWhitespace" function.
*/
function skipCommentLine(state: TokenizerState) {
while (state.position < state.length) {
let c = state.data.charCodeAt(state.position);
if (c === 10 || c === 13) {
return;
}
++state.position;
}
}
/**
* Skips all the whitespace - space, tab, newline, CR
* Handles incrementing line count.
*/
function skipWhitespace(state: TokenizerState): number {
let prev = 10;
while (state.position < state.length) {
let c = state.data.charCodeAt(state.position);
switch (c) {
case 9: // '\t'
case 32: // ' '
prev = c;
++state.position;
break;
case 10: // \n
// handle \r\n
if (prev !== 13) {
++state.currentLineNumber;
}
prev = c;
++state.position;
break;
case 13: // \r
prev = c;
++state.position;
++state.currentLineNumber;
break;
default:
return prev;
}
}
return prev;
}
function isData(state: TokenizerState): boolean {
// here we already assume the 5th char is _ and that the length >= 5
// d/D
let c = state.data.charCodeAt(state.currentTokenStart);
if (c !== 68 && c !== 100) return false;
// a/A
c = state.data.charCodeAt(state.currentTokenStart + 1);
if (c !== 65 && c !== 97) return false;
// t/t
c = state.data.charCodeAt(state.currentTokenStart + 2);
if (c !== 84 && c !== 116) return false;
// a/A
c = state.data.charCodeAt(state.currentTokenStart + 3);
if (c !== 65 && c !== 97) return false;
return true;
}
function isSave(state: TokenizerState): boolean {
// here we already assume the 5th char is _ and that the length >= 5
// s/S
let c = state.data.charCodeAt(state.currentTokenStart);
if (c !== 83 && c !== 115) return false;
// a/A
c = state.data.charCodeAt(state.currentTokenStart + 1);
if (c !== 65 && c !== 97) return false;
// v/V
c = state.data.charCodeAt(state.currentTokenStart + 2);
if (c !== 86 && c !== 118) return false;
// e/E
c = state.data.charCodeAt(state.currentTokenStart + 3);
if (c !== 69 && c !== 101) return false;
return true;
}
function isLoop(state: TokenizerState): boolean {
// here we already assume the 5th char is _ and that the length >= 5
if (state.currentTokenEnd - state.currentTokenStart !== 5) return false;
// l/L
let c = state.data.charCodeAt(state.currentTokenStart);
if (c !== 76 && c !== 108) return false;
// o/O
c = state.data.charCodeAt(state.currentTokenStart + 1);
if (c !== 79 && c !== 111) return false;
// o/O
c = state.data.charCodeAt(state.currentTokenStart + 2);
if (c !== 79 && c !== 111) return false;
// p/P
c = state.data.charCodeAt(state.currentTokenStart + 3);
if (c !== 80 && c !== 112) return false;
return true;
}
/**
* Checks if the current token shares the namespace with string at <start,end).
*/
function isNamespace(state: TokenizerState, start: number, end: number): boolean {
let i: number,
nsLen = end - start,
offset = state.currentTokenStart - start,
tokenLen = state.currentTokenEnd - state.currentTokenStart;
if (tokenLen < nsLen) return false;
for (i = start; i < end; ++i) {
if (state.data.charCodeAt(i) !== state.data.charCodeAt(i + offset)) return false;
}
if (nsLen === tokenLen) return true;
if (state.data.charCodeAt(i + offset) === 46) { // .
return true;
}
return false;
}
/**
* Returns the index of '.' in the current token. If no '.' is present, returns currentTokenEnd.
*/
function getNamespaceEnd(state: TokenizerState): number {
let i: number;
for (i = state.currentTokenStart; i < state.currentTokenEnd; ++i) {
if (state.data.charCodeAt(i) === 46) return i;
}
return i;
}
/**
* Get the namespace string. endIndex is obtained by the getNamespaceEnd() function.
*/
function getNamespace(state: TokenizerState, endIndex: number) {
return state.data.substring(state.currentTokenStart, endIndex);
}
/**
* String representation of the current token.
*/
function getTokenString(state: TokenizerState) {
return state.data.substring(state.currentTokenStart, state.currentTokenEnd);
}
/**
* Move to the next token.
*/
function moveNextInternal(state: TokenizerState) {
let prev = skipWhitespace(state);
if (state.position >= state.length) {
state.currentTokenType = CifTokenType.End;
return;
}
state.currentTokenStart = state.position;
state.currentTokenEnd = state.position;
state.isEscaped = false;
let c = state.data.charCodeAt(state.position);
switch (c) {
case 35: // #, comment
skipCommentLine(state);
state.currentTokenType = CifTokenType.Comment;
break;
case 34: // ", escaped value
case 39: // ', escaped value
eatEscaped(state, c);
state.currentTokenType = CifTokenType.Value;
break;
case 59: // ;, possible multiline value
// multiline value must start at the beginning of the line.
if (prev === 10 || prev === 13) { // /n or /r
eatMultiline(state);
} else {
eatValue(state);
}
state.currentTokenType = CifTokenType.Value;
break;
default:
eatValue(state);
// escaped is always Value
if (state.isEscaped) {
state.currentTokenType = CifTokenType.Value;
// _ always means column name
} else if (state.data.charCodeAt(state.currentTokenStart) === 95) { // _
state.currentTokenType = CifTokenType.ColumnName;
// 5th char needs to be _ for data_ or loop_
} else if (state.currentTokenEnd - state.currentTokenStart >= 5 && state.data.charCodeAt(state.currentTokenStart + 4) === 95) {
if (isData(state)) state.currentTokenType = CifTokenType.Data;
else if (isSave(state)) state.currentTokenType = CifTokenType.Save;
else if (isLoop(state)) state.currentTokenType = CifTokenType.Loop;
else state.currentTokenType = CifTokenType.Value;
// all other tests failed, we are at Value token.
} else {
state.currentTokenType = CifTokenType.Value;
}
break;
}
}
/**
* Moves to the next non-comment token.
*/
function moveNext(state: TokenizerState) {
moveNextInternal(state);
while (state.currentTokenType === CifTokenType.Comment) moveNextInternal(state);
}
function createTokenizer(data: string, ctx: Computation.Context): TokenizerState {
return {
data,
length: data.length,
position: 0,
currentTokenStart: 0,
currentTokenEnd: 0,
currentTokenType: CifTokenType.End,
currentLineNumber: 1,
isEscaped: false,
chunker: Computation.chunker(ctx, 1000000)
};
}
/**
* Helper shape of the category result.
*/
interface CifCategoryResult {
hasError: boolean;
errorLine: number;
errorMessage: string;
}
/**
* Reads a category containing a single row.
*/
function handleSingle(tokenizer: TokenizerState, categories: { [name: string]: Data.Category }): CifCategoryResult {
const nsStart = tokenizer.currentTokenStart, nsEnd = getNamespaceEnd(tokenizer);
const name = getNamespace(tokenizer, nsEnd);
const fields = Object.create(null);
let readingNames = true;
while (readingNames) {
if (tokenizer.currentTokenType !== CifTokenType.ColumnName || !isNamespace(tokenizer, nsStart, nsEnd)) {
readingNames = false;
break;
}
const fieldName = getTokenString(tokenizer).substring(name.length + 1);
moveNext(tokenizer);
if (tokenizer.currentTokenType as any !== CifTokenType.Value) {
return {
hasError: true,
errorLine: tokenizer.currentLineNumber,
errorMessage: 'Expected value.'
}
}
fields[fieldName] = Field({ data: tokenizer.data, indices: [tokenizer.currentTokenStart, tokenizer.currentTokenEnd], count: 1 }, 1);
moveNext(tokenizer);
}
categories[name] = Data.Category(1, fields);
return {
hasError: false,
errorLine: 0,
errorMessage: ''
};
}
interface LoopReadState {
tokenizer: TokenizerState,
tokens: Tokens[],
fieldCount: number,
tokenCount: number
}
function readLoopChunk(state: LoopReadState, chunkSize: number) {
//console.log(chunkSize);
const { tokenizer, tokens, fieldCount } = state;
let tokenCount = state.tokenCount;
let counter = 0;
while (tokenizer.currentTokenType === CifTokenType.Value && counter < chunkSize) {
TokenBuilder.add(tokens[(tokenCount++) % fieldCount], tokenizer.currentTokenStart, tokenizer.currentTokenEnd);
moveNext(tokenizer);
counter++;
}
state.tokenCount = tokenCount;
return counter; //tokenizer.currentTokenType === CifTokenType.Value;
}
function readLoopChunks(state: LoopReadState) {
const { chunker } = state.tokenizer;
// while (readLoopChunk(state, computation.chunkSize)) {
// if (computation.requiresUpdate) {
// await computation.updateProgress('Parsing...', void 0, state.tokenizer.position, state.tokenizer.data.length);
// }
// }
return chunker.process(
chunkSize => readLoopChunk(state, chunkSize),
update => update('Parsing...', void 0, state.tokenizer.position, state.tokenizer.data.length));
}
/**
* Reads a loop.
*/
async function handleLoop(tokenizer: TokenizerState, categories: { [name: string]: Data.Category }): Promise<CifCategoryResult> {
const loopLine = tokenizer.currentLineNumber;
moveNext(tokenizer);
const name = getNamespace(tokenizer, getNamespaceEnd(tokenizer));
const fieldNames: string[] = [];
while (tokenizer.currentTokenType === CifTokenType.ColumnName) {
fieldNames[fieldNames.length] = getTokenString(tokenizer).substring(name.length + 1);
moveNext(tokenizer);
}
const rowCountEstimate = name === '_atom_site' ? (tokenizer.data.length / 100) | 0 : 32;
const tokens: Tokens[] = [];
const fieldCount = fieldNames.length;
for (let i = 0; i < fieldCount; i++) tokens[i] = TokenBuilder.create(tokenizer, rowCountEstimate);
const state: LoopReadState = {
fieldCount,
tokenCount: 0,
tokenizer,
tokens
};
// let tokenCount = 0;
// while (tokenizer.currentTokenType === CifTokenType.Value) {
// TokenBuilder.add(tokens[(tokenCount++) % fieldCount], tokenizer.currentTokenStart, tokenizer.currentTokenEnd);
// moveNext(tokenizer);
// }
await readLoopChunks(state);
if (state.tokenCount % fieldCount !== 0) {
return {
hasError: true,
errorLine: tokenizer.currentLineNumber,
errorMessage: 'The number of values for loop starting at line ' + loopLine + ' is not a multiple of the number of columns.'
};
}
const rowCount = (state.tokenCount / fieldCount) | 0;
const fields = Object.create(null);
for (let i = 0; i < fieldCount; i++) {
fields[fieldNames[i]] = Field(tokens[i], rowCount);
}
categories[name] = Data.Category(rowCount, fields);
return {
hasError: false,
errorLine: 0,
errorMessage: ''
};
}
/**
* Creates an error result.
*/
function error(line: number, message: string) {
return Result.error<Data.File>(message, line);
}
/**
* Creates a data result.
*/
function result(data: Data.File) {
return Result.success(data);
}
/**
* Parses an mmCIF file.
*
* @returns CifParserResult wrapper of the result.
*/
async function parseInternal(data: string, ctx: Computation.Context) {
const dataBlocks: Data.Block[] = [];
const tokenizer = createTokenizer(data, ctx);
let blockHeader: string = '';
let blockCategories = Object.create(null);
//saveFrame = new DataBlock(data, "empty"),
//inSaveFrame = false,
//blockSaveFrames: any;
ctx.updateProgress('Parsing...');
moveNext(tokenizer);
while (tokenizer.currentTokenType !== CifTokenType.End) {
let token = tokenizer.currentTokenType;
// Data block
if (token === CifTokenType.Data) {
// if (inSaveFrame) {
// return error(tokenizer.currentLineNumber, "Unexpected data block inside a save frame.");
// }
if (Object.keys(blockCategories).length > 0) {
dataBlocks.push(Data.Block(blockCategories, blockHeader));
}
blockHeader = data.substring(tokenizer.currentTokenStart + 5, tokenizer.currentTokenEnd);
blockCategories = Object.create(null);
moveNext(tokenizer);
}
/* // Save frame
} else if (token === CifTokenType.Save) {
id = data.substring(tokenizer.currentTokenStart + 5, tokenizer.currentTokenEnd);
if (id.length === 0) {
if (saveFrame.categories.length > 0) {
blockSaveFrames = blockCategories.additionalData["saveFrames"];
if (!blockSaveFrames) {
blockSaveFrames = [];
blockCategories.additionalData["saveFrames"] = blockSaveFrames;
}
blockSaveFrames[blockSaveFrames.length] = saveFrame;
}
inSaveFrame = false;
} else {
if (inSaveFrame) {
return error(tokenizer.currentLineNumber, "Save frames cannot be nested.");
}
inSaveFrame = true;
saveFrame = new DataBlock(data, id);
}
moveNext(tokenizer);
// Loop
} */ else if (token === CifTokenType.Loop) {
const cat = await handleLoop(tokenizer, /*inSaveFrame ? saveFrame : */ blockCategories);
if (cat.hasError) {
return error(cat.errorLine, cat.errorMessage);
}
// Single row
} else if (token === CifTokenType.ColumnName) {
const cat = handleSingle(tokenizer, /*inSaveFrame ? saveFrame :*/ blockCategories);
if (cat.hasError) {
return error(cat.errorLine, cat.errorMessage);
}
// Out of options
} else {
return error(tokenizer.currentLineNumber, 'Unexpected token. Expected data_, loop_, or data name.');
}
}
// Check if the latest save frame was closed.
// if (inSaveFrame) {
// return error(tokenizer.currentLineNumber, "Unfinished save frame (`" + saveFrame.header + "`).");
// }
if (Object.keys(blockCategories).length > 0) {
dataBlocks.push(Data.Block(blockCategories, blockHeader));
}
return result(Data.File(dataBlocks));
}
export default function parse(data: string) {
return Computation.create<Result<Data.File>>(async ctx => {
return await parseInternal(data, ctx);
});
}