diff --git a/src/mol-io/reader/ccp4/parser.ts b/src/mol-io/reader/ccp4/parser.ts
index 43cc24a8302bba97fc893f7eaee26ebcdabfb277..d48c343e5a63e4df3623551bd71b0f95f9aff4ce 100644
--- a/src/mol-io/reader/ccp4/parser.ts
+++ b/src/mol-io/reader/ccp4/parser.ts
@@ -6,7 +6,7 @@
 
 import { Task, RuntimeContext } from 'mol-task';
 import { Ccp4File, Ccp4Header } from './schema'
-import Result from '../result'
+import { ReaderResult as Result } from '../result'
 import { FileHandle } from '../../common/file-handle';
 
 async function parseInternal(file: FileHandle, ctx: RuntimeContext): Promise<Result<Ccp4File>> {
diff --git a/src/mol-io/reader/cif/binary/parser.ts b/src/mol-io/reader/cif/binary/parser.ts
index 8a5f0ea1a8133aed0254577b2791d75952042183..4cf3ae57f882844f856d28612a943fd7a3515179 100644
--- a/src/mol-io/reader/cif/binary/parser.ts
+++ b/src/mol-io/reader/cif/binary/parser.ts
@@ -7,7 +7,7 @@
 import * as Data from '../data-model'
 import { EncodedCategory, EncodedFile } from '../../../common/binary-cif'
 import Field from './field'
-import Result from '../../result'
+import { ReaderResult as Result } from '../../result'
 import decodeMsgPack from '../../../common/msgpack/decode'
 import { Task } from 'mol-task'
diff --git a/src/mol-io/reader/cif/text/field.ts b/src/mol-io/reader/cif/text/field.ts
index f4e08a090352acb330abfba07c340e820cbd8d9e..3248cd1eff26dd48478d9dba29184b23f71cf04d 100644
--- a/src/mol-io/reader/cif/text/field.ts
+++ b/src/mol-io/reader/cif/text/field.ts
@@ -29,8 +29,9 @@ export default function CifTextField(tokens: Tokens, rowCount: number): Data.Cif
     };
 
     const valueKind: Data.CifField['valueKind'] = row => {
-        const s = indices[2 * row];
-        if (indices[2 * row + 1] - s !== 1) return Column.ValueKind.Present;
+        const s = indices[2 * row], l = indices[2 * row + 1] - s;
+        if (l > 1) return Column.ValueKind.Present;
+        if (l === 0) return Column.ValueKind.NotPresent;
         const v = data.charCodeAt(s);
         if (v === 46 /* . */) return Column.ValueKind.NotPresent;
         if (v === 63 /* ? */) return Column.ValueKind.Unknown;
@@ -51,4 +52,49 @@ export default function CifTextField(tokens: Tokens, rowCount: number): Data.Cif
         toIntArray: params => ColumnHelpers.createAndFillArray(rowCount, int, params),
         toFloatArray: params => ColumnHelpers.createAndFillArray(rowCount, float, params)
     }
+}
+
+export function CifTextValueField(values: string[]): Data.CifField {
+    const rowCount = values.length;
+
+    const str: Data.CifField['str'] = row => {
+        const ret = values[row];
+        if (!ret || ret === '.' || ret === '?') return '';
+        return ret;
+    };
+
+    const int: Data.CifField['int'] = row => {
+        const v = values[row];
+        return fastParseInt(v, 0, v.length) || 0;
+    };
+
+    const float: Data.CifField['float'] = row => {
+        const v = values[row];
+        return fastParseFloat(v, 0, v.length) || 0;
+    };
+
+    const valueKind: Data.CifField['valueKind'] = row => {
+        const v = values[row], l = v.length;
+        if (l > 1) return Column.ValueKind.Present;
+        if (l === 0) return Column.ValueKind.NotPresent;
+        const c = v.charCodeAt(0);
+        if (c === 46 /* . */) return Column.ValueKind.NotPresent;
+        if (c === 63 /* ? */) return Column.ValueKind.Unknown;
+        return Column.ValueKind.Present;
+    };
+
+    return {
+        __array: void 0,
+        binaryEncoding: void 0,
+        isDefined: true,
+        rowCount,
+        str,
+        int,
+        float,
+        valueKind,
+        areValuesEqual: (rowA, rowB) => values[rowA] === values[rowB],
+        toStringArray: params => ColumnHelpers.createAndFillArray(rowCount, str, params),
+        toIntArray: params => ColumnHelpers.createAndFillArray(rowCount, int, params),
+        toFloatArray: params => ColumnHelpers.createAndFillArray(rowCount, float, params)
+    }
 }
\ No newline at end of file
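Note: `CifTextValueField` complements the token-based `CifTextField` by backing a `CifField` with an ordinary string array. A hypothetical usage sketch (not part of the diff) showing the resulting column semantics:

```ts
import { CifTextValueField } from 'mol-io/reader/cif/text/field';
import { Column } from 'mol-data/db';

// A field backed by plain string values instead of (data, indices) token pairs.
const id = CifTextValueField(['1', '42', '?']);
console.log(id.int(1));                                    // 42
console.log(id.str(2));                                    // '' ('?' is a missing value)
console.log(id.valueKind(2) === Column.ValueKind.Unknown); // true
```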
diff --git a/src/mol-io/reader/cif/text/parser.ts b/src/mol-io/reader/cif/text/parser.ts
index 3ee75e270731600d06f6ab72b39d9359c1227f06..ae617076ed75406d12b3925be29a381b5ea60c82 100644
--- a/src/mol-io/reader/cif/text/parser.ts
+++ b/src/mol-io/reader/cif/text/parser.ts
@@ -25,7 +25,7 @@
 import * as Data from '../data-model'
 import Field from './field'
 import { Tokens, TokenBuilder } from '../../common/text/tokenizer'
-import Result from '../../result'
+import { ReaderResult as Result } from '../../result'
 import { Task, RuntimeContext, chunkedSubtask } from 'mol-task'
 
 /**
@@ -507,7 +507,7 @@ async function handleLoop(tokenizer: TokenizerState, ctx: FrameContext): Promise
     const rowCountEstimate = name === '_atom_site' ? (tokenizer.data.length / 100) | 0 : 32;
     const tokens: Tokens[] = [];
     const fieldCount = fieldNames.length;
-    for (let i = 0; i < fieldCount; i++) tokens[i] = TokenBuilder.create(tokenizer, rowCountEstimate);
+    for (let i = 0; i < fieldCount; i++) tokens[i] = TokenBuilder.create(tokenizer.data, rowCountEstimate);
 
     const state: LoopReadState = {
         fieldCount,
diff --git a/src/mol-io/reader/common/text/tokenizer.ts b/src/mol-io/reader/common/text/tokenizer.ts
index 60f14c1b135bfca7d215f3a3d8939997b4ec2ae5..8664601d45a214e2a15e934af85a8584d827444c 100644
--- a/src/mol-io/reader/common/text/tokenizer.ts
+++ b/src/mol-io/reader/common/text/tokenizer.ts
@@ -8,7 +8,9 @@
 
 import { chunkedSubtask, RuntimeContext } from 'mol-task'
 
-export interface Tokenizer {
+export { Tokenizer }
+
+interface Tokenizer {
     data: string,
 
     position: number,
@@ -25,7 +27,7 @@ export interface Tokens {
     indices: ArrayLike<number>
 }
 
-export function Tokenizer(data: string): Tokenizer {
+function Tokenizer(data: string): Tokenizer {
     return {
         data,
         position: 0,
@@ -36,7 +38,7 @@ export function Tokenizer(data: string): Tokenizer {
     };
 }
 
-export namespace Tokenizer {
+namespace Tokenizer {
     export function getTokenString(state: Tokenizer) {
         return state.data.substring(state.tokenStart, state.tokenEnd);
     }
@@ -52,7 +54,7 @@ export namespace Tokenizer {
     /**
     * Eat everything until a newline occurs.
     */
-    export function eatLine(state: Tokenizer) {
+    export function eatLine(state: Tokenizer): boolean {
        const { data } = state;
        while (state.position < state.length) {
            switch (data.charCodeAt(state.position)) {
@@ -60,7 +62,7 @@ export namespace Tokenizer {
                    state.tokenEnd = state.position;
                    ++state.position;
                    ++state.lineNumber;
-                    return;
+                    return true;
                case 13: // \r
                    state.tokenEnd = state.position;
                    ++state.position;
@@ -68,13 +70,14 @@ export namespace Tokenizer {
                    if (data.charCodeAt(state.position) === 10) {
                        ++state.position;
                    }
-                    return;
+                    return true;
                default:
                    ++state.position;
                    break;
            }
        }
        state.tokenEnd = state.position;
+        return state.tokenStart !== state.tokenEnd;
    }
 
    /** Sets the current token start to the current position */
@@ -85,7 +88,7 @@ export namespace Tokenizer {
    /** Sets the current token start to current position and moves to the next line. */
    export function markLine(state: Tokenizer) {
        state.tokenStart = state.position;
-        eatLine(state);
+        return eatLine(state);
    }
 
    /** Advance the state by the given number of lines and return line starts/ends as tokens. */
@@ -95,15 +98,18 @@ export namespace Tokenizer {
    }
 
    function readLinesChunk(state: Tokenizer, count: number, tokens: Tokens) {
+        let read = 0;
        for (let i = 0; i < count; i++) {
-            markLine(state);
+            if (!markLine(state)) return read;
            TokenBuilder.addUnchecked(tokens, state.tokenStart, state.tokenEnd);
+            read++;
        }
+        return read;
    }
 
    /** Advance the state by the given number of lines and return line starts/ends as tokens. */
    export function readLines(state: Tokenizer, count: number): Tokens {
-        const lineTokens = TokenBuilder.create(state, count * 2);
+        const lineTokens = TokenBuilder.create(state.data, count * 2);
        readLinesChunk(state, count, lineTokens);
        return lineTokens;
    }
@@ -111,7 +117,7 @@ export namespace Tokenizer {
    /** Advance the state by the given number of lines and return line starts/ends as tokens. */
    export async function readLinesAsync(state: Tokenizer, count: number, ctx: RuntimeContext, initialLineCount = 100000): Promise<Tokens> {
        const { length } = state;
-        const lineTokens = TokenBuilder.create(state, count * 2);
+        const lineTokens = TokenBuilder.create(state.data, count * 2);
 
        let linesAlreadyRead = 0;
        await chunkedSubtask(ctx, initialLineCount, state, (chunkSize, state) => {
@@ -124,6 +130,37 @@ export namespace Tokenizer {
        return lineTokens;
    }
 
+    export function readAllLines(data: string) {
+        const state = Tokenizer(data);
+        const tokens = TokenBuilder.create(state.data, Math.max(data.length / 80, 2));
+        while (markLine(state)) {
+            TokenBuilder.add(tokens, state.tokenStart, state.tokenEnd);
+        }
+        return tokens;
+    }
+
+    function readLinesChunkChecked(state: Tokenizer, count: number, tokens: Tokens) {
+        let read = 0;
+        for (let i = 0; i < count; i++) {
+            if (!markLine(state)) return read;
+            TokenBuilder.add(tokens, state.tokenStart, state.tokenEnd);
+            read++;
+        }
+        return read;
+    }
+
+    export async function readAllLinesAsync(data: string, ctx: RuntimeContext, chunkSize = 100000) {
+        const state = Tokenizer(data);
+        const tokens = TokenBuilder.create(state.data, Math.max(data.length / 80, 2));
+
+        await chunkedSubtask(ctx, chunkSize, state, (chunkSize, state) => {
+            readLinesChunkChecked(state, chunkSize, tokens);
+            return state.position < state.length ? chunkSize : 0;
+        }, (ctx, state) => ctx.update({ message: 'Parsing...', current: state.position, max: data.length }));
+
+        return tokens;
+    }
+
    /**
     * Eat everything until a whitespace/newline occurs.
     */
@@ -234,16 +271,14 @@ export namespace TokenBuilder {
        tokens.count++;
    }
 
-    export function create(tokenizer: Tokenizer, size: number): Tokens {
+    export function create(data: string, size: number): Tokens {
        size = Math.max(10, size)
        return <Builder>{
-            data: tokenizer.data,
+            data,
            indicesLenMinus2: (size - 2) | 0,
            count: 0,
            offset: 0,
            indices: new Uint32Array(size)
        }
    }
-}
-
-export default Tokenizer
\ No newline at end of file
+}
\ No newline at end of file
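Note: with `eatLine`/`markLine` now reporting whether a token was actually consumed, the new `readAllLines`/`readAllLinesAsync` helpers can tokenize a whole buffer without a precomputed line count (the progress `max` is assumed to be `data.length`; the flattened patch read `max: length`, which would resolve to the global). A hypothetical usage sketch:

```ts
import { Tokenizer } from 'mol-io/reader/common/text/tokenizer';

const lines = Tokenizer.readAllLines('HEADER    TEST\nEND\n');
for (let i = 0; i < lines.count; i++) {
    // Each line is stored as a (start, end) index pair into the original string.
    const start = lines.indices[2 * i], end = lines.indices[2 * i + 1];
    console.log(lines.data.substring(start, end)); // 'HEADER    TEST', then 'END'
}
```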
diff --git a/src/mol-io/reader/csv/parser.ts b/src/mol-io/reader/csv/parser.ts
index d5bc68535344ff6c9d7aed63445b26e19a9220c7..6b7e14c5fe5e137206b98d5e6a17e0d4091aa7c4 100644
--- a/src/mol-io/reader/csv/parser.ts
+++ b/src/mol-io/reader/csv/parser.ts
@@ -8,7 +8,7 @@
 import { Tokens, TokenBuilder, Tokenizer } from '../common/text/tokenizer'
 import * as Data from './data-model'
 import Field from './field'
-import Result from '../result'
+import { ReaderResult as Result } from '../result'
 import { Task, RuntimeContext, chunkedSubtask, } from 'mol-task'
 
 const enum CsvTokenType {
@@ -231,7 +231,7 @@ function readRecordsChunks(state: State) {
 
 function addColumn (state: State) {
    state.columnNames.push(Tokenizer.getTokenString(state.tokenizer))
-    state.tokens.push(TokenBuilder.create(state.tokenizer, state.data.length / 80))
+    state.tokens.push(TokenBuilder.create(state.tokenizer.data, state.data.length / 80))
 }
 
 function init(state: State) {
diff --git a/src/mol-io/reader/dsn6/parser.ts b/src/mol-io/reader/dsn6/parser.ts
index a77c968662036b64dab1f21d0df9a90edd3d117d..35416d7a82ec686f5846ce84eabacc448ced77d3 100644
--- a/src/mol-io/reader/dsn6/parser.ts
+++ b/src/mol-io/reader/dsn6/parser.ts
@@ -6,7 +6,7 @@
 
 import { Task, RuntimeContext } from 'mol-task';
 import { Dsn6File, Dsn6Header } from './schema'
-import Result from '../result'
+import { ReaderResult as Result } from '../result'
 import { FileHandle } from '../../common/file-handle';
 
 function parseBrixHeader(str: string): Dsn6Header {
diff --git a/src/mol-io/reader/gro/parser.ts b/src/mol-io/reader/gro/parser.ts
index 6183a9a5fee6e887e889b4bd6efd4cec5b3de42b..0367a3ee8bbb27a676679345d9d7551edbd9fdbe 100644
--- a/src/mol-io/reader/gro/parser.ts
+++ b/src/mol-io/reader/gro/parser.ts
@@ -6,10 +6,10 @@
 */
 
 import { Column } from 'mol-data/db'
-import Tokenizer from '../common/text/tokenizer'
+import { Tokenizer } from '../common/text/tokenizer'
 import FixedColumn from '../common/text/column/fixed'
 import * as Schema from './schema'
-import Result from '../result'
+import { ReaderResult as Result } from '../result'
 import { Task, RuntimeContext } from 'mol-task'
 
 interface State {
diff --git a/src/mol-io/reader/mol2/parser.ts b/src/mol-io/reader/mol2/parser.ts
index 297e1502618329594b0966427649ba7b014aaa87..0a11a9a0aa93da4402fcf43f2c16270e4728d16e 100644
--- a/src/mol-io/reader/mol2/parser.ts
+++ b/src/mol-io/reader/mol2/parser.ts
@@ -15,7 +15,7 @@ import { Column } from 'mol-data/db'
 import { TokenBuilder, Tokenizer } from '../common/text/tokenizer'
 import TokenColumn from '../common/text/column/token'
 import * as Schema from './schema'
-import Result from '../result'
+import { ReaderResult as Result } from '../result'
 import { Task, RuntimeContext, chunkedSubtask } from 'mol-task'
 
 const { skipWhitespace, eatValue, markLine, getTokenString, readLine } = Tokenizer;
@@ -130,12 +130,12 @@ async function handleAtoms(state: State): Promise<Schema.Mol2Atoms> {
    }
 
    // required columns
-    const atom_idTokens = TokenBuilder.create(tokenizer, molecule.num_atoms * 2);
-    const atom_nameTokens = TokenBuilder.create(tokenizer, molecule.num_atoms * 2);
-    const xTokens = TokenBuilder.create(tokenizer, molecule.num_atoms * 2);
-    const yTokens = TokenBuilder.create(tokenizer, molecule.num_atoms * 2);
-    const zTokens = TokenBuilder.create(tokenizer, molecule.num_atoms * 2);
-    const atom_typeTokens = TokenBuilder.create(tokenizer, molecule.num_atoms * 2);
+    const atom_idTokens = TokenBuilder.create(tokenizer.data, molecule.num_atoms * 2);
+    const atom_nameTokens = TokenBuilder.create(tokenizer.data, molecule.num_atoms * 2);
+    const xTokens = TokenBuilder.create(tokenizer.data, molecule.num_atoms * 2);
+    const yTokens = TokenBuilder.create(tokenizer.data, molecule.num_atoms * 2);
+    const zTokens = TokenBuilder.create(tokenizer.data, molecule.num_atoms * 2);
+    const atom_typeTokens = TokenBuilder.create(tokenizer.data, molecule.num_atoms * 2);
 
    const atom_idTokenColumn = TokenColumn(atom_idTokens);
    const atom_nameTokenColumn = TokenColumn(atom_nameTokens);
@@ -145,10 +145,10 @@ async function handleAtoms(state: State): Promise<Schema.Mol2Atoms> {
    const atom_typeColumn = TokenColumn(atom_typeTokens);
 
    // optional columns
-    const subst_idTokens = TokenBuilder.create(tokenizer, molecule.num_atoms * 2);
-    const subst_nameTokens = TokenBuilder.create(tokenizer, molecule.num_atoms * 2);
-    const chargeTokens = TokenBuilder.create(tokenizer, molecule.num_atoms * 2);
-    const status_bitTokens = TokenBuilder.create(tokenizer, molecule.num_atoms * 2);
+    const subst_idTokens = TokenBuilder.create(tokenizer.data, molecule.num_atoms * 2);
+    const subst_nameTokens = TokenBuilder.create(tokenizer.data, molecule.num_atoms * 2);
+    const chargeTokens = TokenBuilder.create(tokenizer.data, molecule.num_atoms * 2);
+    const status_bitTokens = TokenBuilder.create(tokenizer.data, molecule.num_atoms * 2);
 
    const subst_idTokenColumn = TokenColumn(subst_idTokens);
    const subst_nameTokenColumn = TokenColumn(subst_nameTokens);
@@ -257,10 +257,10 @@ async function handleBonds(state: State): Promise<Schema.Mol2Bonds> {
    }
 
    // required columns
-    const bond_idTokens = TokenBuilder.create(tokenizer, molecule.num_bonds * 2);
-    const origin_bond_idTokens = TokenBuilder.create(tokenizer, molecule.num_bonds * 2);
-    const target_bond_idTokens = TokenBuilder.create(tokenizer, molecule.num_bonds * 2);
-    const bondTypeTokens = TokenBuilder.create(tokenizer, molecule.num_bonds * 2);
+    const bond_idTokens = TokenBuilder.create(tokenizer.data, molecule.num_bonds * 2);
+    const origin_bond_idTokens = TokenBuilder.create(tokenizer.data, molecule.num_bonds * 2);
+    const target_bond_idTokens = TokenBuilder.create(tokenizer.data, molecule.num_bonds * 2);
+    const bondTypeTokens = TokenBuilder.create(tokenizer.data, molecule.num_bonds * 2);
 
    const bond_idTokenColumn = TokenColumn(bond_idTokens);
    const origin_bond_idTokenColumn = TokenColumn(origin_bond_idTokens);
@@ -268,7 +268,7 @@ async function handleBonds(state: State): Promise<Schema.Mol2Bonds> {
    const bondTypeTokenColumn = TokenColumn(bondTypeTokens);
 
    // optional columns
-    const status_bitTokens = TokenBuilder.create(tokenizer, molecule.num_bonds * 2);
+    const status_bitTokens = TokenBuilder.create(tokenizer.data, molecule.num_bonds * 2);
    const status_bitTokenColumn = TokenColumn(status_bitTokens);
 
    const undefStr = Column.Undefined(molecule.num_bonds, Column.Schema.str);
diff --git a/src/mol-io/reader/obj/parser.ts b/src/mol-io/reader/obj/parser.ts
index 046143b305b51530b1e8d0e1cad7f8cb097fa50c..a8b113a33d0c88f5af90189e79359df8b5f097ae 100644
--- a/src/mol-io/reader/obj/parser.ts
+++ b/src/mol-io/reader/obj/parser.ts
@@ -4,7 +4,7 @@
 * @author Alexander Rose <alexander.rose@weirdbyte.de>
 */
 
-import Result from '../result'
+import { ReaderResult as Result } from '../result'
 import { Task, RuntimeContext } from 'mol-task'
 import { Mesh } from 'mol-geo/geometry/mesh/mesh';
diff --git a/src/mol-io/reader/pdb/parser.ts b/src/mol-io/reader/pdb/parser.ts
new file mode 100644
index 0000000000000000000000000000000000000000..600ac278e0bbf6290e773c028bd5d0140dd5d201
--- /dev/null
+++ b/src/mol-io/reader/pdb/parser.ts
@@ -0,0 +1,14 @@
+/**
+ * Copyright (c) 2019 mol* contributors, licensed under MIT, See LICENSE file for more info.
+ *
+ * @author David Sehnal <david.sehnal@gmail.com>
+ */
+
+import { PdbFile } from './schema';
+import { Task } from 'mol-task';
+import { ReaderResult } from '../result';
+import { Tokenizer } from '../common/text/tokenizer';
+
+export function parsePDB(data: string, id?: string): Task<ReaderResult<PdbFile>> {
+    return Task.create('Parse PDB', async ctx => ReaderResult.success({ id, lines: await Tokenizer.readAllLinesAsync(data, ctx) }));
+}
\ No newline at end of file
diff --git a/src/mol-io/reader/pdb/schema.ts b/src/mol-io/reader/pdb/schema.ts
new file mode 100644
index 0000000000000000000000000000000000000000..3031f1aea0e085a50ec47a7ae352534ef4e2ce4a
--- /dev/null
+++ b/src/mol-io/reader/pdb/schema.ts
@@ -0,0 +1,12 @@
+/**
+ * Copyright (c) 2019 mol* contributors, licensed under MIT, See LICENSE file for more info.
+ *
+ * @author David Sehnal <david.sehnal@gmail.com>
+ */
+
+import { Tokens } from '../common/text/tokenizer';
+
+export interface PdbFile {
+    id?: string,
+    lines: Tokens
+}
\ No newline at end of file
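Note: the new reader is intentionally thin — a `PdbFile` is just an optional id plus per-line tokens over the raw string. A hypothetical usage sketch (`pdbString` is a placeholder; the plain `run()` entry point from mol-task is assumed):

```ts
import { parsePDB } from 'mol-io/reader/pdb/parser';

async function readPdb(pdbString: string) {
    const result = await parsePDB(pdbString, '1abc').run();
    if (result.isError) throw new Error(result.toString());
    return result.result; // PdbFile: { id: '1abc', lines: Tokens }
}
```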
diff --git a/src/mol-io/reader/pdb/to-cif.ts b/src/mol-io/reader/pdb/to-cif.ts
new file mode 100644
index 0000000000000000000000000000000000000000..f206a7bb26744508ee2f5e304ff721917d2b2e49
--- /dev/null
+++ b/src/mol-io/reader/pdb/to-cif.ts
@@ -0,0 +1,280 @@
+/**
+ * Copyright (c) 2019 mol* contributors, licensed under MIT, See LICENSE file for more info.
+ *
+ * @author David Sehnal <david.sehnal@gmail.com>
+ */
+
+import { CifField, CifCategory } from '../cif';
+import { mmCIF_Schema } from '../cif/schema/mmcif';
+import CifTextField, { CifTextValueField } from '../cif/text/field';
+import { TokenBuilder, Tokenizer } from '../common/text/tokenizer';
+import { PdbFile } from './schema';
+import { CifFile } from '../cif/data-model';
+import { substringStartsWith } from 'mol-util/string';
+import { Task } from 'mol-task';
+
+function toCategory(name: string, fields: { [name: string]: CifField | undefined }, rowCount: number): CifCategory {
+    return {
+        name,
+        fieldNames: Object.keys(fields),
+        rowCount,
+        getField(f: string) {
+            return fields[f];
+        }
+    }
+}
+
+function _entity(): { [K in keyof mmCIF_Schema['entity']]?: CifField } {
+    return {
+        id: CifTextValueField(['1', '2', '3']),
+        type: CifTextValueField(['polymer', 'non-polymer', 'water'])
+    }
+}
+
+function atom_site_template(data: string, count: number) {
+    const str = () => new Array(count) as string[];
+    const ts = () => TokenBuilder.create(data, 2 * count);
+    return {
+        index: 0,
+        count,
+        group_PDB: ts(),
+        id: str(),
+        auth_atom_id: ts(),
+        label_alt_id: ts(),
+        auth_comp_id: ts(),
+        auth_asym_id: ts(),
+        auth_seq_id: ts(),
+        pdbx_PDB_ins_code: ts(),
+        Cartn_x: ts(),
+        Cartn_y: ts(),
+        Cartn_z: ts(),
+        occupancy: ts(),
+        B_iso_or_equiv: ts(),
+        type_symbol: ts(),
+        pdbx_PDB_model_num: str(),
+        label_entity_id: str()
+    };
+}
+
+function _atom_site(sites: AtomSiteTemplate): { [K in keyof mmCIF_Schema['atom_site']]?: CifField } {
+    const auth_asym_id = CifTextField(sites.auth_asym_id, sites.count);
+    const auth_atom_id = CifTextField(sites.auth_atom_id, sites.count);
+    const auth_comp_id = CifTextField(sites.auth_comp_id, sites.count);
+    const auth_seq_id = CifTextField(sites.auth_seq_id, sites.count);
+
+    return {
+        auth_asym_id,
+        auth_atom_id,
+        auth_comp_id,
+        auth_seq_id,
+        B_iso_or_equiv: CifTextField(sites.B_iso_or_equiv, sites.count),
+        Cartn_x: CifTextField(sites.Cartn_x, sites.count),
+        Cartn_y: CifTextField(sites.Cartn_y, sites.count),
+        Cartn_z: CifTextField(sites.Cartn_z, sites.count),
+        group_PDB: CifTextField(sites.group_PDB, sites.count),
+        id: CifTextValueField(sites.id),
+
+        label_alt_id: CifTextField(sites.label_alt_id, sites.count),
+
+        label_asym_id: auth_asym_id,
+        label_atom_id: auth_atom_id,
+        label_comp_id: auth_comp_id,
+        label_seq_id: auth_seq_id,
+        label_entity_id: CifTextValueField(sites.label_entity_id),
+
+        occupancy: CifTextField(sites.occupancy, sites.count),
+        type_symbol: CifTextField(sites.type_symbol, sites.count),
+
+        pdbx_PDB_ins_code: CifTextField(sites.pdbx_PDB_ins_code, sites.count),
+        pdbx_PDB_model_num: CifTextValueField(sites.pdbx_PDB_model_num)
+    };
+}
+
+const WaterNames = new Set([ 'SOL', 'WAT', 'HOH', 'H2O', 'W', 'DOD', 'D3O', 'TIP3', 'TIP4', 'SPC' ]);
+
+function getEntityId(residueName: string, isHet: boolean) {
+    if (isHet) {
+        if (WaterNames.has(residueName)) return '3';
+        return '2';
+    }
+    return '1';
+}
+
+function addAtom(sites: AtomSiteTemplate, model: string, data: Tokenizer, s: number, e: number, isHet: boolean) {
+    const { data: str } = data;
+    let startPos = s;
+    let start = s;
+    const end = e;
+    const length = end - start;
+
+    // TODO: filter invalid atoms
+
+    // COLUMNS        DATA TYPE       CONTENTS
+    // --------------------------------------------------------------------------------
+    //  1 -  6        Record name     "ATOM  "
+    Tokenizer.trim(data, start, start + 6);
+    TokenBuilder.add(sites.group_PDB, data.tokenStart, data.tokenEnd);
+
+    //  7 - 11        Integer         Atom serial number.
+    // TODO: support HEX
+    start = startPos + 6;
+    Tokenizer.trim(data, start, start + 5);
+    sites.id[sites.index] = data.data.substring(data.tokenStart, data.tokenEnd);
+
+    // 13 - 16        Atom            Atom name.
+    start = startPos + 12;
+    Tokenizer.trim(data, start, start + 4);
+    TokenBuilder.add(sites.auth_atom_id, data.tokenStart, data.tokenEnd);
+
+    // 17             Character       Alternate location indicator.
+    if (str.charCodeAt(startPos + 16) === 32) { // ' '
+        TokenBuilder.add(sites.label_alt_id, 0, 0);
+    } else {
+        TokenBuilder.add(sites.label_alt_id, startPos + 16, startPos + 17);
+    }
+
+    // 18 - 20        Residue name    Residue name.
+    start = startPos + 17;
+    Tokenizer.trim(data, start, start + 3);
+    TokenBuilder.add(sites.auth_comp_id, data.tokenStart, data.tokenEnd);
+    const residueName = str.substring(data.tokenStart, data.tokenEnd);
+
+    // 22             Character       Chain identifier.
+    TokenBuilder.add(sites.auth_asym_id, startPos + 21, startPos + 22);
+
+    // 23 - 26        Integer         Residue sequence number.
+    // TODO: support HEX
+    start = startPos + 22;
+    Tokenizer.trim(data, start, start + 4);
+    TokenBuilder.add(sites.auth_seq_id, data.tokenStart, data.tokenEnd);
+
+    // 27             AChar           Code for insertion of residues.
+    // (written to pdbx_PDB_ins_code; column 17 already filled label_alt_id)
+    if (str.charCodeAt(startPos + 26) === 32) { // ' '
+        TokenBuilder.add(sites.pdbx_PDB_ins_code, 0, 0);
+    } else {
+        TokenBuilder.add(sites.pdbx_PDB_ins_code, startPos + 26, startPos + 27);
+    }
+
+    // 31 - 38        Real(8.3)       Orthogonal coordinates for X in Angstroms.
+    start = startPos + 30;
+    Tokenizer.trim(data, start, start + 8);
+    TokenBuilder.add(sites.Cartn_x, data.tokenStart, data.tokenEnd);
+
+    // 39 - 46        Real(8.3)       Orthogonal coordinates for Y in Angstroms.
+    start = startPos + 38;
+    Tokenizer.trim(data, start, start + 8);
+    TokenBuilder.add(sites.Cartn_y, data.tokenStart, data.tokenEnd);
+
+    // 47 - 54        Real(8.3)       Orthogonal coordinates for Z in Angstroms.
+    start = startPos + 46;
+    Tokenizer.trim(data, start, start + 8);
+    TokenBuilder.add(sites.Cartn_z, data.tokenStart, data.tokenEnd);
+
+    // 55 - 60        Real(6.2)       Occupancy.
+    start = startPos + 54;
+    Tokenizer.trim(data, start, start + 6);
+    TokenBuilder.add(sites.occupancy, data.tokenStart, data.tokenEnd);
+
+    // 61 - 66        Real(6.2)       Temperature factor (Default = 0.0).
+    if (length >= 66) {
+        start = startPos + 60;
+        Tokenizer.trim(data, start, start + 6);
+        TokenBuilder.add(sites.B_iso_or_equiv, data.tokenStart, data.tokenEnd);
+    } else {
+        // short line: record an empty token for this column
+        TokenBuilder.add(sites.B_iso_or_equiv, 0, 0);
+    }
+
+    // 73 - 76        LString(4)      Segment identifier, left-justified.
+    // ignored
+
+    // 77 - 78        LString(2)      Element symbol, right-justified.
+    if (length >= 78) {
+        start = startPos + 76;
+        Tokenizer.trim(data, start, start + 2);
+
+        if (data.tokenStart < data.tokenEnd) {
+            TokenBuilder.add(sites.type_symbol, data.tokenStart, data.tokenEnd);
+        } else {
+            // "guess" the symbol
+            TokenBuilder.add(sites.type_symbol, startPos + 12, startPos + 13);
+        }
+    } else {
+        TokenBuilder.add(sites.type_symbol, startPos + 12, startPos + 13);
+    }
+
+    sites.label_entity_id[sites.index] = getEntityId(residueName, isHet);
+    sites.pdbx_PDB_model_num[sites.index] = model;
+
+    sites.index++;
+}
+
+type AtomSiteTemplate = typeof atom_site_template extends (...args: any) => infer T ? T : never
+
+async function pdbToMmCIF(pdb: PdbFile): Promise<CifFile> {
+    const { lines } = pdb;
+    const { data, indices } = lines;
+    const tokenizer = Tokenizer(data);
+
+    // Count the atoms
+    let atomCount = 0;
+    for (let i = 0, _i = lines.count; i < _i; i++) {
+        const s = indices[2 * i], e = indices[2 * i + 1];
+        switch (data[s]) {
+            case 'A':
+                if (substringStartsWith(data, s, e, 'ATOM  ')) atomCount++;
+                break;
+            case 'H':
+                if (substringStartsWith(data, s, e, 'HETATM')) atomCount++;
+                break;
+        }
+    }
+
+    const atom_site = atom_site_template(data, atomCount);
+
+    let modelNum = 0, modelStr = '';
+
+    for (let i = 0, _i = lines.count; i < _i; i++) {
+        const s = indices[2 * i], e = indices[2 * i + 1];
+        switch (data[s]) {
+            case 'A':
+                if (!substringStartsWith(data, s, e, 'ATOM  ')) continue;
+                if (!modelNum) { modelNum++; modelStr = '' + modelNum; }
+                addAtom(atom_site, modelStr, tokenizer, s, e, false);
+                break;
+            case 'H':
+                if (!substringStartsWith(data, s, e, 'HETATM')) continue;
+                if (!modelNum) { modelNum++; modelStr = '' + modelNum; }
+                addAtom(atom_site, modelStr, tokenizer, s, e, true);
+                break;
+            case 'M':
+                if (substringStartsWith(data, s, e, 'MODEL ')) {
+                    modelNum++;
+                    modelStr = '' + modelNum;
+                }
+                break;
+        }
+    }
+
+    const categories = {
+        entity: toCategory('entity', _entity(), 3),
+        atom_site: toCategory('atom_site', _atom_site(atom_site), atomCount)
+    }
+
+    return {
+        name: pdb.id,
+        blocks: [{
+            saveFrames: [],
+            header: pdb.id || 'PDB',
+            categoryNames: Object.keys(categories),
+            categories
+        }]
+    };
+}
+
+export function convertPDBtoMmCif(pdb: PdbFile): Task<CifFile> {
+    return Task.create('Convert PDB to mmCIF', async ctx => {
+        await ctx.update('Converting to mmCIF...');
+        return pdbToMmCIF(pdb);
+    });
+}
\ No newline at end of file
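Note: a sketch of the full conversion chain these two modules enable (hypothetical helper; assumes the plain `run()` entry point from mol-task). In the flattened patch, the insertion-code (column 27) and short-line B-factor branches wrote into `label_alt_id`; the reconstruction above targets `pdbx_PDB_ins_code` and `B_iso_or_equiv`, which is what `_atom_site` actually reads.

```ts
import { parsePDB } from 'mol-io/reader/pdb/parser';
import { convertPDBtoMmCif } from 'mol-io/reader/pdb/to-cif';
import { CifFile } from 'mol-io/reader/cif/data-model';

async function pdbStringToCifFile(data: string, id?: string): Promise<CifFile> {
    const parsed = await parsePDB(data, id).run();
    if (parsed.isError) throw new Error(parsed.toString());
    // Produces a single data block with synthetic 'entity' and 'atom_site' categories.
    return convertPDBtoMmCif(parsed.result).run();
}
```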
diff --git a/src/mol-io/reader/result.ts b/src/mol-io/reader/result.ts
index 4eb76dd373929b858e09e2de3d7f649abd078f86..255ae0c9eac4e20d0068d2c266db1fa47e696b8c 100644
--- a/src/mol-io/reader/result.ts
+++ b/src/mol-io/reader/result.ts
@@ -5,7 +5,7 @@
 * @author David Sehnal <david.sehnal@gmail.com>
 */
 
-type ReaderResult<T> = Success<T> | Error
+type ReaderResult<T> = ReaderResult.Success<T> | ReaderResult.Error
 
 namespace ReaderResult {
    export function error<T>(message: string, line = -1): ReaderResult<T> {
@@ -15,28 +15,28 @@ namespace ReaderResult {
    export function success<T>(result: T, warnings: string[] = []): ReaderResult<T> {
        return new Success<T>(result, warnings);
    }
-}
 
-export class Error {
-    isError: true = true;
+    export class Error {
+        isError: true = true;
 
-    toString() {
-        if (this.line >= 0) {
-            return `[Line ${this.line}] ${this.message}`;
+        toString() {
+            if (this.line >= 0) {
+                return `[Line ${this.line}] ${this.message}`;
+            }
+            return this.message;
        }
-        return this.message;
-    }
 
-    constructor(
-        public message: string,
-        public line: number) {
+        constructor(
+            public message: string,
+            public line: number) {
+        }
    }
-}
 
-export class Success<T> {
-    isError: false = false;
+    export class Success<T> {
+        isError: false = false;
 
-    constructor(public result: T, public warnings: string[]) { }
+        constructor(public result: T, public warnings: string[]) { }
+    }
 }
 
-export default ReaderResult
\ No newline at end of file
+export { ReaderResult }
\ No newline at end of file
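Note: moving `Success`/`Error` inside the namespace keeps the discriminated union and its constructors behind a single named export. A minimal sketch of a reader producing it (hypothetical `parseNumber`):

```ts
import { ReaderResult } from 'mol-io/reader/result';

function parseNumber(s: string, line: number): ReaderResult<number> {
    const v = Number(s);
    if (Number.isNaN(v)) return ReaderResult.error<number>(`'${s}' is not a number`, line);
    return ReaderResult.success(v);
}
```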
diff --git a/src/mol-model/structure/model/format.ts b/src/mol-model/structure/model/format.ts
index d2053f170306e4e704fc6399dda1f0914a276b8e..18f242c7caba2ca51dfb8e920141342a0afb5c33 100644
--- a/src/mol-model/structure/model/format.ts
+++ b/src/mol-model/structure/model/format.ts
@@ -7,6 +7,7 @@
 // import { File as GroFile } from 'mol-io/reader/gro/schema'
 import { mmCIF_Database } from 'mol-io/reader/cif/schema/mmcif'
 import CIF, { CifFrame } from 'mol-io/reader/cif';
+import { PdbFile } from 'mol-io/reader/pdb/schema';
 
 type Format =
    // | Format.gro
@@ -15,10 +16,10 @@ type Format =
 namespace Format {
    // export interface gro { kind: 'gro', data: GroFile }
    export interface mmCIF { kind: 'mmCIF', data: mmCIF_Database, frame: CifFrame }
+    export function mmCIF(frame: CifFrame, data?: mmCIF_Database): mmCIF { return { kind: 'mmCIF', data: data || CIF.schema.mmCIF(frame), frame }; }
 
-    export function mmCIF(frame: CifFrame, data?: mmCIF_Database): mmCIF {
-        return { kind: 'mmCIF', data: data || CIF.schema.mmCIF(frame), frame };
-    }
+    export interface PDB { kind: 'PDB', data: PdbFile }
+    export function PDB(data: PdbFile): PDB { return { kind: 'PDB', data }; }
 }
 
 export default Format
\ No newline at end of file
diff --git a/src/mol-model/structure/model/formats/pdb.ts b/src/mol-model/structure/model/formats/pdb.ts
new file mode 100644
index 0000000000000000000000000000000000000000..6ff81ec997b08de3118495b0167ca80b8a327a47
--- /dev/null
+++ b/src/mol-model/structure/model/formats/pdb.ts
@@ -0,0 +1,269 @@
+/**
+ * Copyright (c) 2019 mol* contributors, licensed under MIT, See LICENSE file for more info.
+ *
+ * @author David Sehnal <david.sehnal@gmail.com>
+ */
+
+import Format from '../format';
+import { Model } from '../model';
+import { Task } from 'mol-task';
+import { PdbFile } from 'mol-io/reader/pdb/schema';
+import from_mmCIF from './mmcif';
+import { mmCIF_Schema } from 'mol-io/reader/cif/schema/mmcif';
+import { substringStartsWith } from 'mol-util/string';
+import { TokenBuilder, Tokenizer } from 'mol-io/reader/common/text/tokenizer';
+import { CifField, CifCategory } from 'mol-io/reader/cif';
+import CifTextField, { CifTextValueField } from 'mol-io/reader/cif/text/field';
+
+function toCategory(name: string, fields: { [name: string]: CifField | undefined }, rowCount: number): CifCategory {
+    return {
+        name,
+        fieldNames: Object.keys(fields),
+        rowCount,
+        getField(f: string) {
+            return fields[f];
+        }
+    }
+}
+
+function _entity(): { [K in keyof mmCIF_Schema['entity']]?: CifField } {
+    return {
+        id: CifTextValueField(['1', '2', '3']),
+        type: CifTextValueField(['polymer', 'non-polymer', 'water'])
+    }
+}
+
+function atom_site_template(data: string, count: number) {
+    const str = () => [] as string[]; // grown by append in addAtom; new Array(count) would leave `count` holes before the first value
+    const ts = () => TokenBuilder.create(data, 2 * count);
+    return {
+        count,
+        group_PDB: ts(),
+        id: str(),
+        auth_atom_id: ts(),
+        label_alt_id: ts(),
+        auth_comp_id: ts(),
+        auth_asym_id: ts(),
+        auth_seq_id: ts(),
+        pdbx_PDB_ins_code: ts(),
+        Cartn_x: ts(),
+        Cartn_y: ts(),
+        Cartn_z: ts(),
+        occupancy: ts(),
+        B_iso_or_equiv: ts(),
+        type_symbol: ts(),
+        pdbx_PDB_model_num: str(),
+        label_entity_id: str()
+    };
+}
+
+function _atom_site(sites: AtomSiteTemplate): { [K in keyof mmCIF_Schema['atom_site']]?: CifField } {
+    const auth_asym_id = CifTextField(sites.auth_asym_id, sites.count);
+    const auth_atom_id = CifTextField(sites.auth_atom_id, sites.count);
+    const auth_comp_id = CifTextField(sites.auth_comp_id, sites.count);
+    const auth_seq_id = CifTextField(sites.auth_seq_id, sites.count);
+
+    return {
+        auth_asym_id,
+        auth_atom_id,
+        auth_comp_id,
+        auth_seq_id,
+        B_iso_or_equiv: CifTextField(sites.B_iso_or_equiv, sites.count),
+        Cartn_x: CifTextField(sites.Cartn_x, sites.count),
+        Cartn_y: CifTextField(sites.Cartn_y, sites.count),
+        Cartn_z: CifTextField(sites.Cartn_z, sites.count),
+        group_PDB: CifTextField(sites.group_PDB, sites.count),
+        id: CifTextValueField(sites.id),
+
+        label_alt_id: CifTextField(sites.label_alt_id, sites.count),
+
+        label_asym_id: auth_asym_id,
+        label_atom_id: auth_atom_id,
+        label_comp_id: auth_comp_id,
+        label_seq_id: auth_seq_id,
+        label_entity_id: CifTextValueField(sites.label_entity_id),
+
+        occupancy: CifTextField(sites.occupancy, sites.count),
+        type_symbol: CifTextField(sites.type_symbol, sites.count),
+
+        pdbx_PDB_ins_code: CifTextField(sites.pdbx_PDB_ins_code, sites.count),
+        pdbx_PDB_model_num: CifTextValueField(sites.pdbx_PDB_model_num)
+    };
+}
+
+function addAtom(sites: AtomSiteTemplate, model: string, data: Tokenizer, s: number, e: number) {
+    const { data: str } = data;
+    let startPos = s;
+    let start = s;
+    const end = e;
+    const length = end - start;
+
+    // TODO: filter invalid atoms
+
+    // COLUMNS        DATA TYPE       CONTENTS
+    // --------------------------------------------------------------------------------
+    //  1 -  6        Record name     "ATOM  "
+    Tokenizer.trim(data, start, start + 6);
+    TokenBuilder.add(sites.group_PDB, data.tokenStart, data.tokenEnd);
+
+    //  7 - 11        Integer         Atom serial number.
+    // TODO: support HEX
+    start = startPos + 6;
+    Tokenizer.trim(data, start, start + 5);
+    sites.id[sites.id.length] = data.data.substring(data.tokenStart, data.tokenEnd);
+
+    // 13 - 16        Atom            Atom name.
+    start = startPos + 12;
+    Tokenizer.trim(data, start, start + 4);
+    TokenBuilder.add(sites.auth_atom_id, data.tokenStart, data.tokenEnd);
+
+    // 17             Character       Alternate location indicator.
+    if (str.charCodeAt(startPos + 16) === 32) { // ' '
+        TokenBuilder.add(sites.label_alt_id, 0, 0);
+    } else {
+        TokenBuilder.add(sites.label_alt_id, startPos + 16, startPos + 17);
+    }
+
+    // 18 - 20        Residue name    Residue name.
+    start = startPos + 17;
+    Tokenizer.trim(data, start, start + 3);
+    TokenBuilder.add(sites.auth_comp_id, data.tokenStart, data.tokenEnd);
+
+    // 22             Character       Chain identifier.
+    TokenBuilder.add(sites.auth_asym_id, startPos + 21, startPos + 22);
+
+    // 23 - 26        Integer         Residue sequence number.
+    // TODO: support HEX
+    start = startPos + 22;
+    Tokenizer.trim(data, start, start + 4);
+    TokenBuilder.add(sites.auth_seq_id, data.tokenStart, data.tokenEnd);
+
+    // 27             AChar           Code for insertion of residues.
+    // (written to pdbx_PDB_ins_code; column 17 already filled label_alt_id)
+    if (str.charCodeAt(startPos + 26) === 32) { // ' '
+        TokenBuilder.add(sites.pdbx_PDB_ins_code, 0, 0);
+    } else {
+        TokenBuilder.add(sites.pdbx_PDB_ins_code, startPos + 26, startPos + 27);
+    }
+
+    // 31 - 38        Real(8.3)       Orthogonal coordinates for X in Angstroms.
+    start = startPos + 30;
+    Tokenizer.trim(data, start, start + 8);
+    TokenBuilder.add(sites.Cartn_x, data.tokenStart, data.tokenEnd);
+
+    // 39 - 46        Real(8.3)       Orthogonal coordinates for Y in Angstroms.
+    start = startPos + 38;
+    Tokenizer.trim(data, start, start + 8);
+    TokenBuilder.add(sites.Cartn_y, data.tokenStart, data.tokenEnd);
+
+    // 47 - 54        Real(8.3)       Orthogonal coordinates for Z in Angstroms.
+    start = startPos + 46;
+    Tokenizer.trim(data, start, start + 8);
+    TokenBuilder.add(sites.Cartn_z, data.tokenStart, data.tokenEnd);
+
+    // 55 - 60        Real(6.2)       Occupancy.
+    start = startPos + 54;
+    Tokenizer.trim(data, start, start + 6);
+    TokenBuilder.add(sites.occupancy, data.tokenStart, data.tokenEnd);
+
+    // 61 - 66        Real(6.2)       Temperature factor (Default = 0.0).
+    if (length >= 66) {
+        start = startPos + 60;
+        Tokenizer.trim(data, start, start + 6);
+        TokenBuilder.add(sites.B_iso_or_equiv, data.tokenStart, data.tokenEnd);
+    } else {
+        // short line: record an empty token for this column
+        TokenBuilder.add(sites.B_iso_or_equiv, 0, 0);
+    }
+
+    // 73 - 76        LString(4)      Segment identifier, left-justified.
+    // ignored
+
+    // 77 - 78        LString(2)      Element symbol, right-justified.
+    if (length >= 78) {
+        start = startPos + 76;
+        Tokenizer.trim(data, start, start + 2);
+
+        if (data.tokenStart < data.tokenEnd) {
+            TokenBuilder.add(sites.type_symbol, data.tokenStart, data.tokenEnd);
+        } else {
+            // "guess" the symbol
+            TokenBuilder.add(sites.type_symbol, startPos + 12, startPos + 13);
+        }
+    } else {
+        TokenBuilder.add(sites.type_symbol, startPos + 12, startPos + 13);
+    }
+
+    // TODO
+    sites.label_entity_id.push('1');
+    sites.pdbx_PDB_model_num.push(model);
+
+}
+
+type AtomSiteTemplate = typeof atom_site_template extends (...args: any) => infer T ? T : never
+
+async function pdbToMmCIF(pdb: PdbFile): Promise<Format.mmCIF> {
+    const { lines } = pdb;
+    const { data, indices } = lines;
+    const tokenizer = Tokenizer(data);
+
+    // Count the atoms
+    let atomCount = 0;
+    for (let i = 0, _i = lines.count; i < _i; i++) {
+        const s = indices[2 * i], e = indices[2 * i + 1];
+        switch (data[s]) {
+            case 'A':
+                if (substringStartsWith(data, s, e, 'ATOM  ')) atomCount++;
+                break;
+            case 'H':
+                if (substringStartsWith(data, s, e, 'HETATM')) atomCount++;
+                break;
+        }
+    }
+
+    const atom_site = atom_site_template(data, atomCount);
+
+    let modelNum = 0, modelStr = '';
+
+    for (let i = 0, _i = lines.count; i < _i; i++) {
+        const s = indices[2 * i], e = indices[2 * i + 1];
+        switch (data[s]) {
+            case 'A':
+                if (!substringStartsWith(data, s, e, 'ATOM  ')) continue;
+                if (!modelNum) { modelNum++; modelStr = '' + modelNum; }
+                addAtom(atom_site, modelStr, tokenizer, s, e);
+                break;
+            case 'H':
+                if (!substringStartsWith(data, s, e, 'HETATM')) continue;
+                if (!modelNum) { modelNum++; modelStr = '' + modelNum; }
+                addAtom(atom_site, modelStr, tokenizer, s, e);
+                break;
+            case 'M':
+                if (substringStartsWith(data, s, e, 'MODEL ')) {
+                    modelNum++;
+                    modelStr = '' + modelNum;
+                }
+                break;
+        }
+    }
+
+    const categories = {
+        entity: toCategory('entity', _entity(), 3),
+        atom_site: toCategory('atom_site', _atom_site(atom_site), atomCount)
+    }
+
+    return Format.mmCIF({
+        header: pdb.id || 'PDB',
+        categoryNames: Object.keys(categories),
+        categories
+    });
+}
+
+function buildModels(format: Format.PDB): Task<ReadonlyArray<Model>> {
+    return Task.create('Create PDB Model', async ctx => {
+        await ctx.update('Converting to mmCIF...');
+        const cif = await pdbToMmCIF(format.data);
+        return from_mmCIF(cif).runInContext(ctx);
+    });
+}
+
+export default buildModels;
\ No newline at end of file
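Note: this module largely mirrors reader/pdb/to-cif.ts but routes the converted frame through `Format.mmCIF`, so the existing mmCIF model builder does the rest. The flattened patch counted HETATM records with an inverted test (`!substringStartsWith(...)`); the reconstruction above counts them directly. Hypothetical usage (assuming `Task.run()`):

```ts
import buildModels from 'mol-model/structure/model/formats/pdb';
import Format from 'mol-model/structure/model/format';
import { parsePDB } from 'mol-io/reader/pdb/parser';

async function modelsFromPdbString(data: string) {
    const parsed = await parsePDB(data).run();
    if (parsed.isError) throw new Error(parsed.toString());
    return buildModels(Format.PDB(parsed.result)).run(); // ReadonlyArray<Model>
}
```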
diff --git a/src/mol-plugin/state/actions/basic.ts b/src/mol-plugin/state/actions/basic.ts
index 68bef84e34dfa245565b30aa2d25e2c4ff484a9f..48c855fbcc57c4d8e3892d0715c743aee30c30b6 100644
--- a/src/mol-plugin/state/actions/basic.ts
+++ b/src/mol-plugin/state/actions/basic.ts
@@ -41,6 +41,7 @@ const DownloadStructure = StateAction.build({
        }, { isFlat: true }),
        'url': PD.Group({
            url: PD.Text(''),
+            format: PD.Select('cif', [['cif', 'CIF'], ['pdb', 'PDB']]),
            isBinary: PD.Boolean(false),
            supportProps: PD.Boolean(false)
        }, { isFlat: true })
@@ -60,7 +61,7 @@ const DownloadStructure = StateAction.build({
 
    switch (src.name) {
        case 'url':
-            downloadParams = src.params;
+            downloadParams = { url: src.params.url, isBinary: src.params.isBinary };
            break;
        case 'pdbe-updated':
            downloadParams = { url: `https://www.ebi.ac.uk/pdbe/static/entry/${src.params.id.toLowerCase()}_updated.cif`, isBinary: false, label: `PDBe: ${src.params.id}` };
@@ -75,7 +76,8 @@ const DownloadStructure = StateAction.build({
    }
 
    const data = b.toRoot().apply(StateTransforms.Data.Download, downloadParams);
-    return state.updateTree(createStructureTree(ctx, data, params.source.params.supportProps));
+    const traj = createModelTree(data, src.name === 'url' ? src.params.format : 'cif');
+    return state.updateTree(createStructureTree(ctx, traj, params.source.params.supportProps));
 });
 
 export const OpenStructure = StateAction.build({
@@ -85,15 +87,20 @@ export const OpenStructure = StateAction.build({
 })(({ params, state }, ctx: PluginContext) => {
    const b = state.build();
    const data = b.toRoot().apply(StateTransforms.Data.ReadFile, { file: params.file, isBinary: /\.bcif$/i.test(params.file.name) });
-    return state.updateTree(createStructureTree(ctx, data, false));
+    const traj = createModelTree(data, 'cif');
+    return state.updateTree(createStructureTree(ctx, traj, false));
 });
 
-function createStructureTree(ctx: PluginContext, b: StateTreeBuilder.To<PluginStateObject.Data.Binary | PluginStateObject.Data.String>, supportProps: boolean): StateTree {
-    let root = b
-        .apply(StateTransforms.Data.ParseCif)
-        .apply(StateTransforms.Model.TrajectoryFromMmCif)
-        .apply(StateTransforms.Model.ModelFromTrajectory, { modelIndex: 0 });
+function createModelTree(b: StateTreeBuilder.To<PluginStateObject.Data.Binary | PluginStateObject.Data.String>, format: 'pdb' | 'cif' = 'cif') {
+    const parsed = format === 'cif'
+        ? b.apply(StateTransforms.Data.ParseCif).apply(StateTransforms.Model.TrajectoryFromMmCif)
+        : b.apply(StateTransforms.Data.ConvertPDBtoMmCif).apply(StateTransforms.Model.TrajectoryFromMmCif);
+
+    return parsed.apply(StateTransforms.Model.ModelFromTrajectory, { modelIndex: 0 });
+}
+
+function createStructureTree(ctx: PluginContext, b: StateTreeBuilder.To<PluginStateObject.Molecule.Model>, supportProps: boolean): StateTree {
+    let root = b;
 
    if (supportProps) {
        root = root.apply(StateTransforms.Model.CustomModelProperties);
    }
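Note: with the parse branch factored into `createModelTree`, both formats converge on `TrajectoryFromMmCif`, so everything downstream of the model is format-agnostic. A hypothetical parameter payload for the new 'url' source (the URL is an example only):

```ts
const downloadPdbParams = {
    source: {
        name: 'url' as const,
        params: {
            url: 'https://files.rcsb.org/download/1CRN.pdb', // example URL
            format: 'pdb' as const,
            isBinary: false,
            supportProps: false
        }
    }
};
```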
diff --git a/src/mol-plugin/state/transforms/data.ts b/src/mol-plugin/state/transforms/data.ts
index 59ed1b3a0d9e8e3eb3fb6fb615a4a3bfbb18b017..c8395300b2d6b25a0e000220390ef33b51b0949f 100644
--- a/src/mol-plugin/state/transforms/data.ts
+++ b/src/mol-plugin/state/transforms/data.ts
@@ -15,6 +15,8 @@ import { Transformer } from 'mol-state';
 import { readFromFile } from 'mol-util/data-source';
 import * as CCP4 from 'mol-io/reader/ccp4/parser'
 import * as DSN6 from 'mol-io/reader/dsn6/parser'
+import { parsePDB } from 'mol-io/reader/pdb/parser';
+import { convertPDBtoMmCif } from 'mol-io/reader/pdb/to-cif';
 
 export { Download }
 type Download = typeof Download
@@ -95,6 +97,24 @@ const ParseCif = PluginStateTransform.BuiltIn({
    }
 });
 
+export { ConvertPDBtoMmCif }
+type ConvertPDBtoMmCif = typeof ConvertPDBtoMmCif
+const ConvertPDBtoMmCif = PluginStateTransform.BuiltIn({
+    name: 'convert-pdb-to-mmcif',
+    display: { name: 'Convert PDB string to mmCIF' },
+    from: [SO.Data.String],
+    to: SO.Format.Cif
+})({
+    apply({ a }) {
+        return Task.create('Convert PDB to mmCIF', async ctx => {
+            const parsed = await parsePDB(a.data).runInContext(ctx);
+            if (parsed.isError) throw new Error(parsed.message);
+            const cif = await convertPDBtoMmCif(parsed.result).runInContext(ctx);
+            return new SO.Format.Cif(cif);
+        });
+    }
+});
+
 export { ParseCcp4 }
 type ParseCcp4 = typeof ParseCcp4
 const ParseCcp4 = PluginStateTransform.BuiltIn({
diff --git a/src/mol-util/string.ts b/src/mol-util/string.ts
index f2e8f5958ba30fd56124d1844cb478f45e1caa14..9e33986c6bf8f4c11fc773cb6e068df977b9dbaf 100644
--- a/src/mol-util/string.ts
+++ b/src/mol-util/string.ts
@@ -37,4 +37,13 @@ export function snakeCaseToWords(str: string) {
 
 export function stringToWords(str: string) {
    return capitalize(splitCamelCase(splitSnakeCase(str)))
+}
+
+export function substringStartsWith(str: string, start: number, end: number, target: string) {
+    const len = target.length;
+    if (len > end - start) return false;
+    for (let i = 0; i < len; i++) {
+        if (str.charCodeAt(start + i) !== target.charCodeAt(i)) return false;
+    }
+    return true;
 }
\ No newline at end of file
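Note: `substringStartsWith` exists so the record-name tests in the PDB hot loop can compare char codes in place instead of allocating a substring per line. A quick sketch:

```ts
import { substringStartsWith } from 'mol-util/string';

const data = 'HETATM 2191  O1  NAG A 477\nATOM      1  N   MET A   1';
// Test a record name in place, without slicing the line out of the buffer.
console.log(substringStartsWith(data, 0, 26, 'HETATM')); // true
console.log(substringStartsWith(data, 0, 26, 'ATOM  ')); // false
```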