Commit 06b9c5f2 authored by dsehnal

change SDF data header parsing

- do not trim the '<>' around the field name
- store the whole line starting with '> ' as the data header (without the leading '> ')
parent e03b689f
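
In short, the data header is now the verbatim remainder of any line that begins with '> '; per the SDF/CTfile convention that line may carry more than just the bracketed field name (for example a registry number, as the '> 5225 <TEST_FIELD>' test case below exercises). The following is a minimal, self-contained sketch of the old vs. new behaviour; the helper names are made up for illustration, and the real parser works on Tokenizer offsets rather than substrings.

```typescript
// Illustration only: how a potential data-header line is interpreted
// before and after this commit. Helper names are hypothetical.

// Old behaviour: only '> <NAME>' lines were recognised and the
// surrounding '<' and '>' were trimmed away.
function parseHeaderOld(line: string): string | undefined {
    if (!line.startsWith('> <')) return undefined;
    return line.substring(3, line.length - 1);
}

// New behaviour: any line starting with '> ' is a data header and
// everything after the leading '> ' is kept verbatim.
function parseHeaderNew(line: string): string | undefined {
    if (!line.startsWith('> ')) return undefined;
    return line.substring(2);
}

// parseHeaderOld('> <DATABASE_ID>')     -> 'DATABASE_ID'
// parseHeaderNew('> <DATABASE_ID>')     -> '<DATABASE_ID>'
// parseHeaderOld('> 5225 <TEST_FIELD>') -> undefined (line was not recognised as a header)
// parseHeaderNew('> 5225 <TEST_FIELD>') -> '5225 <TEST_FIELD>'
```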
@@ -7,6 +7,7 @@ Note that since we don't clearly distinguish between a public and private interf
 ## [Unreleased]
 - Add `tubularHelices` parameter to Cartoon representation
+- Add `SdfFormat` and update SDF parser to be able to parse data headers according to spec (hopefully :)) #230

 ## [v2.1.0] - 2021-07-05
@@ -22,8 +22,8 @@ M END
 > <DATABASE_NAME>
 drugbank

-> <SMILES>
-[O-]P([O-])([O-])=O
+> 5225 <TEST_FIELD>
+whatever

 > <INCHI_IDENTIFIER>
 InChI=1S/H3O4P/c1-5(2,3)4/h(H3,1,2,3,4)/p-3
@@ -362,22 +362,25 @@ describe('sdf reader', () => {
         expect(bonds.atomIdxB.value(3)).toBe(5);
         expect(bonds.order.value(3)).toBe(1);

-        expect(dataItems.dataHeader.value(0)).toBe('DATABASE_ID');
+        expect(dataItems.dataHeader.value(0)).toBe('<DATABASE_ID>');
         expect(dataItems.data.value(0)).toBe('0');

-        expect(dataItems.dataHeader.value(1)).toBe('DATABASE_NAME');
+        expect(dataItems.dataHeader.value(1)).toBe('<DATABASE_NAME>');
         expect(dataItems.data.value(1)).toBe('drugbank');

-        expect(dataItems.dataHeader.value(31)).toBe('SYNONYMS');
+        expect(dataItems.dataHeader.value(2)).toBe('5225 <TEST_FIELD>');
+        expect(dataItems.data.value(2)).toBe('whatever');
+
+        expect(dataItems.dataHeader.value(31)).toBe('<SYNONYMS>');
         expect(dataItems.data.value(31)).toBe('Orthophosphate; Phosphate');

         expect(compound1.dataItems.data.value(0)).toBe('0');
         expect(compound2.dataItems.data.value(0)).toBe('1');

-        expect(compound3.dataItems.dataHeader.value(2)).toBe('PUBCHEM_CONFORMER_DIVERSEORDER');
+        expect(compound3.dataItems.dataHeader.value(2)).toBe('<PUBCHEM_CONFORMER_DIVERSEORDER>');
         expect(compound3.dataItems.data.value(2)).toBe('1\n11\n10\n3\n15\n17\n13\n5\n16\n7\n14\n9\n8\n4\n18\n6\n12\n2');

-        expect(compound3.dataItems.dataHeader.value(21)).toBe('PUBCHEM_COORDINATE_TYPE');
+        expect(compound3.dataItems.dataHeader.value(21)).toBe('<PUBCHEM_COORDINATE_TYPE>');
         expect(compound3.dataItems.data.value(21)).toBe('2\n5\n10');
     });
 });
@@ -26,6 +26,7 @@ export interface SdfFile {
     readonly compounds: SdfFileCompound[]
 }

 const delimiter = '$$$$';
 function handleDataItems(tokenizer: Tokenizer): { dataHeader: Column<string>, data: Column<string> } {
     const dataHeader = TokenBuilder.create(tokenizer.data, 32);
@@ -36,8 +37,8 @@ function handleDataItems(tokenizer: Tokenizer): { dataHeader: Column<string>, da
         if (line.startsWith(delimiter)) break;
         if (!line) continue;

-        if (line.startsWith('> <')) {
-            TokenBuilder.add(dataHeader, tokenizer.tokenStart + 3, tokenizer.tokenEnd - 1);
+        if (line.startsWith('> ')) {
+            TokenBuilder.add(dataHeader, tokenizer.tokenStart + 2, tokenizer.tokenEnd);

             Tokenizer.markLine(tokenizer);
             const start = tokenizer.tokenStart;
@@ -45,7 +46,7 @@ function handleDataItems(tokenizer: Tokenizer): { dataHeader: Column<string>, da
             let added = false;
             while (tokenizer.position < tokenizer.length) {
                 const line2 = Tokenizer.readLine(tokenizer);
-                if (!line2 || line2.startsWith(delimiter) || line2.startsWith('> <')) {
+                if (!line2 || line2.startsWith(delimiter) || line2.startsWith('> ')) {
                     TokenBuilder.add(data, start, end);
                     added = true;
                     break;
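
Read together, the hunks above give the following control flow for scanning a compound's data block. This is a simplified, self-contained sketch that uses plain string handling instead of Mol*'s Tokenizer/TokenBuilder; the function and type names are made up for illustration and are not part of the actual parser.

```typescript
// Simplified, illustrative sketch of the data-item scan shown in the diff above.
const delimiter = '$$$$';

interface DataItem { dataHeader: string, data: string }

function scanDataItems(lines: string[]): DataItem[] {
    const items: DataItem[] = [];
    let i = 0;
    while (i < lines.length) {
        const line = lines[i++];
        if (line.startsWith(delimiter)) break;
        if (!line) continue;
        // New behaviour: any '> ' line starts a data item; the header is the
        // rest of the line, kept verbatim (angle brackets and all).
        if (line.startsWith('> ')) {
            const dataHeader = line.substring(2);
            const value: string[] = [];
            // The value runs until a blank line, the '$$$$' delimiter,
            // or the next '> ' header.
            while (i < lines.length) {
                const next = lines[i];
                if (!next || next.startsWith(delimiter) || next.startsWith('> ')) break;
                value.push(next);
                i++;
            }
            items.push({ dataHeader, data: value.join('\n') });
        }
    }
    return items;
}

// scanDataItems(['> <DATABASE_ID>', '0', '', '> 5225 <TEST_FIELD>', 'whatever', '$$$$'])
// -> [{ dataHeader: '<DATABASE_ID>', data: '0' },
//     { dataHeader: '5225 <TEST_FIELD>', data: 'whatever' }]
```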