Skip to content
Snippets Groups Projects
Commit 0e86ac54 authored by David Sehnal's avatar David Sehnal
Browse files

cif2bcif column classifier

parent 31c3955f
No related branches found
No related tags found
No related merge requests found
......@@ -8,36 +8,65 @@ import CIF, { CifCategory } from 'mol-io/reader/cif'
import { CifWriter } from 'mol-io/writer/cif'
import * as fs from 'fs'
import classify from './field-classifier'
import { Progress, Task, RuntimeContext } from 'mol-task';
async function getCIF(path: string) {
/**
 * Task progress observer: redraws the current progress on a single console line.
 * The line is first overwritten with 79 spaces so leftovers of a longer previous
 * message do not remain visible, then the formatted progress is written from the
 * start of the line again.
 */
function showProgress(p: Progress) {
    const blankLine = ' '.repeat(79);
    const out = process.stdout;
    out.write('\r' + blankLine);
    out.write('\r' + Progress.format(p));
}
async function getCIF(ctx: RuntimeContext, path: string) {
const str = fs.readFileSync(path, 'utf8');
const parsed = await CIF.parseText(str).run();
const parsed = await CIF.parseText(str).runInContext(ctx);
if (parsed.isError) {
throw new Error(parsed.toString());
}
return parsed.result;
}
function getCategoryInstanceProvider(cat: CifCategory): CifWriter.Category.Provider {
function getCategoryInstanceProvider(cat: CifCategory, fields: CifWriter.Field[]): CifWriter.Category.Provider {
return function (ctx: any) {
return {
data: cat,
name: cat.name,
fields: cat.fieldNames.map(n => classify(n, cat.getField(n)!)),
fields,
rowCount: cat.rowCount
};
}
}
export default async function convert(path: string, asText = false) {
const cif = await getCIF(path);
export default function convert(path: string, asText = false) {
return Task.create<Uint8Array>('BinaryCIF', async ctx => {
const cif = await getCIF(ctx, path);
const encoder = CifWriter.createEncoder({ binary: !asText, encoderName: 'mol* cif2bcif' });
for (const b of cif.blocks) {
encoder.startDataBlock(b.header);
for (const c of b.categoryNames) {
encoder.writeCategory(getCategoryInstanceProvider(b.categories[c]));
const encoder = CifWriter.createEncoder({ binary: !asText, encoderName: 'mol* cif2bcif' });
let maxProgress = 0;
for (const b of cif.blocks) {
maxProgress += b.categoryNames.length;
for (const c of b.categoryNames) maxProgress += b.categories[c].fieldNames.length;
}
}
return encoder.getData();
}
let current = 0;
for (const b of cif.blocks) {
encoder.startDataBlock(b.header);
for (const c of b.categoryNames) {
const cat = b.categories[c];
const fields: CifWriter.Field[] = [];
for (const f of cat.fieldNames) {
fields.push(classify(f, cat.getField(f)!))
current++;
if (ctx.shouldUpdate) await ctx.update({ message: 'Encoding...', current, max: maxProgress });
}
encoder.writeCategory(getCategoryInstanceProvider(b.categories[c], fields));
current++;
if (ctx.shouldUpdate) await ctx.update({ message: 'Encoding...', current, max: maxProgress });
}
}
await ctx.update('Exporting...');
const ret = encoder.getData() as Uint8Array;
await ctx.update('Done.');
return ret;
}).run(showProgress, 250);
}
\ No newline at end of file
......@@ -7,9 +7,172 @@
import { Column } from 'mol-data/db'
import { CifField } from 'mol-io/reader/cif/data-model'
import { CifWriter } from 'mol-io/writer/cif'
import { ArrayEncoder, ArrayEncoding as E } from 'mol-io/common/binary-cif';
/**
 * Estimates the encoded size of an integer column under several candidate
 * encoding chains and picks the smallest one.
 */
namespace IntClassifier {
    /** Candidate encoding chains considered for an integer column. */
    type EncodingKind = 'pack' | 'rle' | 'delta' | 'delta-rle';
    /** Estimated size in bytes (`length`), element width in bytes (`elem`) and the chain it belongs to. */
    type EncodingSize = { length: number, elem: number, kind: EncodingKind };

    /**
     * Number of packed elements needed to store `value` when one element covers
     * the range [-upperLimit - 1, upperLimit] and a value equal to a limit needs
     * one more element: floor(|value| / |limit|) + 1.
     * NOTE(review): presumably mirrors the integerPacking encoder - confirm.
     */
    function packSize(value: number, upperLimit: number) {
        return value >= 0
            ? Math.ceil((value + 1) / upperLimit)
            // (value - 1), not (value + 1): otherwise -1 costs 0 elements and
            // values at the lower limit are under-counted.
            : Math.ceil((value - 1) / (-upperLimit - 1));
    }

    type IntColumnInfo = { signed: boolean, limit8: number, limit16: number };

    /** Picks signed or unsigned 8/16-bit upper limits depending on whether any value is negative. */
    function getInfo(data: number[]): IntColumnInfo {
        let signed = false;
        for (let i = 0, n = data.length; i < n; i++) {
            if (data[i] < 0) {
                signed = true;
                break;
            }
        }
        return signed ? { signed, limit8: 0x7F, limit16: 0x7FFF } : { signed, limit8: 0xFF, limit16: 0xFFFF };
    }

    /** Running totals: element counts when packed to 8 / 16 bits and the unpacked element count. */
    type SizeInfo = { pack8: number, pack16: number, count: number }
    function SizeInfo(): SizeInfo { return { pack8: 0, pack16: 0, count: 0 }; }

    /** Adds one value to the totals using the column's own (signed or unsigned) limits. */
    function incSize({ limit8, limit16 }: IntColumnInfo, info: SizeInfo, value: number) {
        info.pack8 += packSize(value, limit8);
        info.pack16 += packSize(value, limit16);
        info.count += 1;
    }

    /** Adds one value to the totals using signed limits (used for deltas, which may be negative). */
    function incSizeSigned(info: SizeInfo, value: number) {
        info.pack8 += packSize(value, 0x7F);
        info.pack16 += packSize(value, 0x7FFF);
        info.count += 1;
    }

    /** Chooses the cheapest of 4-byte unpacked, 2-byte packed and 1-byte packed storage. */
    function byteSize(info: SizeInfo) {
        if (info.count * 4 < info.pack16 * 2) return { length: info.count * 4, elem: 4 };
        if (info.pack16 * 2 < info.pack8) return { length: info.pack16 * 2, elem: 2 };
        return { length: info.pack8, elem: 1 };
    }

    /** Size of the raw values with integer packing only. */
    function packingSize(data: number[], info: IntColumnInfo): EncodingSize {
        const size = SizeInfo();
        for (let i = 0, n = data.length; i < n; i++) {
            incSize(info, size, data[i]);
        }
        return { ...byteSize(size), kind: 'pack' };
    }

    /** Size of the successive differences; the first element counts as a zero delta. */
    function deltaSize(data: number[], info: IntColumnInfo): EncodingSize {
        const size = SizeInfo();
        // Count the leading zero delta so the estimate agrees with deltaRleSize,
        // which also accounts for it.
        if (data.length > 0) incSizeSigned(size, 0);
        for (let i = 1, n = data.length; i < n; i++) {
            incSizeSigned(size, data[i] - data[i - 1]);
        }
        return { ...byteSize(size), kind: 'delta' };
    }

    /** Size of the (value, run length) pairs of the raw values. */
    function rleSize(data: number[], info: IntColumnInfo): EncodingSize {
        const size = SizeInfo();
        // Nothing to encode; avoids reading data[-1] below.
        if (data.length === 0) return { ...byteSize(size), kind: 'rle' };
        let run = 1;
        for (let i = 1, n = data.length; i < n; i++) {
            if (data[i - 1] !== data[i]) {
                incSize(info, size, data[i - 1]);
                incSize(info, size, run);
                run = 1;
            } else {
                run++;
            }
        }
        incSize(info, size, data[data.length - 1]);
        incSize(info, size, run);
        return { ...byteSize(size), kind: 'rle' };
    }

    /** Size of the (delta, run length) pairs of the successive differences. */
    function deltaRleSize(data: number[], info: IntColumnInfo): EncodingSize {
        const size = SizeInfo();
        if (data.length === 0) return { ...byteSize(size), kind: 'delta-rle' };
        // The first delta is 0 (prevValue); differences are taken relative to
        // data[0], not to 0, so the second element is a real delta.
        let run = 1, prev = data[0], prevValue = 0;
        for (let i = 1, n = data.length; i < n; i++) {
            const v = data[i] - prev;
            if (prevValue !== v) {
                incSizeSigned(size, prevValue);
                incSizeSigned(size, run);
                run = 1;
            } else {
                run++;
            }
            prevValue = v;
            prev = data[i];
        }
        incSizeSigned(size, prevValue);
        incSizeSigned(size, run);
        return { ...byteSize(size), kind: 'delta-rle' };
    }

    /**
     * Estimates the size of `data` under every candidate chain.
     * @returns the estimates sorted by ascending byte length (smallest first).
     */
    export function getSize(data: number[]) {
        const info = getInfo(data);
        const sizes = [packingSize(data, info), rleSize(data, info), deltaSize(data, info), deltaRleSize(data, info)];
        sizes.sort((a, b) => a.length - b.length);
        return sizes;
    }

    /**
     * Selects the encoder chain with the smallest estimated size.
     * Columns with fewer than two values are stored as a plain byte array.
     * @param data integer column values
     * @param name column name (only useful for debug output)
     */
    export function classify(data: number[], name: string): ArrayEncoder {
        if (data.length < 2) return E.by(E.byteArray);

        const sizes = getSize(data);
        const size = sizes[0];
        // console.log(`${name}: ${sizes.map(s => `${s.kind}: ${s.length}b`).join(' | ')}`);

        switch (size.kind) {
            case 'pack': return E.by(E.integerPacking);
            case 'rle': return E.by(E.runLength).and(E.integerPacking);
            case 'delta': return E.by(E.delta).and(E.integerPacking);
            case 'delta-rle': return E.by(E.delta).and(E.runLength).and(E.integerPacking);
        }

        throw new Error('bug');
    }
}
/**
 * Chooses an encoding for a float column: fixed-point (scaled to integers and
 * then encoded like an integer column) when every value has few enough decimal
 * digits, otherwise a plain 64-bit byte array.
 */
namespace FloatClassifier {
    /** Tolerance used when testing whether a value is representable with a given scale. */
    const delta = 1e-6;
    /** Scale at (or above) which fixed-point encoding is not attempted. */
    const maxScale = 10000;
    const int32Min = -0x80000000, int32Max = 0x7FFFFFFF;

    /**
     * Returns the smallest power of ten `m` (1 .. 10000) such that
     * `round(m * v) / m` is within `delta` of `v`, or 10000 if none is.
     */
    function digitCount(v: number) {
        let m = 1;
        for (let i = 0; i < 5; i++) {
            const r = Math.round(m * v) / m;
            if (Math.abs(v - r) < delta) return m;
            m *= 10;
        }
        return maxScale;
    }

    /** Fallback: store the values unmodified as 64-bit floats. */
    function plain() {
        return { encoder: E.by(E.byteArray), typedArray: Float64Array };
    }

    /**
     * Selects the encoder chain and typed array for a float column.
     * @param data float column values
     * @param name column name (only useful for debug output)
     */
    export function classify(data: number[], name: string) {
        // The scale never drops below 10, even for integer-valued data.
        let dc = 10;
        for (let i = 0, n = data.length; i < n; i++) {
            dc = Math.max(dc, digitCount(data[i]));
            // The answer cannot change any more; skip the rest of the column.
            if (dc >= maxScale) break;
        }

        if (dc >= maxScale) return plain();

        const intArray = new Int32Array(data.length);
        for (let i = 0, n = data.length; i < n; i++) {
            // Round (as digitCount does) instead of relying on the truncating
            // Int32Array conversion, e.g. 0.29 * 100 === 28.999999999999996.
            const scaled = Math.round(data[i] * dc);
            // A scaled value outside the int32 range would wrap around silently,
            // so fixed-point encoding is not usable for this column.
            if (scaled < int32Min || scaled > int32Max) return plain();
            intArray[i] = scaled;
        }

        // getSize only reads indices and length, so a typed array is acceptable here.
        const sizes = IntClassifier.getSize(intArray as any);
        const size = sizes[0];
        // console.log(`${name}: ${sizes.map(s => `${s.kind}: ${s.length}b`).join(' | ')} x${dc}`);

        switch (size.kind) {
            case 'pack': return { encoder: E.by(E.fixedPoint(dc)).and(E.integerPacking), typedArray: Float32Array };
            case 'rle': return { encoder: E.by(E.fixedPoint(dc)).and(E.runLength).and(E.integerPacking), typedArray: Float32Array };
            case 'delta': return { encoder: E.by(E.fixedPoint(dc)).and(E.delta).and(E.integerPacking), typedArray: Float32Array };
            case 'delta-rle': return { encoder: E.by(E.fixedPoint(dc)).and(E.delta).and(E.runLength).and(E.integerPacking), typedArray: Float32Array };
        }

        throw new Error('bug');
    }
}
const intRegex = /^-?\d+$/
const floatRegex = /^-?(([0-9]+)[.]?|([0-9]*[.][0-9]+))([(][0-9]+[)])?([eE][+-]?[0-9]+)?/
const floatRegex = /^-?(([0-9]+)[.]?|([0-9]*[.][0-9]+))([(][0-9]+[)])?([eE][+-]?[0-9]+)?$/
// Classify a cif field as str, int or float based on the data it contains.
// To classify a field as int or float all items are checked.
......@@ -25,8 +188,13 @@ function classify(name: string, field: CifField): CifWriter.Field {
}
if (hasString) return { name, type: CifWriter.Field.Type.Str, value: field.str, valueKind: field.valueKind };
if (floatCount > 0) return { name, type: CifWriter.Field.Type.Float, value: field.float, valueKind: field.valueKind };
return { name, type: CifWriter.Field.Type.Int, value: field.int, valueKind: field.valueKind };
if (floatCount > 0) {
const { encoder, typedArray } = FloatClassifier.classify(field.toFloatArray({ array: Float64Array }) as number[], name)
return CifWriter.Field.float(name, field.float, { valueKind: field.valueKind, encoder, typedArray });
} else {
const encoder = IntClassifier.classify(field.toIntArray({ array: Int32Array }) as number[], name);
return CifWriter.Field.int(name, field.int, { valueKind: field.valueKind, encoder, typedArray: Int32Array });
}
}
export default classify;
\ No newline at end of file
......@@ -174,6 +174,7 @@ class ObservableRuntimeContext implements RuntimeContext {
const progress = this.node.progress;
if (typeof update === 'string') {
progress.message = update;
progress.isIndeterminate = true;
} else {
if (typeof update.canAbort !== 'undefined') progress.canAbort = update.canAbort;
if (typeof update.message !== 'undefined') progress.message = update.message;
......@@ -193,7 +194,7 @@ class ObservableRuntimeContext implements RuntimeContext {
this.lastUpdatedTime = now();
this.updateProgress(progress);
if (!!dontNotify || !shouldNotify(this.info, this.lastUpdatedTime)) return;
if (!!dontNotify /*|| !shouldNotify(this.info, this.lastUpdatedTime)*/) return;
notifyObserver(this.info, this.lastUpdatedTime);
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment