Skip to content
Snippets Groups Projects
Commit e9b57bb8 authored by Alexander Rose's avatar Alexander Rose
Browse files

improved schema generation from mmcif dic

parent 75e1dac4
No related branches found
No related tags found
No related merge requests found
pdbx_reference_molecule.prd_id
pdbx_reference_molecule.name
pdbx_reference_molecule.represent_as
pdbx_reference_molecule.type
pdbx_reference_molecule.type_evidence_code
pdbx_reference_molecule.class
pdbx_reference_molecule.class_evidence_code
pdbx_reference_molecule.formula
pdbx_reference_molecule.chem_comp_id
pdbx_reference_molecule.formula_weight
pdbx_reference_molecule.release_status
pdbx_reference_molecule.replaces
pdbx_reference_molecule.replaced_by
pdbx_reference_molecule.compound_detail
pdbx_reference_molecule.description
pdbx_reference_molecule.representative_PDB_id_code
pdbx_reference_entity_list.prd_id
pdbx_reference_entity_list.ref_entity_id
pdbx_reference_entity_list.component_id
pdbx_reference_entity_list.type
pdbx_reference_entity_list.details
pdbx_reference_entity_nonpoly.prd_id
pdbx_reference_entity_nonpoly.ref_entity_id
pdbx_reference_entity_nonpoly.name
pdbx_reference_entity_nonpoly.chem_comp_id
pdbx_reference_entity_link.prd_id
pdbx_reference_entity_link.link_id
pdbx_reference_entity_link.link_class
pdbx_reference_entity_link.ref_entity_id_1
pdbx_reference_entity_link.entity_seq_num_1
pdbx_reference_entity_link.comp_id_1
pdbx_reference_entity_link.atom_id_1
pdbx_reference_entity_link.ref_entity_id_2
pdbx_reference_entity_link.entity_seq_num_2
pdbx_reference_entity_link.comp_id_2
pdbx_reference_entity_link.atom_id_2
pdbx_reference_entity_link.value_order
pdbx_reference_entity_link.component_1
pdbx_reference_entity_link.component_2
pdbx_reference_entity_link.details
pdbx_reference_entity_poly_link.prd_id
pdbx_reference_entity_poly_link.ref_entity_id
pdbx_reference_entity_poly_link.link_id
pdbx_reference_entity_poly_link.atom_id_1
pdbx_reference_entity_poly_link.comp_id_1
pdbx_reference_entity_poly_link.entity_seq_num_1
pdbx_reference_entity_poly_link.atom_id_2
pdbx_reference_entity_poly_link.comp_id_2
pdbx_reference_entity_poly_link.entity_seq_num_2
pdbx_reference_entity_poly_link.value_order
pdbx_reference_entity_poly_link.component_id
pdbx_reference_entity_poly.prd_id
pdbx_reference_entity_poly.ref_entity_id
pdbx_reference_entity_poly.db_code
pdbx_reference_entity_poly.db_name
pdbx_reference_entity_poly.type
pdbx_reference_entity_sequence.prd_id
pdbx_reference_entity_sequence.ref_entity_id
pdbx_reference_entity_sequence.type
pdbx_reference_entity_sequence.NRP_flag
pdbx_reference_entity_sequence.one_letter_codes
pdbx_reference_entity_poly_seq.prd_id
pdbx_reference_entity_poly_seq.ref_entity_id
pdbx_reference_entity_poly_seq.num
pdbx_reference_entity_poly_seq.mon_id
pdbx_reference_entity_poly_seq.parent_mon_id
pdbx_reference_entity_poly_seq.hetero
pdbx_reference_entity_poly_seq.observed
pdbx_reference_entity_src_nat.prd_id
pdbx_reference_entity_src_nat.ref_entity_id
pdbx_reference_entity_src_nat.ordinal
pdbx_reference_entity_src_nat.taxid
pdbx_reference_entity_src_nat.organism_scientific
pdbx_reference_entity_src_nat.db_code
pdbx_reference_entity_src_nat.db_name
pdbx_prd_audit.prd_id
pdbx_prd_audit.date
pdbx_prd_audit.processing_site
pdbx_prd_audit.action_type
\ No newline at end of file
...@@ -21,7 +21,7 @@ async function runGenerateSchema(name: string, fieldNamesPath?: string, minCount ...@@ -21,7 +21,7 @@ async function runGenerateSchema(name: string, fieldNamesPath?: string, minCount
const parsed = await comp(); const parsed = await comp();
if (parsed.isError) throw parsed if (parsed.isError) throw parsed
console.log(fieldNamesPath, minCount) // console.log(fieldNamesPath, minCount)
let filter: Filter | undefined let filter: Filter | undefined
if (minCount && fieldNamesPath) { if (minCount && fieldNamesPath) {
...@@ -32,7 +32,6 @@ async function runGenerateSchema(name: string, fieldNamesPath?: string, minCount ...@@ -32,7 +32,6 @@ async function runGenerateSchema(name: string, fieldNamesPath?: string, minCount
} else if (minCount) { } else if (minCount) {
filter = await getUsageCountsFilter(minCount) filter = await getUsageCountsFilter(minCount)
} else if (fieldNamesPath) { } else if (fieldNamesPath) {
console.log('MOIN')
filter = await getFieldNamesFilter(fieldNamesPath) filter = await getFieldNamesFilter(fieldNamesPath)
} }
...@@ -59,11 +58,11 @@ async function getFieldNamesFilter(fieldNamesPath: string): Promise<Filter> { ...@@ -59,11 +58,11 @@ async function getFieldNamesFilter(fieldNamesPath: string): Promise<Filter> {
const filter: Filter = {} const filter: Filter = {}
fieldNames.forEach((name, i) => { fieldNames.forEach((name, i) => {
const [ category, field ] = name.split('.') const [ category, field ] = name.split('.')
console.log(category, field) // console.log(category, field)
if (!filter[ category ]) filter[ category ] = {} if (!filter[ category ]) filter[ category ] = {}
filter[ category ][ field ] = true filter[ category ][ field ] = true
}) })
console.log(filter) // console.log(filter)
return filter return filter
} }
...@@ -93,7 +92,7 @@ async function ensureMmcifDicAvailable() { ...@@ -93,7 +92,7 @@ async function ensureMmcifDicAvailable() {
if (FORCE_MMCIF_DOWNLOAD || !fs.existsSync(MMCIF_DIC_PATH)) { if (FORCE_MMCIF_DOWNLOAD || !fs.existsSync(MMCIF_DIC_PATH)) {
console.log('downloading mmcif dic...') console.log('downloading mmcif dic...')
const data = await fetch(MMCIF_DIC_URL) const data = await fetch(MMCIF_DIC_URL)
if (!fs.existsSync(MMCIF_DIC_DIR)){ if (!fs.existsSync(MMCIF_DIC_DIR)) {
fs.mkdirSync(MMCIF_DIC_DIR); fs.mkdirSync(MMCIF_DIC_DIR);
} }
fs.writeFileSync(MMCIF_DIC_PATH, await data.text()) fs.writeFileSync(MMCIF_DIC_PATH, await data.text())
......
...@@ -4,22 +4,25 @@ ...@@ -4,22 +4,25 @@
* @author Alexander Rose <alexander.rose@weirdbyte.de> * @author Alexander Rose <alexander.rose@weirdbyte.de>
*/ */
import { Database, Column } from './json-schema' import { Database, ValueColumn, ListColumn } from './json-schema'
import * as Data from 'mol-io/reader/cif/data-model' import * as Data from 'mol-io/reader/cif/data-model'
export function getFieldType (type: string, values?: string[]): Column { export function getFieldType (type: string, values?: string[]): ValueColumn|ListColumn {
switch (type) { switch (type) {
case 'code': case 'code':
case 'ucode': case 'ucode':
if (values && values.length) {
return { 'enum': values }
} else {
return 'str'
}
case 'line': case 'line':
case 'uline': case 'uline':
case 'text': case 'text':
case 'char': case 'char':
case 'uchar3':
case 'uchar1':
case 'boolean':
if (values && values.length) {
return { enum: [ 'str', values ] }
} else {
return 'str'
}
case 'aliasname': case 'aliasname':
case 'name': case 'name':
case 'idname': case 'idname':
...@@ -29,7 +32,6 @@ export function getFieldType (type: string, values?: string[]): Column { ...@@ -29,7 +32,6 @@ export function getFieldType (type: string, values?: string[]): Column {
case 'phone': case 'phone':
case 'email': case 'email':
case 'code30': case 'code30':
case 'ec-type':
case 'seq-one-letter-code': case 'seq-one-letter-code':
case 'author': case 'author':
case 'orcid_id': case 'orcid_id':
...@@ -44,27 +46,30 @@ export function getFieldType (type: string, values?: string[]): Column { ...@@ -44,27 +46,30 @@ export function getFieldType (type: string, values?: string[]): Column {
case 'float-range': case 'float-range':
case 'binary': case 'binary':
case 'operation_expression': case 'operation_expression':
case 'ucode-alphanum-csv':
case 'point_symmetry': case 'point_symmetry':
case 'id_list':
case '4x3_matrix': case '4x3_matrix':
case '3x4_matrices': case '3x4_matrices':
case 'point_group': case 'point_group':
case 'point_group_helical': case 'point_group_helical':
case 'boolean':
case 'symmetry_operation': case 'symmetry_operation':
case 'date_dep': case 'date_dep':
case 'uchar3':
case 'uchar1':
case 'url': case 'url':
case 'symop': case 'symop':
return 'str' return 'str'
case 'int': case 'int':
case 'non_negative_int': case 'non_negative_int':
case 'positive_int': case 'positive_int':
return 'int' if (values && values.length) {
return { enum: [ 'int', values ] }
} else {
return 'int'
}
case 'float': case 'float':
return 'float' return 'float'
case 'ec-type':
case 'ucode-alphanum-csv':
case 'id_list':
return { list: [ 'str', ',' ] }
} }
console.log(`unknown type '${type}'`) console.log(`unknown type '${type}'`)
return 'str' return 'str'
...@@ -94,10 +99,10 @@ function getField ( category: string, field: string, d: Data.Frame, ctx: FrameDa ...@@ -94,10 +99,10 @@ function getField ( category: string, field: string, d: Data.Frame, ctx: FrameDa
} }
} }
function getEnums (d: Data.Frame, ctx: FrameData): string[]|undefined { function getEnums (d: Data.Frame, ctx: FrameData) {
const value = getField('item_enumeration', 'value', d, ctx) const value = getField('item_enumeration', 'value', d, ctx)
const enums: string[] = []
if (value) { if (value) {
const enums: string[] = []
for (let i = 0; i < value.rowCount; ++i) { for (let i = 0; i < value.rowCount; ++i) {
enums.push(value.str(i)) enums.push(value.str(i))
// console.log(value.str(i)) // console.log(value.str(i))
...@@ -108,16 +113,10 @@ function getEnums (d: Data.Frame, ctx: FrameData): string[]|undefined { ...@@ -108,16 +113,10 @@ function getEnums (d: Data.Frame, ctx: FrameData): string[]|undefined {
} }
} }
function getCode (d: Data.Frame, ctx: FrameData): [string, string[]]|undefined { function getCode (d: Data.Frame, ctx: FrameData): [string, string[]|undefined]|undefined {
const code = getField('item_type', 'code', d, ctx) const code = getField('item_type', 'code', d, ctx)
if (code) { if (code) {
let c = code.str(0) return [ code.str(0), getEnums(d, ctx) ]
let e = []
if (c === 'ucode') {
const enums = getEnums(d, ctx)
if (enums) e.push(...enums)
}
return [c, e]
} else { } else {
console.log(`item_type.code not found for '${d.header}'`) console.log(`item_type.code not found for '${d.header}'`)
} }
...@@ -131,15 +130,46 @@ function getSubCategory (d: Data.Frame, ctx: FrameData): string|undefined { ...@@ -131,15 +130,46 @@ function getSubCategory (d: Data.Frame, ctx: FrameData): string|undefined {
} }
const FORCE_INT_FIELDS = [ const FORCE_INT_FIELDS = [
'_atom_site.id',
'_atom_site.auth_seq_id',
'_pdbx_struct_mod_residue.auth_seq_id',
'_struct_conf.beg_auth_seq_id', '_struct_conf.beg_auth_seq_id',
'_struct_conf.end_auth_seq_id', '_struct_conf.end_auth_seq_id',
'_struct_sheet_range.beg_auth_seq_id',
'_struct_sheet_range.end_auth_seq_id',
'_struct_conn.ptnr1_auth_seq_id', '_struct_conn.ptnr1_auth_seq_id',
'_struct_conn.ptnr2_auth_seq_id', '_struct_conn.ptnr2_auth_seq_id',
'_pdbx_struct_mod_residue.auth_seq_id', '_struct_sheet_range.beg_auth_seq_id',
'_atom_site.id', '_struct_sheet_range.end_auth_seq_id',
'_atom_site.auth_seq_id' ];
const COMMA_SEPARATED_LIST_FIELDS = [
'_atom_site.pdbx_struct_group_id',
'_chem_comp.mon_nstd_parent_comp_id',
'_diffrn_radiation.pdbx_wavelength_list',
'_diffrn_source.pdbx_wavelength_list',
'_em_diffraction.tilt_angle_list', // 20,40,50,55
'_em_entity_assembly.entity_id_list',
'_entity.pdbx_ec',
'_pdbx_depui_entry_details.experimental_methods',
'_pdbx_depui_entry_details.requested_accession_types',
'_pdbx_soln_scatter_model.software_list', // INSIGHT II, HOMOLOGY, DISCOVERY, BIOPOLYMER, DELPHI
'_pdbx_soln_scatter_model.software_author_list', // MSI
'_pdbx_soln_scatter_model.entry_fitting_list', // Odd example: 'PDB CODE 1HFI, 1HCC, 1HFH, 1VCC'
'_pdbx_struct_assembly_gen.entity_inst_id',
'_pdbx_struct_assembly_gen.asym_id_list',
'_pdbx_struct_assembly_gen.auth_asym_id_list',
'_pdbx_struct_assembly_gen_depositor_info.asym_id_list',
'_pdbx_struct_assembly_gen_depositor_info.chain_id_list',
'_pdbx_struct_group_list.group_enumeration_type',
'_reflns.pdbx_diffrn_id',
'_refine.pdbx_diffrn_id',
'_reflns_shell.pdbx_diffrn_id',
'_struct_keywords.text',
];
const SPACE_SEPARATED_LIST_FIELDS = [
'_chem_comp.pdbx_subcomponent_list', // TSM DPH HIS CHF EMR
'_pdbx_soln_scatter.data_reduction_software_list', // OTOKO
'_pdbx_soln_scatter.data_analysis_software_list', // SCTPL5 GNOM
]; ];
export function generateSchema (dic: Data.Block) { export function generateSchema (dic: Data.Block) {
...@@ -195,14 +225,24 @@ export function generateSchema (dic: Data.Block) { ...@@ -195,14 +225,24 @@ export function generateSchema (dic: Data.Block) {
} else { } else {
if (itemName.match(/\[[1-3]\]\[[1-3]\]/)) { if (itemName.match(/\[[1-3]\]\[[1-3]\]/)) {
fields[itemName.replace(/\[[1-3]\]\[[1-3]\]/, '')] = { 'matrix': [ 3, 3 ] } fields[itemName.replace(/\[[1-3]\]\[[1-3]\]/, '')] = { 'matrix': [ 3, 3 ] }
// console.log(`${d.header} should have 'matrix' _item_sub_category.id`) console.log(`${d.header} should have 'matrix' _item_sub_category.id`)
} else if (itemName.match(/\[[1-3]\]/)) { } else if (itemName.match(/\[[1-3]\]/)) {
fields[itemName.replace(/\[[1-3]\]/, '')] = { 'vector': [ 3 ] } fields[itemName.replace(/\[[1-3]\]/, '')] = { 'vector': [ 3 ] }
// console.log(`${d.header} should have 'vector' _item_sub_category.id`) console.log(`${d.header} should have 'vector' _item_sub_category.id`)
} else { } else {
const code = getCode(d, ctx) const code = getCode(d, ctx)
if (code) { if (code) {
fields[itemName] = getFieldType(code[0], code[1]) let fieldType = getFieldType(code[0], code[1]);
if (typeof fieldType === 'string') {
if (COMMA_SEPARATED_LIST_FIELDS.includes(d.header)) {
fieldType = { 'list': [ 'str', ',' ] };
console.log(`comma separated: ${d.header}`)
} else if (SPACE_SEPARATED_LIST_FIELDS.includes(d.header)) {
fieldType = { 'list': [ 'str', ' ' ] };
console.log(`space separated: ${d.header}`)
}
}
fields[itemName] = fieldType
} else { } else {
console.log(`could not determine code for '${d.header}'`) console.log(`could not determine code for '${d.header}'`)
} }
......
...@@ -27,7 +27,8 @@ const coord = Schema.coord; ...@@ -27,7 +27,8 @@ const coord = Schema.coord;
const Aliased = Schema.Aliased; const Aliased = Schema.Aliased;
const Matrix = Schema.Matrix; const Matrix = Schema.Matrix;
const Vector = Schema.Vector;` const Vector = Schema.Vector;
const List = Schema.List;`
} }
function footer (name: string) { function footer (name: string) {
...@@ -37,14 +38,23 @@ export interface ${name}_Database extends Database<${name}_Schema> { }` ...@@ -37,14 +38,23 @@ export interface ${name}_Database extends Database<${name}_Schema> { }`
} }
const value: { [k: string]: (...args: any[]) => string } = { const value: { [k: string]: (...args: any[]) => string } = {
enum: function (...values: string[]) { enum: function (type: string, values: string[]) {
return `Aliased<'${values.join(`' | '`)}'>(str)` return `Aliased<'${values.join(`' | '`)}'>(${type})`
}, },
matrix: function (rows: number, cols: number) { matrix: function (rows: number, cols: number) {
return `Matrix(${rows}, ${cols})` return `Matrix(${rows}, ${cols})`
}, },
vector: function (dim: number) { vector: function (dim: number) {
return `Vector(${dim})` return `Vector(${dim})`
},
list: function (type: 'str'|'int'|'float', separator: string) {
if (type === 'int') {
return `List('${separator}', x => parseInt(x, 10))`
} else if (type === 'float') {
return `List('${separator}', x => parseFloat(x))`
} else {
return `List('${separator}', x => x)`
}
} }
} }
...@@ -64,7 +74,7 @@ export function generate (name: string, schema: Database, fields?: Filter, impor ...@@ -64,7 +74,7 @@ export function generate (name: string, schema: Database, fields?: Filter, impor
codeLines.push(`export const ${name}_Schema = {`) codeLines.push(`export const ${name}_Schema = {`)
Object.keys(schema).forEach(table => { Object.keys(schema).forEach(table => {
if (fields && !fields[ table ]) return if (fields && !fields[ table ]) return
codeLines.push(`\t${safePropertyString(table)}: {`) codeLines.push(` ${safePropertyString(table)}: {`)
const columns = schema[ table ] const columns = schema[ table ]
Object.keys(columns).forEach(columnName => { Object.keys(columns).forEach(columnName => {
if (fields && !fields[ table ][ columnName ]) return if (fields && !fields[ table ][ columnName ]) return
...@@ -76,9 +86,9 @@ export function generate (name: string, schema: Database, fields?: Filter, impor ...@@ -76,9 +86,9 @@ export function generate (name: string, schema: Database, fields?: Filter, impor
} else { } else {
typeDef = fieldType typeDef = fieldType
} }
codeLines.push(`\t\t${safePropertyString(columnName)}: ${typeDef},`) codeLines.push(` ${safePropertyString(columnName)}: ${typeDef},`)
}) })
codeLines.push('\t},') codeLines.push(' },')
}) })
codeLines.push('}') codeLines.push('}')
......
...@@ -12,7 +12,8 @@ export interface Table { ...@@ -12,7 +12,8 @@ export interface Table {
[ columnName: string ]: Column [ columnName: string ]: Column
} }
export type Column = IntCol | StrCol | FloatCol | CoordCol | EnumCol | VectorCol | MatrixCol export type ValueColumn = IntCol | StrCol | FloatCol | CoordCol | EnumCol
export type Column = ValueColumn | VectorCol | MatrixCol | ListColumn
type IntCol = 'int' type IntCol = 'int'
type StrCol = 'str' type StrCol = 'str'
...@@ -24,7 +25,7 @@ interface ComplexColumn { ...@@ -24,7 +25,7 @@ interface ComplexColumn {
} }
interface EnumCol extends ComplexColumn { interface EnumCol extends ComplexColumn {
enum: string[] enum: [ IntCol | StrCol, string[] ]
} }
interface VectorCol extends ComplexColumn { interface VectorCol extends ComplexColumn {
...@@ -35,6 +36,10 @@ interface MatrixCol extends ComplexColumn { ...@@ -35,6 +36,10 @@ interface MatrixCol extends ComplexColumn {
matrix: [ number, number ] matrix: [ number, number ]
} }
export interface ListColumn extends ComplexColumn {
list: [ ValueColumn, string ]
}
export function getTypeAndArgs (column: ComplexColumn) { export function getTypeAndArgs (column: ComplexColumn) {
const type = Object.keys(column)[0] as string const type = Object.keys(column)[0] as string
const args = column[ type ] const args = column[ type ]
......
...@@ -7,12 +7,16 @@ ...@@ -7,12 +7,16 @@
import { Database, Table, Column } from './json-schema' import { Database, Table, Column } from './json-schema'
const SimpleColumnTypes = [ 'str', 'int', 'float', 'coord' ] const SimpleColumnTypes = [ 'str', 'int', 'float', 'coord' ]
const ComplexColumnTypes = [ 'enum', 'vector', 'matrix' ] const ComplexColumnTypes = [ 'enum', 'vector', 'matrix', 'list' ]
function allTrue<T> (list: T[], fn: (e: T) => boolean) { function allTrue<T> (list: T[], fn: (e: T) => boolean) {
return list.reduce((a, v) => a && fn(v), true) return list.reduce((a, v) => a && fn(v), true)
} }
function allString (list: string[]) {
return list.reduce((a, v) => a && typeof v === 'string', true)
}
function validateColumn (column: Column): true|Error { function validateColumn (column: Column): true|Error {
if (typeof column === 'string') { if (typeof column === 'string') {
if (!SimpleColumnTypes.includes(column)) { if (!SimpleColumnTypes.includes(column)) {
...@@ -31,8 +35,8 @@ function validateColumn (column: Column): true|Error { ...@@ -31,8 +35,8 @@ function validateColumn (column: Column): true|Error {
} }
switch (type) { switch (type) {
case 'enum': case 'enum':
if (!args.reduce((a, v) => a && typeof v === 'string', true)) { if (args.length !== 2 && (!allString(args[1]) && !allTrue(args[1], Number.isInteger))) {
return new Error(`enum column must have string args`) return new Error(`enum column must have all string or all integer args ${args}`)
} }
break; break;
case 'vector': case 'vector':
...@@ -45,6 +49,11 @@ function validateColumn (column: Column): true|Error { ...@@ -45,6 +49,11 @@ function validateColumn (column: Column): true|Error {
return new Error(`matrix column must have two integer args`) return new Error(`matrix column must have two integer args`)
} }
break; break;
case 'list':
if (args.length !== 2 || !allString(args)) {
return new Error(`list column must have two string args`)
}
break;
default: default:
return new Error(`complex column types must be one of '${ComplexColumnTypes.join(', ')}' not '${type}'`) return new Error(`complex column types must be one of '${ComplexColumnTypes.join(', ')}' not '${type}'`)
} }
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment