diff --git a/data/bird-field-names.csv b/data/bird-field-names.csv new file mode 100644 index 0000000000000000000000000000000000000000..9b79ef98ee95eff72d83d2ff6be6959863252bc7 --- /dev/null +++ b/data/bird-field-names.csv @@ -0,0 +1,88 @@ +pdbx_reference_molecule.prd_id +pdbx_reference_molecule.name +pdbx_reference_molecule.represent_as +pdbx_reference_molecule.type +pdbx_reference_molecule.type_evidence_code +pdbx_reference_molecule.class +pdbx_reference_molecule.class_evidence_code +pdbx_reference_molecule.formula +pdbx_reference_molecule.chem_comp_id +pdbx_reference_molecule.formula_weight +pdbx_reference_molecule.release_status +pdbx_reference_molecule.replaces +pdbx_reference_molecule.replaced_by +pdbx_reference_molecule.compound_detail +pdbx_reference_molecule.description +pdbx_reference_molecule.representative_PDB_id_code + +pdbx_reference_entity_list.prd_id +pdbx_reference_entity_list.ref_entity_id +pdbx_reference_entity_list.component_id +pdbx_reference_entity_list.type +pdbx_reference_entity_list.details + +pdbx_reference_entity_nonpoly.prd_id +pdbx_reference_entity_nonpoly.ref_entity_id +pdbx_reference_entity_nonpoly.name +pdbx_reference_entity_nonpoly.chem_comp_id + +pdbx_reference_entity_link.prd_id +pdbx_reference_entity_link.link_id +pdbx_reference_entity_link.link_class +pdbx_reference_entity_link.ref_entity_id_1 +pdbx_reference_entity_link.entity_seq_num_1 +pdbx_reference_entity_link.comp_id_1 +pdbx_reference_entity_link.atom_id_1 +pdbx_reference_entity_link.ref_entity_id_2 +pdbx_reference_entity_link.entity_seq_num_2 +pdbx_reference_entity_link.comp_id_2 +pdbx_reference_entity_link.atom_id_2 +pdbx_reference_entity_link.value_order +pdbx_reference_entity_link.component_1 +pdbx_reference_entity_link.component_2 +pdbx_reference_entity_link.details + +pdbx_reference_entity_poly_link.prd_id +pdbx_reference_entity_poly_link.ref_entity_id +pdbx_reference_entity_poly_link.link_id +pdbx_reference_entity_poly_link.atom_id_1 
+pdbx_reference_entity_poly_link.comp_id_1 +pdbx_reference_entity_poly_link.entity_seq_num_1 +pdbx_reference_entity_poly_link.atom_id_2 +pdbx_reference_entity_poly_link.comp_id_2 +pdbx_reference_entity_poly_link.entity_seq_num_2 +pdbx_reference_entity_poly_link.value_order +pdbx_reference_entity_poly_link.component_id + +pdbx_reference_entity_poly.prd_id +pdbx_reference_entity_poly.ref_entity_id +pdbx_reference_entity_poly.db_code +pdbx_reference_entity_poly.db_name +pdbx_reference_entity_poly.type + +pdbx_reference_entity_sequence.prd_id +pdbx_reference_entity_sequence.ref_entity_id +pdbx_reference_entity_sequence.type +pdbx_reference_entity_sequence.NRP_flag +pdbx_reference_entity_sequence.one_letter_codes + +pdbx_reference_entity_poly_seq.prd_id +pdbx_reference_entity_poly_seq.ref_entity_id +pdbx_reference_entity_poly_seq.num +pdbx_reference_entity_poly_seq.mon_id +pdbx_reference_entity_poly_seq.parent_mon_id +pdbx_reference_entity_poly_seq.hetero +pdbx_reference_entity_poly_seq.observed + +pdbx_reference_entity_src_nat.prd_id +pdbx_reference_entity_src_nat.ref_entity_id +pdbx_reference_entity_src_nat.ordinal +pdbx_reference_entity_src_nat.taxid +pdbx_reference_entity_src_nat.organism_scientific +pdbx_reference_entity_src_nat.db_code +pdbx_reference_entity_src_nat.db_name + +pdbx_prd_audit.prd_id +pdbx_prd_audit.date +pdbx_prd_audit.processing_site +pdbx_prd_audit.action_type \ No newline at end of file diff --git a/src/apps/schema-generator/schema-from-mmcif-dic.ts b/src/apps/schema-generator/schema-from-mmcif-dic.ts index 5cf4c58cf842a04c23bd266147bdc3076a3015dd..14859b34217695e44b43084a672debc175984e86 100644 --- a/src/apps/schema-generator/schema-from-mmcif-dic.ts +++ b/src/apps/schema-generator/schema-from-mmcif-dic.ts @@ -21,7 +21,7 @@ async function runGenerateSchema(name: string, fieldNamesPath?: string, minCount const parsed = await comp(); if (parsed.isError) throw parsed - console.log(fieldNamesPath, minCount) + // console.log(fieldNamesPath, 
minCount) let filter: Filter | undefined if (minCount && fieldNamesPath) { @@ -32,7 +32,6 @@ async function runGenerateSchema(name: string, fieldNamesPath?: string, minCount } else if (minCount) { filter = await getUsageCountsFilter(minCount) } else if (fieldNamesPath) { - console.log('MOIN') filter = await getFieldNamesFilter(fieldNamesPath) } @@ -59,11 +58,11 @@ async function getFieldNamesFilter(fieldNamesPath: string): Promise<Filter> { const filter: Filter = {} fieldNames.forEach((name, i) => { const [ category, field ] = name.split('.') - console.log(category, field) + // console.log(category, field) if (!filter[ category ]) filter[ category ] = {} filter[ category ][ field ] = true }) - console.log(filter) + // console.log(filter) return filter } @@ -93,7 +92,7 @@ async function ensureMmcifDicAvailable() { if (FORCE_MMCIF_DOWNLOAD || !fs.existsSync(MMCIF_DIC_PATH)) { console.log('downloading mmcif dic...') const data = await fetch(MMCIF_DIC_URL) - if (!fs.existsSync(MMCIF_DIC_DIR)){ + if (!fs.existsSync(MMCIF_DIC_DIR)) { fs.mkdirSync(MMCIF_DIC_DIR); } fs.writeFileSync(MMCIF_DIC_PATH, await data.text()) diff --git a/src/apps/schema-generator/util/cif-dic.ts b/src/apps/schema-generator/util/cif-dic.ts index 28ac26942ea6acbd6cde714989a09a70dd7b5183..13d1cc670f4709dd8bbfa57b6037b08bec139425 100644 --- a/src/apps/schema-generator/util/cif-dic.ts +++ b/src/apps/schema-generator/util/cif-dic.ts @@ -4,22 +4,25 @@ * @author Alexander Rose <alexander.rose@weirdbyte.de> */ -import { Database, Column } from './json-schema' +import { Database, ValueColumn, ListColumn } from './json-schema' import * as Data from 'mol-io/reader/cif/data-model' -export function getFieldType (type: string, values?: string[]): Column { +export function getFieldType (type: string, values?: string[]): ValueColumn|ListColumn { switch (type) { case 'code': case 'ucode': - if (values && values.length) { - return { 'enum': values } - } else { - return 'str' - } case 'line': case 'uline': case 
'text': case 'char': + case 'uchar3': + case 'uchar1': + case 'boolean': + if (values && values.length) { + return { enum: [ 'str', values ] } + } else { + return 'str' + } case 'aliasname': case 'name': case 'idname': @@ -29,7 +32,6 @@ export function getFieldType (type: string, values?: string[]): Column { case 'phone': case 'email': case 'code30': - case 'ec-type': case 'seq-one-letter-code': case 'author': case 'orcid_id': @@ -44,27 +46,30 @@ export function getFieldType (type: string, values?: string[]): Column { case 'float-range': case 'binary': case 'operation_expression': - case 'ucode-alphanum-csv': case 'point_symmetry': - case 'id_list': case '4x3_matrix': case '3x4_matrices': case 'point_group': case 'point_group_helical': - case 'boolean': case 'symmetry_operation': case 'date_dep': - case 'uchar3': - case 'uchar1': case 'url': case 'symop': return 'str' case 'int': case 'non_negative_int': case 'positive_int': - return 'int' + if (values && values.length) { + return { enum: [ 'int', values ] } + } else { + return 'int' + } case 'float': return 'float' + case 'ec-type': + case 'ucode-alphanum-csv': + case 'id_list': + return { list: [ 'str', ',' ] } } console.log(`unknown type '${type}'`) return 'str' @@ -94,10 +99,10 @@ function getField ( category: string, field: string, d: Data.Frame, ctx: FrameDa } } -function getEnums (d: Data.Frame, ctx: FrameData): string[]|undefined { +function getEnums (d: Data.Frame, ctx: FrameData) { const value = getField('item_enumeration', 'value', d, ctx) + const enums: string[] = [] if (value) { - const enums: string[] = [] for (let i = 0; i < value.rowCount; ++i) { enums.push(value.str(i)) // console.log(value.str(i)) @@ -108,16 +113,10 @@ function getEnums (d: Data.Frame, ctx: FrameData): string[]|undefined { } } -function getCode (d: Data.Frame, ctx: FrameData): [string, string[]]|undefined { +function getCode (d: Data.Frame, ctx: FrameData): [string, string[]|undefined]|undefined { const code = 
getField('item_type', 'code', d, ctx) if (code) { - let c = code.str(0) - let e = [] - if (c === 'ucode') { - const enums = getEnums(d, ctx) - if (enums) e.push(...enums) - } - return [c, e] + return [ code.str(0), getEnums(d, ctx) ] } else { console.log(`item_type.code not found for '${d.header}'`) } @@ -131,15 +130,46 @@ function getSubCategory (d: Data.Frame, ctx: FrameData): string|undefined { } const FORCE_INT_FIELDS = [ + '_atom_site.id', + '_atom_site.auth_seq_id', + '_pdbx_struct_mod_residue.auth_seq_id', '_struct_conf.beg_auth_seq_id', '_struct_conf.end_auth_seq_id', - '_struct_sheet_range.beg_auth_seq_id', - '_struct_sheet_range.end_auth_seq_id', '_struct_conn.ptnr1_auth_seq_id', '_struct_conn.ptnr2_auth_seq_id', - '_pdbx_struct_mod_residue.auth_seq_id', - '_atom_site.id', - '_atom_site.auth_seq_id' + '_struct_sheet_range.beg_auth_seq_id', + '_struct_sheet_range.end_auth_seq_id', +]; + +const COMMA_SEPARATED_LIST_FIELDS = [ + '_atom_site.pdbx_struct_group_id', + '_chem_comp.mon_nstd_parent_comp_id', + '_diffrn_radiation.pdbx_wavelength_list', + '_diffrn_source.pdbx_wavelength_list', + '_em_diffraction.tilt_angle_list', // 20,40,50,55 + '_em_entity_assembly.entity_id_list', + '_entity.pdbx_ec', + '_pdbx_depui_entry_details.experimental_methods', + '_pdbx_depui_entry_details.requested_accession_types', + '_pdbx_soln_scatter_model.software_list', // INSIGHT II, HOMOLOGY, DISCOVERY, BIOPOLYMER, DELPHI + '_pdbx_soln_scatter_model.software_author_list', // MSI + '_pdbx_soln_scatter_model.entry_fitting_list', // Odd example: 'PDB CODE 1HFI, 1HCC, 1HFH, 1VCC' + '_pdbx_struct_assembly_gen.entity_inst_id', + '_pdbx_struct_assembly_gen.asym_id_list', + '_pdbx_struct_assembly_gen.auth_asym_id_list', + '_pdbx_struct_assembly_gen_depositor_info.asym_id_list', + '_pdbx_struct_assembly_gen_depositor_info.chain_id_list', + '_pdbx_struct_group_list.group_enumeration_type', + '_reflns.pdbx_diffrn_id', + '_refine.pdbx_diffrn_id', + '_reflns_shell.pdbx_diffrn_id', + 
'_struct_keywords.text', +]; + +const SPACE_SEPARATED_LIST_FIELDS = [ + '_chem_comp.pdbx_subcomponent_list', // TSM DPH HIS CHF EMR + '_pdbx_soln_scatter.data_reduction_software_list', // OTOKO + '_pdbx_soln_scatter.data_analysis_software_list', // SCTPL5 GNOM ]; export function generateSchema (dic: Data.Block) { @@ -195,14 +225,24 @@ export function generateSchema (dic: Data.Block) { } else { if (itemName.match(/\[[1-3]\]\[[1-3]\]/)) { fields[itemName.replace(/\[[1-3]\]\[[1-3]\]/, '')] = { 'matrix': [ 3, 3 ] } - // console.log(`${d.header} should have 'matrix' _item_sub_category.id`) + console.log(`${d.header} should have 'matrix' _item_sub_category.id`) } else if (itemName.match(/\[[1-3]\]/)) { fields[itemName.replace(/\[[1-3]\]/, '')] = { 'vector': [ 3 ] } - // console.log(`${d.header} should have 'vector' _item_sub_category.id`) + console.log(`${d.header} should have 'vector' _item_sub_category.id`) } else { const code = getCode(d, ctx) if (code) { - fields[itemName] = getFieldType(code[0], code[1]) + let fieldType = getFieldType(code[0], code[1]); + if (typeof fieldType === 'string') { + if (COMMA_SEPARATED_LIST_FIELDS.includes(d.header)) { + fieldType = { 'list': [ 'str', ',' ] }; + console.log(`comma separated: ${d.header}`) + } else if (SPACE_SEPARATED_LIST_FIELDS.includes(d.header)) { + fieldType = { 'list': [ 'str', ' ' ] }; + console.log(`space separated: ${d.header}`) + } + } + fields[itemName] = fieldType } else { console.log(`could not determine code for '${d.header}'`) } diff --git a/src/apps/schema-generator/util/generate.ts b/src/apps/schema-generator/util/generate.ts index 22364bd6324ec0bf716f882701696747530d7afb..6fd7cbfb96555c0c2995b3e0f61252abc7e58c73 100644 --- a/src/apps/schema-generator/util/generate.ts +++ b/src/apps/schema-generator/util/generate.ts @@ -27,7 +27,8 @@ const coord = Schema.coord; const Aliased = Schema.Aliased; const Matrix = Schema.Matrix; -const Vector = Schema.Vector;` +const Vector = Schema.Vector; +const List = 
Schema.List;` } function footer (name: string) { @@ -37,14 +38,23 @@ export interface ${name}_Database extends Database<${name}_Schema> { }` } const value: { [k: string]: (...args: any[]) => string } = { - enum: function (...values: string[]) { - return `Aliased<'${values.join(`' | '`)}'>(str)` + enum: function (type: string, values: string[]) { + return `Aliased<${values.map(v => type === 'int' ? v : `'${v}'`).join(' | ')}>(${type})` }, matrix: function (rows: number, cols: number) { return `Matrix(${rows}, ${cols})` }, vector: function (dim: number) { return `Vector(${dim})` + }, + list: function (type: 'str'|'int'|'float', separator: string) { + if (type === 'int') { + return `List('${separator}', x => parseInt(x, 10))` + } else if (type === 'float') { + return `List('${separator}', x => parseFloat(x))` + } else { + return `List('${separator}', x => x)` + } } } @@ -64,7 +74,7 @@ export function generate (name: string, schema: Database, fields?: Filter, impor codeLines.push(`export const ${name}_Schema = {`) Object.keys(schema).forEach(table => { if (fields && !fields[ table ]) return - codeLines.push(`\t${safePropertyString(table)}: {`) + codeLines.push(` ${safePropertyString(table)}: {`) const columns = schema[ table ] Object.keys(columns).forEach(columnName => { if (fields && !fields[ table ][ columnName ]) return @@ -76,9 +86,9 @@ export function generate (name: string, schema: Database, fields?: Filter, impor } else { typeDef = fieldType } - codeLines.push(`\t\t${safePropertyString(columnName)}: ${typeDef},`) + codeLines.push(` ${safePropertyString(columnName)}: ${typeDef},`) }) - codeLines.push('\t},') + codeLines.push(' },') }) codeLines.push('}') diff --git a/src/apps/schema-generator/util/json-schema.ts b/src/apps/schema-generator/util/json-schema.ts index 45c662d93a9b93aabf6eba1cde5ba076e0d69bf7..ed2e3139ed5f235c9514b02ded46f31c3dc2f349 100644 --- a/src/apps/schema-generator/util/json-schema.ts +++ b/src/apps/schema-generator/util/json-schema.ts @@ -12,7 +12,8 @@ export interface Table { 
[ columnName: string ]: Column } -export type Column = IntCol | StrCol | FloatCol | CoordCol | EnumCol | VectorCol | MatrixCol +export type ValueColumn = IntCol | StrCol | FloatCol | CoordCol | EnumCol +export type Column = ValueColumn | VectorCol | MatrixCol | ListColumn type IntCol = 'int' type StrCol = 'str' @@ -24,7 +25,7 @@ interface ComplexColumn { } interface EnumCol extends ComplexColumn { - enum: string[] + enum: [ IntCol | StrCol, string[] ] } interface VectorCol extends ComplexColumn { @@ -35,6 +36,10 @@ interface MatrixCol extends ComplexColumn { matrix: [ number, number ] } +export interface ListColumn extends ComplexColumn { + list: [ ValueColumn, string ] +} + export function getTypeAndArgs (column: ComplexColumn) { const type = Object.keys(column)[0] as string const args = column[ type ] diff --git a/src/apps/schema-generator/util/validate.ts b/src/apps/schema-generator/util/validate.ts index 08355031d61d86d5ec083af37aee11a18dc4e130..cda7677ed89e05a5b1d5ab94c05473f5cfeba14e 100644 --- a/src/apps/schema-generator/util/validate.ts +++ b/src/apps/schema-generator/util/validate.ts @@ -7,12 +7,16 @@ import { Database, Table, Column } from './json-schema' const SimpleColumnTypes = [ 'str', 'int', 'float', 'coord' ] -const ComplexColumnTypes = [ 'enum', 'vector', 'matrix' ] +const ComplexColumnTypes = [ 'enum', 'vector', 'matrix', 'list' ] function allTrue<T> (list: T[], fn: (e: T) => boolean) { return list.reduce((a, v) => a && fn(v), true) } +function allString (list: string[]) { + return list.reduce((a, v) => a && typeof v === 'string', true) +} + function validateColumn (column: Column): true|Error { if (typeof column === 'string') { if (!SimpleColumnTypes.includes(column)) { @@ -31,8 +35,8 @@ function validateColumn (column: Column): true|Error { } switch (type) { case 'enum': - if (!args.reduce((a, v) => a && typeof v === 'string', true)) { - return new Error(`enum column must have string args`) + if (args.length !== 2 || !Array.isArray(args[1]) || (!allString(args[1]) && 
!allTrue(args[1], Number.isInteger))) { + return new Error(`enum column must have all string or all integer args ${args}`) } break; case 'vector': @@ -45,6 +49,11 @@ function validateColumn (column: Column): true|Error { return new Error(`matrix column must have two integer args`) } break; + case 'list': + if (args.length !== 2 || !allString(args)) { + return new Error(`list column must have two string args`) + } + break; default: return new Error(`complex column types must be one of '${ComplexColumnTypes.join(', ')}' not '${type}'`) }