diff --git a/README.md b/README.md index 81160af21366484a03a8b104294ed8adcaf497b7..5b5ead33d2cf3cd0239daa67e14d2cc5833bc64d 100644 --- a/README.md +++ b/README.md @@ -89,12 +89,11 @@ and navigate to `build/viewer` ### Code generation **CIF schemas** -Install CIFTools `npm install ciftools -g` - cifschema -mip ../../../../mol-data -o src/mol-io/reader/cif/schema/mmcif.ts -p mmCIF - cifschema -mip ../../../../mol-data -o src/mol-io/reader/cif/schema/ccd.ts -p CCD - cifschema -mip ../../../../mol-data -o src/mol-io/reader/cif/schema/bird.ts -p BIRD - cifschema -mip ../../../../mol-data -o src/mol-io/reader/cif/schema/cif-core.ts -p CifCore -aa + node ./lib/apps/cifschema -mip ../../../../mol-data -o src/mol-io/reader/cif/schema/mmcif.ts -p mmCIF + node ./lib/apps/cifschema -mip ../../../../mol-data -o src/mol-io/reader/cif/schema/ccd.ts -p CCD + node ./lib/apps/cifschema -mip ../../../../mol-data -o src/mol-io/reader/cif/schema/bird.ts -p BIRD + node ./lib/apps/cifschema -mip ../../../../mol-data -o src/mol-io/reader/cif/schema/cif-core.ts -p CifCore -aa **GraphQL schemas** @@ -103,7 +102,7 @@ Install CIFTools `npm install ciftools -g` ### Other scripts **Create chem comp bond table** - export NODE_PATH="lib"; node --max-old-space-size=4096 lib/apps/chem-comp-bond/create-table.js build/data/ccb.bcif -b + node --max-old-space-size=4096 lib/apps/chem-comp-bond/create-table.js build/data/ccb.bcif -b **Test model server** @@ -119,6 +118,10 @@ Install CIFTools `npm install ciftools -g` To see all available commands, use ``node build/model-server/preprocess -h``. 
+Or + + node ./lib/apps/cif2bcif + ## Development ### Installation diff --git a/data/cif-field-names/bird-field-names.csv b/data/cif-field-names/bird-field-names.csv new file mode 100644 index 0000000000000000000000000000000000000000..c214ad3694cf0ce92eb573244b5a33985c576e3d --- /dev/null +++ b/data/cif-field-names/bird-field-names.csv @@ -0,0 +1,88 @@ +pdbx_reference_molecule.prd_id +pdbx_reference_molecule.name +pdbx_reference_molecule.represent_as +pdbx_reference_molecule.type +pdbx_reference_molecule.type_evidence_code +pdbx_reference_molecule.class +pdbx_reference_molecule.class_evidence_code +pdbx_reference_molecule.formula +pdbx_reference_molecule.chem_comp_id +pdbx_reference_molecule.formula_weight +pdbx_reference_molecule.release_status +pdbx_reference_molecule.replaces +pdbx_reference_molecule.replaced_by +pdbx_reference_molecule.compound_details +pdbx_reference_molecule.description +pdbx_reference_molecule.representative_PDB_id_code + +pdbx_reference_entity_list.prd_id +pdbx_reference_entity_list.ref_entity_id +pdbx_reference_entity_list.component_id +pdbx_reference_entity_list.type +pdbx_reference_entity_list.details + +pdbx_reference_entity_nonpoly.prd_id +pdbx_reference_entity_nonpoly.ref_entity_id +pdbx_reference_entity_nonpoly.name +pdbx_reference_entity_nonpoly.chem_comp_id + +pdbx_reference_entity_link.prd_id +pdbx_reference_entity_link.link_id +pdbx_reference_entity_link.link_class +pdbx_reference_entity_link.ref_entity_id_1 +pdbx_reference_entity_link.entity_seq_num_1 +pdbx_reference_entity_link.comp_id_1 +pdbx_reference_entity_link.atom_id_1 +pdbx_reference_entity_link.ref_entity_id_2 +pdbx_reference_entity_link.entity_seq_num_2 +pdbx_reference_entity_link.comp_id_2 +pdbx_reference_entity_link.atom_id_2 +pdbx_reference_entity_link.value_order +pdbx_reference_entity_link.component_1 +pdbx_reference_entity_link.component_2 +pdbx_reference_entity_link.details + +pdbx_reference_entity_poly_link.prd_id +pdbx_reference_entity_poly_link.ref_entity_id 
+pdbx_reference_entity_poly_link.link_id +pdbx_reference_entity_poly_link.atom_id_1 +pdbx_reference_entity_poly_link.comp_id_1 +pdbx_reference_entity_poly_link.entity_seq_num_1 +pdbx_reference_entity_poly_link.atom_id_2 +pdbx_reference_entity_poly_link.comp_id_2 +pdbx_reference_entity_poly_link.entity_seq_num_2 +pdbx_reference_entity_poly_link.value_order +pdbx_reference_entity_poly_link.component_id + +pdbx_reference_entity_poly.prd_id +pdbx_reference_entity_poly.ref_entity_id +pdbx_reference_entity_poly.db_code +pdbx_reference_entity_poly.db_name +pdbx_reference_entity_poly.type + +pdbx_reference_entity_sequence.prd_id +pdbx_reference_entity_sequence.ref_entity_id +pdbx_reference_entity_sequence.type +pdbx_reference_entity_sequence.NRP_flag +pdbx_reference_entity_sequence.one_letter_codes + +pdbx_reference_entity_poly_seq.prd_id +pdbx_reference_entity_poly_seq.ref_entity_id +pdbx_reference_entity_poly_seq.num +pdbx_reference_entity_poly_seq.mon_id +pdbx_reference_entity_poly_seq.parent_mon_id +pdbx_reference_entity_poly_seq.hetero +pdbx_reference_entity_poly_seq.observed + +pdbx_reference_entity_src_nat.prd_id +pdbx_reference_entity_src_nat.ref_entity_id +pdbx_reference_entity_src_nat.ordinal +pdbx_reference_entity_src_nat.taxid +pdbx_reference_entity_src_nat.organism_scientific +pdbx_reference_entity_src_nat.db_code +pdbx_reference_entity_src_nat.db_name + +pdbx_prd_audit.prd_id +pdbx_prd_audit.date +pdbx_prd_audit.processing_site +pdbx_prd_audit.action_type \ No newline at end of file diff --git a/data/cif-field-names/ccd-field-names.csv b/data/cif-field-names/ccd-field-names.csv new file mode 100644 index 0000000000000000000000000000000000000000..811f7bdc5aa5865b60c24b30f86ee95d3695c66c --- /dev/null +++ b/data/cif-field-names/ccd-field-names.csv @@ -0,0 +1,60 @@ +chem_comp.id +chem_comp.name +chem_comp.type +chem_comp.pdbx_type +chem_comp.formula +chem_comp.mon_nstd_parent_comp_id +chem_comp.pdbx_synonyms +chem_comp.pdbx_formal_charge 
+chem_comp.pdbx_initial_date +chem_comp.pdbx_modified_date +chem_comp.pdbx_ambiguous_flag +chem_comp.pdbx_release_status +chem_comp.pdbx_replaced_by +chem_comp.pdbx_replaces +chem_comp.formula_weight +chem_comp.one_letter_code +chem_comp.three_letter_code +chem_comp.pdbx_model_coordinates_details +chem_comp.pdbx_model_coordinates_missing_flag +chem_comp.pdbx_ideal_coordinates_details +chem_comp.pdbx_ideal_coordinates_missing_flag +chem_comp.pdbx_model_coordinates_db_code +chem_comp.pdbx_processing_site + +chem_comp_atom.comp_id +chem_comp_atom.atom_id +chem_comp_atom.alt_atom_id +chem_comp_atom.type_symbol +chem_comp_atom.charge +chem_comp_atom.pdbx_align +chem_comp_atom.pdbx_aromatic_flag +chem_comp_atom.pdbx_leaving_atom_flag +chem_comp_atom.pdbx_stereo_config +chem_comp_atom.model_Cartn_x +chem_comp_atom.model_Cartn_y +chem_comp_atom.model_Cartn_z +chem_comp_atom.pdbx_model_Cartn_x_ideal +chem_comp_atom.pdbx_model_Cartn_y_ideal +chem_comp_atom.pdbx_model_Cartn_z_ideal +chem_comp_atom.pdbx_ordinal + +chem_comp_bond.comp_id +chem_comp_bond.atom_id_1 +chem_comp_bond.atom_id_2 +chem_comp_bond.value_order +chem_comp_bond.pdbx_aromatic_flag +chem_comp_bond.pdbx_stereo_config +chem_comp_bond.pdbx_ordinal + +pdbx_chem_comp_descriptor.comp_id +pdbx_chem_comp_descriptor.type +pdbx_chem_comp_descriptor.program +pdbx_chem_comp_descriptor.program_version +pdbx_chem_comp_descriptor.descriptor + +pdbx_chem_comp_identifier.comp_id +pdbx_chem_comp_identifier.type +pdbx_chem_comp_identifier.program +pdbx_chem_comp_identifier.program_version +pdbx_chem_comp_identifier.identifier \ No newline at end of file diff --git a/data/cif-field-names/cif-core-field-names.csv b/data/cif-field-names/cif-core-field-names.csv new file mode 100644 index 0000000000000000000000000000000000000000..25ce0320c9e0e7c60a833309094d6a27814b7a4c --- /dev/null +++ b/data/cif-field-names/cif-core-field-names.csv @@ -0,0 +1,60 @@ +audit.block_doi + +database_code.depnum_ccdc_archive + +chemical.name_systematic 
+chemical.name_common +chemical.melting_point + +chemical_formula.moiety +chemical_formula.sum +chemical_formula.weight + +atom_type.symbol +atom_type.description + +atom_type_scat.dispersion_real +atom_type_scat.dispersion_imag +atom_type_scat.source + +space_group.crystal_system +space_group.name_H-M_full +space_group_symop.operation_xyz + +cell.length_a +cell.length_b +cell.length_c +cell.angle_alpha +cell.angle_beta +cell.angle_gamma +cell.volume +cell.formula_units_Z + +atom_site.label +atom_site.type_symbol +atom_site.fract_x +atom_site.fract_y +atom_site.fract_z +atom_site.U_iso_or_equiv +atom_site.adp_type +atom_site.occupancy +atom_site.calc_flag +atom_site.refinement_flags +atom_site.disorder_assembly +atom_site.disorder_group + +atom_site.site_symmetry_multiplicity + +atom_site_aniso.label +atom_site_aniso.U_11 +atom_site_aniso.U_22 +atom_site_aniso.U_33 +atom_site_aniso.U_23 +atom_site_aniso.U_13 +atom_site_aniso.U_12 + +geom_bond.atom_site_label_1 +geom_bond.atom_site_label_2 +geom_bond.distance +geom_bond.site_symmetry_2 +geom_bond.publ_flag \ No newline at end of file diff --git a/data/cif-field-names/mmcif-field-names.csv b/data/cif-field-names/mmcif-field-names.csv new file mode 100644 index 0000000000000000000000000000000000000000..825f2b931474b08d233bd113c7c5854f65b58d20 --- /dev/null +++ b/data/cif-field-names/mmcif-field-names.csv @@ -0,0 +1,805 @@ +atom_sites.entry_id +atom_sites.fract_transf_matrix +atom_sites.fract_transf_vector + +atom_site.group_PDB +atom_site.id +atom_site.type_symbol +atom_site.label_atom_id +atom_site.label_alt_id +atom_site.label_comp_id +atom_site.label_asym_id +atom_site.label_entity_id +atom_site.label_seq_id +atom_site.pdbx_PDB_ins_code +atom_site.pdbx_formal_charge +atom_site.Cartn_x +atom_site.Cartn_y +atom_site.Cartn_z +atom_site.occupancy +atom_site.B_iso_or_equiv +atom_site.auth_atom_id +atom_site.auth_comp_id +atom_site.auth_asym_id +atom_site.auth_seq_id +atom_site.pdbx_PDB_model_num +atom_site.ihm_model_id 
+ +atom_site_anisotrop.id +atom_site_anisotrop.U +atom_site_anisotrop.U_esd +atom_site_anisotrop.pdbx_PDB_ins_code +atom_site_anisotrop.pdbx_auth_asym_id +atom_site_anisotrop.pdbx_auth_atom_id +atom_site_anisotrop.pdbx_auth_comp_id +atom_site_anisotrop.pdbx_auth_seq_id +atom_site_anisotrop.pdbx_label_alt_id +atom_site_anisotrop.pdbx_label_asym_id +atom_site_anisotrop.pdbx_label_atom_id +atom_site_anisotrop.pdbx_label_comp_id +atom_site_anisotrop.pdbx_label_seq_id +atom_site_anisotrop.type_symbol + +chem_comp.id +chem_comp.type +chem_comp.mon_nstd_flag +chem_comp.name +chem_comp.pdbx_synonyms +chem_comp.formula +chem_comp.formula_weight + +chem_comp_bond.comp_id +chem_comp_bond.pdbx_stereo_config +chem_comp_bond.pdbx_ordinal +chem_comp_bond.pdbx_aromatic_flag +chem_comp_bond.atom_id_1 +chem_comp_bond.atom_id_2 +chem_comp_bond.value_order + +pdbx_chem_comp_identifier.comp_id +pdbx_chem_comp_identifier.type +pdbx_chem_comp_identifier.program +pdbx_chem_comp_identifier.program_version +pdbx_chem_comp_identifier.identifier + +pdbx_chem_comp_related.comp_id +pdbx_chem_comp_related.related_comp_id +pdbx_chem_comp_related.relationship_type +pdbx_chem_comp_related.details + +pdbx_chem_comp_synonyms.comp_id +pdbx_chem_comp_synonyms.name +pdbx_chem_comp_synonyms.provenance + +cell.entry_id +cell.length_a +cell.length_b +cell.length_c +cell.angle_alpha +cell.angle_beta +cell.angle_gamma +cell.Z_PDB +cell.pdbx_unique_axis + +pdbx_database_related.db_name +pdbx_database_related.details +pdbx_database_related.db_id +pdbx_database_related.content_type + +pdbx_database_status.status_code +pdbx_database_status.status_code_sf +pdbx_database_status.status_code_mr +pdbx_database_status.entry_id +pdbx_database_status.recvd_initial_deposition_date +pdbx_database_status.SG_entry +pdbx_database_status.deposit_site +pdbx_database_status.process_site +pdbx_database_status.status_code_cs +pdbx_database_status.methods_development_category +pdbx_database_status.pdb_format_compatible + 
+entity.id +entity.type +entity.src_method +entity.pdbx_description +entity.formula_weight +entity.pdbx_number_of_molecules +entity.details +entity.pdbx_mutation +entity.pdbx_fragment +entity.pdbx_ec + +entity_poly.entity_id +entity_poly.type +entity_poly.nstd_linkage +entity_poly.nstd_monomer +entity_poly.pdbx_seq_one_letter_code +entity_poly.pdbx_seq_one_letter_code_can +entity_poly.pdbx_strand_id +entity_poly.pdbx_target_identifier + +entity_poly_seq.entity_id +entity_poly_seq.num +entity_poly_seq.mon_id +entity_poly_seq.hetero + +entity_src_gen.entity_id +entity_src_gen.pdbx_src_id +entity_src_gen.pdbx_beg_seq_num +entity_src_gen.pdbx_end_seq_num +entity_src_gen.pdbx_gene_src_gene +entity_src_gen.pdbx_gene_src_scientific_name +entity_src_gen.plasmid_name + +entity_src_nat.entity_id +entity_src_nat.pdbx_src_id +entity_src_nat.pdbx_beg_seq_num +entity_src_nat.pdbx_end_seq_num +entity_src_nat.pdbx_organism_scientific +entity_src_nat.pdbx_plasmid_name + +pdbx_entity_instance_feature.ordinal +pdbx_entity_instance_feature.feature_type +pdbx_entity_instance_feature.details +pdbx_entity_instance_feature.asym_id +pdbx_entity_instance_feature.comp_id +pdbx_entity_instance_feature.seq_num +pdbx_entity_instance_feature.auth_asym_id +pdbx_entity_instance_feature.auth_comp_id +pdbx_entity_instance_feature.auth_seq_num + +pdbx_entity_src_syn.entity_id +pdbx_entity_src_syn.pdbx_src_id +pdbx_entity_src_syn.pdbx_beg_seq_num +pdbx_entity_src_syn.pdbx_end_seq_num +pdbx_entity_src_syn.organism_scientific + +pdbx_entity_branch.entity_id +pdbx_entity_branch.type + +pdbx_entity_branch_list.entity_id +pdbx_entity_branch_list.comp_id +pdbx_entity_branch_list.num +pdbx_entity_branch_list.hetero + +pdbx_entity_branch_link.link_id +pdbx_entity_branch_link.entity_id +pdbx_entity_branch_link.entity_branch_list_num_1 +pdbx_entity_branch_link.comp_id_1 +pdbx_entity_branch_link.atom_id_1 +pdbx_entity_branch_link.leaving_atom_id_1 +pdbx_entity_branch_link.atom_stereo_config_1 
+pdbx_entity_branch_link.entity_branch_list_num_2 +pdbx_entity_branch_link.comp_id_2 +pdbx_entity_branch_link.atom_id_2 +pdbx_entity_branch_link.leaving_atom_id_2 +pdbx_entity_branch_link.atom_stereo_config_2 +pdbx_entity_branch_link.value_order +pdbx_entity_branch_link.details + +pdbx_branch_scheme.asym_id +pdbx_branch_scheme.entity_id +pdbx_branch_scheme.mon_id +pdbx_branch_scheme.num +pdbx_branch_scheme.auth_asym_id +pdbx_branch_scheme.auth_mon_id +pdbx_branch_scheme.auth_seq_num +pdbx_branch_scheme.hetero +pdbx_branch_scheme.pdb_mon_id +pdbx_branch_scheme.pdb_asym_id +pdbx_branch_scheme.pdb_seq_num + +pdbx_entity_branch_descriptor.ordinal +pdbx_entity_branch_descriptor.entity_id +pdbx_entity_branch_descriptor.descriptor +pdbx_entity_branch_descriptor.type +pdbx_entity_branch_descriptor.program +pdbx_entity_branch_descriptor.program_version + +pdbx_entity_nonpoly.entity_id +pdbx_entity_nonpoly.name +pdbx_entity_nonpoly.comp_id + +pdbx_nonpoly_scheme.asym_id +pdbx_nonpoly_scheme.entity_id +pdbx_nonpoly_scheme.mon_id +pdbx_nonpoly_scheme.ndb_seq_num +pdbx_nonpoly_scheme.pdb_seq_num +pdbx_nonpoly_scheme.auth_seq_num +pdbx_nonpoly_scheme.pdb_mon_id +pdbx_nonpoly_scheme.auth_mon_id +pdbx_nonpoly_scheme.pdb_strand_id +pdbx_nonpoly_scheme.pdb_ins_code + +entry.id + +audit_conform.dict_name +audit_conform.dict_version +audit_conform.dict_location + +database_2.database_id +database_2.database_code + +audit_author.name +audit_author.pdbx_ordinal +audit_author.identifier_ORCID + +citation.id +citation.title +citation.journal_abbrev +citation.journal_volume +citation.page_first +citation.page_last +citation.year +citation.journal_id_ASTM +citation.country +citation.journal_id_ISSN +citation.journal_id_CSD +citation.book_publisher +citation.pdbx_database_id_PubMed +citation.pdbx_database_id_DOI + +citation_author.citation_id +citation_author.name +citation_author.ordinal + +exptl.entry_id +exptl.method + +struct.entry_id +struct.title +struct.pdbx_descriptor + 
+struct_asym.id +struct_asym.pdbx_blank_PDB_chainid_flag +struct_asym.pdbx_modified +struct_asym.entity_id +struct_asym.details + +struct_conf.conf_type_id +struct_conf.id +struct_conf.pdbx_PDB_helix_id +struct_conf.beg_label_comp_id +struct_conf.beg_label_asym_id +struct_conf.beg_label_seq_id +struct_conf.pdbx_beg_PDB_ins_code +struct_conf.end_label_comp_id +struct_conf.end_label_asym_id +struct_conf.end_label_seq_id +struct_conf.pdbx_end_PDB_ins_code +struct_conf.beg_auth_comp_id +struct_conf.beg_auth_asym_id +struct_conf.beg_auth_seq_id +struct_conf.end_auth_comp_id +struct_conf.end_auth_asym_id +struct_conf.end_auth_seq_id +struct_conf.pdbx_PDB_helix_class +struct_conf.details +struct_conf.pdbx_PDB_helix_length + +struct_conn.id +struct_conn.conn_type_id +struct_conn.pdbx_PDB_id +struct_conn.ptnr1_label_asym_id +struct_conn.ptnr1_label_comp_id +struct_conn.ptnr1_label_seq_id +struct_conn.ptnr1_label_atom_id +struct_conn.pdbx_ptnr1_label_alt_id +struct_conn.pdbx_ptnr1_PDB_ins_code +struct_conn.pdbx_ptnr1_standard_comp_id +struct_conn.ptnr1_symmetry +struct_conn.ptnr2_label_asym_id +struct_conn.ptnr2_label_comp_id +struct_conn.ptnr2_label_seq_id +struct_conn.ptnr2_label_atom_id +struct_conn.pdbx_ptnr2_label_alt_id +struct_conn.pdbx_ptnr2_PDB_ins_code +struct_conn.ptnr1_auth_asym_id +struct_conn.ptnr1_auth_comp_id +struct_conn.ptnr1_auth_seq_id +struct_conn.ptnr2_auth_asym_id +struct_conn.ptnr2_auth_comp_id +struct_conn.ptnr2_auth_seq_id +struct_conn.ptnr2_symmetry +struct_conn.pdbx_ptnr3_label_atom_id +struct_conn.pdbx_ptnr3_label_seq_id +struct_conn.pdbx_ptnr3_label_comp_id +struct_conn.pdbx_ptnr3_label_asym_id +struct_conn.pdbx_ptnr3_label_alt_id +struct_conn.pdbx_ptnr3_PDB_ins_code +struct_conn.details +struct_conn.pdbx_dist_value +struct_conn.pdbx_value_order + +struct_conn_type.id +struct_conn_type.criteria +struct_conn_type.reference + +struct_keywords.entry_id +struct_keywords.pdbx_keywords +struct_keywords.text + +struct_ncs_oper.id +struct_ncs_oper.code 
+struct_ncs_oper.matrix +struct_ncs_oper.vector +struct_ncs_oper.details + +struct_sheet_range.sheet_id +struct_sheet_range.id +struct_sheet_range.beg_label_comp_id +struct_sheet_range.beg_label_asym_id +struct_sheet_range.beg_label_seq_id +struct_sheet_range.pdbx_beg_PDB_ins_code +struct_sheet_range.end_label_comp_id +struct_sheet_range.end_label_asym_id +struct_sheet_range.end_label_seq_id +struct_sheet_range.pdbx_end_PDB_ins_code +struct_sheet_range.beg_auth_comp_id +struct_sheet_range.beg_auth_asym_id +struct_sheet_range.beg_auth_seq_id +struct_sheet_range.end_auth_comp_id +struct_sheet_range.end_auth_asym_id +struct_sheet_range.end_auth_seq_id + +struct_site.id +struct_site.pdbx_evidence_code +struct_site.pdbx_auth_asym_id +struct_site.pdbx_auth_comp_id +struct_site.pdbx_auth_seq_id +struct_site.pdbx_auth_ins_code +struct_site.pdbx_num_residues +struct_site.details + +struct_site_gen.id +struct_site_gen.site_id +struct_site_gen.pdbx_num_res +struct_site_gen.label_comp_id +struct_site_gen.label_asym_id +struct_site_gen.label_seq_id +struct_site_gen.pdbx_auth_ins_code +struct_site_gen.auth_comp_id +struct_site_gen.auth_asym_id +struct_site_gen.auth_seq_id +struct_site_gen.label_atom_id +struct_site_gen.label_alt_id +struct_site_gen.symmetry +struct_site_gen.details + +symmetry.entry_id +symmetry.cell_setting +symmetry.Int_Tables_number +symmetry.space_group_name_Hall +symmetry.space_group_name_H-M + +pdbx_molecule.instance_id +pdbx_molecule.prd_id +pdbx_molecule.asym_id + +pdbx_molecule_features.prd_id +pdbx_molecule_features.name +pdbx_molecule_features.type +pdbx_molecule_features.class +pdbx_molecule_features.details + +pdbx_reference_entity_link.prd_id +pdbx_reference_entity_link.link_id +pdbx_reference_entity_link.link_class +pdbx_reference_entity_link.ref_entity_id_1 +pdbx_reference_entity_link.entity_seq_num_1 +pdbx_reference_entity_link.comp_id_1 +pdbx_reference_entity_link.atom_id_1 +pdbx_reference_entity_link.ref_entity_id_2 
+pdbx_reference_entity_link.entity_seq_num_2 +pdbx_reference_entity_link.comp_id_2 +pdbx_reference_entity_link.atom_id_2 +pdbx_reference_entity_link.value_order +pdbx_reference_entity_link.component_1 +pdbx_reference_entity_link.component_2 +pdbx_reference_entity_link.details + +pdbx_reference_entity_list.prd_id +pdbx_reference_entity_list.ref_entity_id +pdbx_reference_entity_list.component_id +pdbx_reference_entity_list.type +pdbx_reference_entity_list.details + +pdbx_reference_entity_poly_link.prd_id +pdbx_reference_entity_poly_link.ref_entity_id +pdbx_reference_entity_poly_link.link_id +pdbx_reference_entity_poly_link.atom_id_1 +pdbx_reference_entity_poly_link.comp_id_1 +pdbx_reference_entity_poly_link.entity_seq_num_1 +pdbx_reference_entity_poly_link.atom_id_2 +pdbx_reference_entity_poly_link.comp_id_2 +pdbx_reference_entity_poly_link.entity_seq_num_2 +pdbx_reference_entity_poly_link.value_order +pdbx_reference_entity_poly_link.component_id + +pdbx_struct_assembly.id +pdbx_struct_assembly.details +pdbx_struct_assembly.method_details +pdbx_struct_assembly.oligomeric_details +pdbx_struct_assembly.oligomeric_count + +pdbx_struct_assembly_gen.assembly_id +pdbx_struct_assembly_gen.oper_expression +pdbx_struct_assembly_gen.asym_id_list + +pdbx_struct_oper_list.id +pdbx_struct_oper_list.type +pdbx_struct_oper_list.name +pdbx_struct_oper_list.symmetry_operation +pdbx_struct_oper_list.matrix +pdbx_struct_oper_list.vector + +pdbx_struct_mod_residue.id +pdbx_struct_mod_residue.label_asym_id +pdbx_struct_mod_residue.label_seq_id +pdbx_struct_mod_residue.label_comp_id +pdbx_struct_mod_residue.auth_asym_id +pdbx_struct_mod_residue.auth_seq_id +pdbx_struct_mod_residue.auth_comp_id +pdbx_struct_mod_residue.PDB_ins_code +pdbx_struct_mod_residue.parent_comp_id +pdbx_struct_mod_residue.details + +pdbx_unobs_or_zero_occ_residues.id +pdbx_unobs_or_zero_occ_residues.PDB_model_num +pdbx_unobs_or_zero_occ_residues.polymer_flag +pdbx_unobs_or_zero_occ_residues.occupancy_flag 
+pdbx_unobs_or_zero_occ_residues.auth_asym_id +pdbx_unobs_or_zero_occ_residues.auth_comp_id +pdbx_unobs_or_zero_occ_residues.auth_seq_id +pdbx_unobs_or_zero_occ_residues.PDB_ins_code +pdbx_unobs_or_zero_occ_residues.label_asym_id +pdbx_unobs_or_zero_occ_residues.label_comp_id +pdbx_unobs_or_zero_occ_residues.label_seq_id + +ihm_struct_assembly.id +ihm_struct_assembly.name +ihm_struct_assembly.description + +ihm_struct_assembly_details.id +ihm_struct_assembly_details.assembly_id +ihm_struct_assembly_details.parent_assembly_id +ihm_struct_assembly_details.entity_description +ihm_struct_assembly_details.entity_id +ihm_struct_assembly_details.asym_id +ihm_struct_assembly_details.entity_poly_segment_id + +ihm_model_representation.id +ihm_model_representation.name +ihm_model_representation.details + +ihm_model_representation_details.id +ihm_model_representation_details.representation_id +ihm_model_representation_details.entity_id +ihm_model_representation_details.entity_description +ihm_model_representation_details.entity_asym_id +ihm_model_representation_details.entity_poly_segment_id +ihm_model_representation_details.model_object_primitive +ihm_model_representation_details.starting_model_id +ihm_model_representation_details.model_mode +ihm_model_representation_details.model_granularity +ihm_model_representation_details.model_object_count + +ihm_external_reference_info.reference_id +ihm_external_reference_info.reference_provider +ihm_external_reference_info.reference_type +ihm_external_reference_info.reference +ihm_external_reference_info.refers_to +ihm_external_reference_info.associated_url + +ihm_external_files.id +ihm_external_files.reference_id +ihm_external_files.file_path +ihm_external_files.content_type +ihm_external_files.file_size_bytes +ihm_external_files.details + +ihm_dataset_list.id +ihm_dataset_list.data_type +ihm_dataset_list.database_hosted + +ihm_dataset_group.id +ihm_dataset_group.name +ihm_dataset_group.application +ihm_dataset_group.details + 
+ihm_dataset_group_link.group_id +ihm_dataset_group_link.dataset_list_id + +ihm_dataset_external_reference.id +ihm_dataset_external_reference.dataset_list_id +ihm_dataset_external_reference.file_id + +ihm_dataset_related_db_reference.id +ihm_dataset_related_db_reference.dataset_list_id +ihm_dataset_related_db_reference.db_name +ihm_dataset_related_db_reference.accession_code +ihm_dataset_related_db_reference.version +ihm_dataset_related_db_reference.details + +ihm_related_datasets.dataset_list_id_derived +ihm_related_datasets.dataset_list_id_primary + +ihm_poly_residue_feature.ordinal_id +ihm_poly_residue_feature.feature_id +ihm_poly_residue_feature.entity_id +ihm_poly_residue_feature.asym_id +ihm_poly_residue_feature.seq_id_begin +ihm_poly_residue_feature.comp_id_begin +ihm_poly_residue_feature.seq_id_end +ihm_poly_residue_feature.comp_id_end + +ihm_feature_list.feature_id +ihm_feature_list.feature_type +ihm_feature_list.entity_type + +ihm_cross_link_list.id +ihm_cross_link_list.group_id +ihm_cross_link_list.entity_description_1 +ihm_cross_link_list.entity_id_1 +ihm_cross_link_list.seq_id_1 +ihm_cross_link_list.comp_id_1 +ihm_cross_link_list.entity_description_2 +ihm_cross_link_list.entity_id_2 +ihm_cross_link_list.seq_id_2 +ihm_cross_link_list.comp_id_2 +ihm_cross_link_list.linker_type +ihm_cross_link_list.dataset_list_id + +ihm_cross_link_restraint.id +ihm_cross_link_restraint.group_id +ihm_cross_link_restraint.entity_id_1 +ihm_cross_link_restraint.asym_id_1 +ihm_cross_link_restraint.seq_id_1 +ihm_cross_link_restraint.atom_id_1 +ihm_cross_link_restraint.comp_id_1 +ihm_cross_link_restraint.entity_id_2 +ihm_cross_link_restraint.asym_id_2 +ihm_cross_link_restraint.seq_id_2 +ihm_cross_link_restraint.atom_id_2 +ihm_cross_link_restraint.comp_id_2 +ihm_cross_link_restraint.restraint_type +ihm_cross_link_restraint.conditional_crosslink_flag +ihm_cross_link_restraint.model_granularity +ihm_cross_link_restraint.distance_threshold +ihm_cross_link_restraint.psi 
+ihm_cross_link_restraint.sigma_1 +ihm_cross_link_restraint.sigma_2 + +ihm_cross_link_result_parameters.id +ihm_cross_link_result_parameters.restraint_id +ihm_cross_link_result_parameters.model_id +ihm_cross_link_result_parameters.psi +ihm_cross_link_result_parameters.sigma_1 +ihm_cross_link_result_parameters.sigma_2 + +ihm_sas_restraint.id +ihm_sas_restraint.dataset_list_id +ihm_sas_restraint.model_id +ihm_sas_restraint.struct_assembly_id +ihm_sas_restraint.profile_segment_flag +ihm_sas_restraint.fitting_atom_type +ihm_sas_restraint.fitting_method +ihm_sas_restraint.fitting_state +ihm_sas_restraint.radius_of_gyration +ihm_sas_restraint.chi_value +ihm_sas_restraint.details + +ihm_derived_distance_restraint.id +ihm_derived_distance_restraint.group_id +ihm_derived_distance_restraint.feature_id_1 +ihm_derived_distance_restraint.feature_id_2 +ihm_derived_distance_restraint.group_conditionality +ihm_derived_distance_restraint.restraint_type +ihm_derived_distance_restraint.distance_upper_limit +ihm_derived_distance_restraint.random_exclusion_fraction +ihm_derived_distance_restraint.dataset_list_id + +ihm_2dem_class_average_restraint.id +ihm_2dem_class_average_restraint.dataset_list_id +ihm_2dem_class_average_restraint.number_raw_micrographs +ihm_2dem_class_average_restraint.pixel_size_width +ihm_2dem_class_average_restraint.pixel_size_height +ihm_2dem_class_average_restraint.image_resolution +ihm_2dem_class_average_restraint.image_segment_flag +ihm_2dem_class_average_restraint.number_of_projections +ihm_2dem_class_average_restraint.struct_assembly_id +ihm_2dem_class_average_restraint.details + +ihm_2dem_class_average_fitting.id +ihm_2dem_class_average_fitting.restraint_id +ihm_2dem_class_average_fitting.model_id +ihm_2dem_class_average_fitting.cross_correlation_coefficient +ihm_2dem_class_average_fitting.rot_matrix +ihm_2dem_class_average_fitting.tr_vector + +ihm_3dem_restraint.id +ihm_3dem_restraint.dataset_list_id +ihm_3dem_restraint.fitting_method 
+ihm_3dem_restraint.struct_assembly_id +ihm_3dem_restraint.number_of_gaussians +ihm_3dem_restraint.model_id +ihm_3dem_restraint.cross_correlation_coefficient + +ihm_predicted_contact_restraint.id +ihm_predicted_contact_restraint.group_id +ihm_predicted_contact_restraint.entity_id_1 +ihm_predicted_contact_restraint.asym_id_1 +ihm_predicted_contact_restraint.seq_id_1 +ihm_predicted_contact_restraint.comp_id_1 +ihm_predicted_contact_restraint.rep_atom_1 +ihm_predicted_contact_restraint.entity_id_2 +ihm_predicted_contact_restraint.asym_id_2 +ihm_predicted_contact_restraint.seq_id_2 +ihm_predicted_contact_restraint.comp_id_2 +ihm_predicted_contact_restraint.rep_atom_2 +ihm_predicted_contact_restraint.restraint_type +ihm_predicted_contact_restraint.distance_lower_limit +ihm_predicted_contact_restraint.distance_upper_limit +ihm_predicted_contact_restraint.probability +ihm_predicted_contact_restraint.model_granularity +ihm_predicted_contact_restraint.dataset_list_id +ihm_predicted_contact_restraint.software_id + +ihm_starting_model_details.starting_model_id +ihm_starting_model_details.entity_id +ihm_starting_model_details.entity_description +ihm_starting_model_details.asym_id +ihm_starting_model_details.entity_poly_segment_id +ihm_starting_model_details.starting_model_source +ihm_starting_model_details.starting_model_auth_asym_id +ihm_starting_model_details.starting_model_sequence_offset +ihm_starting_model_details.dataset_list_id + +ihm_starting_comparative_models.id +ihm_starting_comparative_models.starting_model_id +ihm_starting_comparative_models.starting_model_auth_asym_id +ihm_starting_comparative_models.starting_model_seq_id_begin +ihm_starting_comparative_models.starting_model_seq_id_end +ihm_starting_comparative_models.template_auth_asym_id +ihm_starting_comparative_models.template_seq_id_begin +ihm_starting_comparative_models.template_seq_id_end +ihm_starting_comparative_models.template_sequence_identity 
+ihm_starting_comparative_models.template_sequence_identity_denominator +ihm_starting_comparative_models.template_dataset_list_id +ihm_starting_comparative_models.alignment_file_id + +ihm_starting_model_coord.starting_model_id +ihm_starting_model_coord.group_PDB +ihm_starting_model_coord.id +ihm_starting_model_coord.type_symbol +ihm_starting_model_coord.atom_id +ihm_starting_model_coord.comp_id +ihm_starting_model_coord.entity_id +ihm_starting_model_coord.asym_id +ihm_starting_model_coord.seq_id +ihm_starting_model_coord.Cartn_x +ihm_starting_model_coord.Cartn_y +ihm_starting_model_coord.Cartn_z +ihm_starting_model_coord.B_iso_or_equiv +ihm_starting_model_coord.ordinal_id + +ihm_starting_model_seq_dif.id +ihm_starting_model_seq_dif.entity_id +ihm_starting_model_seq_dif.asym_id +ihm_starting_model_seq_dif.seq_id +ihm_starting_model_seq_dif.comp_id +ihm_starting_model_seq_dif.starting_model_id +ihm_starting_model_seq_dif.db_asym_id +ihm_starting_model_seq_dif.db_seq_id +ihm_starting_model_seq_dif.db_comp_id +ihm_starting_model_seq_dif.details + +ihm_modeling_protocol.id +ihm_modeling_protocol.protocol_name +ihm_modeling_protocol.num_steps + +ihm_modeling_protocol_details.id +ihm_modeling_protocol_details.protocol_id +ihm_modeling_protocol_details.step_id +ihm_modeling_protocol_details.struct_assembly_id +ihm_modeling_protocol_details.dataset_group_id +ihm_modeling_protocol_details.struct_assembly_description +ihm_modeling_protocol_details.step_name +ihm_modeling_protocol_details.step_method +ihm_modeling_protocol_details.num_models_begin +ihm_modeling_protocol_details.num_models_end +ihm_modeling_protocol_details.multi_scale_flag +ihm_modeling_protocol_details.multi_state_flag +ihm_modeling_protocol_details.ordered_flag +ihm_modeling_protocol_details.software_id +ihm_modeling_protocol_details.script_file_id + +ihm_modeling_post_process.id +ihm_modeling_post_process.protocol_id +ihm_modeling_post_process.analysis_id +ihm_modeling_post_process.step_id 
+ihm_modeling_post_process.type +ihm_modeling_post_process.feature +ihm_modeling_post_process.num_models_begin +ihm_modeling_post_process.num_models_end + +ihm_ensemble_info.ensemble_id +ihm_ensemble_info.ensemble_name +ihm_ensemble_info.post_process_id +ihm_ensemble_info.model_group_id +ihm_ensemble_info.ensemble_clustering_method +ihm_ensemble_info.ensemble_clustering_feature +ihm_ensemble_info.num_ensemble_models +ihm_ensemble_info.num_ensemble_models_deposited +ihm_ensemble_info.ensemble_precision_value +ihm_ensemble_info.ensemble_file_id + +ihm_localization_density_files.id +ihm_localization_density_files.file_id +ihm_localization_density_files.ensemble_id +ihm_localization_density_files.entity_id +ihm_localization_density_files.asym_id +ihm_localization_density_files.entity_poly_segment_id + +ihm_model_list.model_id +ihm_model_list.model_name +ihm_model_list.assembly_id +ihm_model_list.protocol_id +ihm_model_list.representation_id + +ihm_model_group.id +ihm_model_group.name +ihm_model_group.details + +ihm_model_group_link.group_id +ihm_model_group_link.model_id + +ihm_model_representative.id +ihm_model_representative.model_group_id +ihm_model_representative.model_id +ihm_model_representative.selection_criteria + +ihm_sphere_obj_site.id +ihm_sphere_obj_site.entity_id +ihm_sphere_obj_site.seq_id_begin +ihm_sphere_obj_site.seq_id_end +ihm_sphere_obj_site.asym_id +ihm_sphere_obj_site.Cartn_x +ihm_sphere_obj_site.Cartn_y +ihm_sphere_obj_site.Cartn_z +ihm_sphere_obj_site.object_radius +ihm_sphere_obj_site.rmsf +ihm_sphere_obj_site.model_id + +ihm_gaussian_obj_site.id +ihm_gaussian_obj_site.entity_id +ihm_gaussian_obj_site.seq_id_begin +ihm_gaussian_obj_site.seq_id_end +ihm_gaussian_obj_site.asym_id +ihm_gaussian_obj_site.mean_Cartn_x +ihm_gaussian_obj_site.mean_Cartn_y +ihm_gaussian_obj_site.mean_Cartn_z +ihm_gaussian_obj_site.weight +ihm_gaussian_obj_site.covariance_matrix +ihm_gaussian_obj_site.model_id + +ihm_gaussian_obj_ensemble.id 
+ihm_gaussian_obj_ensemble.entity_id +ihm_gaussian_obj_ensemble.seq_id_begin +ihm_gaussian_obj_ensemble.seq_id_end +ihm_gaussian_obj_ensemble.asym_id +ihm_gaussian_obj_ensemble.mean_Cartn_x +ihm_gaussian_obj_ensemble.mean_Cartn_y +ihm_gaussian_obj_ensemble.mean_Cartn_z +ihm_gaussian_obj_ensemble.weight +ihm_gaussian_obj_ensemble.covariance_matrix +ihm_gaussian_obj_ensemble.ensemble_id + +ihm_multi_state_modeling.state_id +ihm_multi_state_modeling.state_group_id +ihm_multi_state_modeling.population_fraction +ihm_multi_state_modeling.population_fraction_sd +ihm_multi_state_modeling.state_type +ihm_multi_state_modeling.state_name +ihm_multi_state_modeling.experiment_type +ihm_multi_state_modeling.details \ No newline at end of file diff --git a/data/cif-field-names/mmtf-filter.csv b/data/cif-field-names/mmtf-filter.csv new file mode 100644 index 0000000000000000000000000000000000000000..5a9b2bf0f7d216debfe761f89513d0a2d7d09074 --- /dev/null +++ b/data/cif-field-names/mmtf-filter.csv @@ -0,0 +1,76 @@ +cell.length_a +cell.length_b +cell.length_c +cell.angle_alpha +cell.angle_beta +cell.angle_gamma + +symmetry.space_group_name_H-M + +entry.id + +struct.title + +pdbx_database_status.recvd_initial_deposition_date + +pdbx_audit_revision_history.revision_date + +struct_ncs_oper + +pdbx_struct_assembly_gen + +pdbx_struct_oper_list + +entity.id +entity.type +entity.pdbx_description + +entity_poly.entity_id +entity_poly.pdbx_seq_one_letter_code +entity_poly.pdbx_strand_id + +exptl.method + +refine.ls_d_res_low +refine.ls_R_factor_R_free +refine.ls_R_factor_R_work + +atom_site.pdbx_formal_charge +atom_site.label_atom_id +atom_site.type_symbol + +chem_comp.id +chem_comp.type +chem_comp.name + +chem_comp_bond + +atom_site.Cartn_x +atom_site.Cartn_y +atom_site.Cartn_z +atom_site.B_iso_or_equiv +atom_site.id +atom_site.label_alt_id +atom_site.occupancy +atom_site.label_seq_id +atom_site.label_comp_id + +struct_sheet_range.id +struct_sheet_range.beg_label_asym_id 
+struct_sheet_range.beg_label_seq_id +struct_sheet_range.pdbx_beg_PDB_ins_code +struct_sheet_range.end_label_asym_id +struct_sheet_range.end_label_seq_id +struct_sheet_range.pdbx_end_PDB_ins_code +struct_conf.conf_type_id +struct_conf.id +struct_conf.beg_label_asym_id +struct_conf.beg_label_seq_id +struct_conf.pdbx_beg_PDB_ins_code +struct_conf.end_label_asym_id +struct_conf.end_label_seq_id +struct_conf.pdbx_end_PDB_ins_code + +atom_site.pdbx_PDB_ins_code +atom_site.label_asym_id +atom_site.auth_asym_id \ No newline at end of file diff --git a/package.json b/package.json index 0730fe89e05d7348caff149e6e23345cfb3194c2..34f59326c9d625d3b5eb83ea8e5da411d337b021 100644 --- a/package.json +++ b/package.json @@ -38,6 +38,8 @@ "lib/" ], "bin": { + "cif2bcif": "lib/apps/cif2bcif/index.js", + "cifschema": "lib/apps/cifschema/index.js", "model-server": "lib/servers/model/server.js", "model-server-query": "lib/servers/model/local.js", "model-server-preprocess": "lib/servers/model/preprocess.js", diff --git a/src/apps/cif2bcif/converter.ts b/src/apps/cif2bcif/converter.ts new file mode 100644 index 0000000000000000000000000000000000000000..b1cf3308971ad3fdf41782e923933df6d515d605 --- /dev/null +++ b/src/apps/cif2bcif/converter.ts @@ -0,0 +1,119 @@ +/** + * Copyright (c) 2017 mol* contributors, licensed under MIT, See LICENSE file for more info. 
+ *
+ * @author David Sehnal <david.sehnal@gmail.com>
+ * @author Sebastian Bittrich <sebastian.bittrich@rcsb.org>
+ */
+
+import { CIF, CifCategory, getCifFieldType, CifField, CifFile } from '../../mol-io/reader/cif'
+import { CifWriter, EncodingStrategyHint } from '../../mol-io/writer/cif'
+import * as util from 'util'
+import * as fs from 'fs'
+import * as zlib from 'zlib'
+import { Progress, Task, RuntimeContext } from '../../mol-task';
+import { classifyFloatArray, classifyIntArray } from '../../mol-io/common/binary-cif';
+import { BinaryEncodingProvider } from '../../mol-io/writer/cif/encoder/binary';
+import { Category } from '../../mol-io/writer/cif/encoder';
+import { ReaderResult } from '../../mol-io/reader/result';
+
+/**
+ * Progress observer passed to the conversion task: blanks the current
+ * terminal line with spaces and rewrites it with the formatted progress.
+ */
+function showProgress(p: Progress) {
+    process.stdout.write(`\r${new Array(80).join(' ')}`);
+    process.stdout.write(`\r${Progress.format(p)}`);
+}
+
+const readFileAsync = util.promisify(fs.readFile);
+const unzipAsync = util.promisify<zlib.InputType, Buffer>(zlib.unzip);
+
+/**
+ * Read and parse a CIF file, choosing the parser from the file name.
+ *
+ * A name ending in `.bcif` or `.bcif.gz` is parsed as BinaryCIF, anything
+ * else as text CIF. A trailing `.gz` makes the content go through `unzip`
+ * first. Both extension checks are case-insensitive.
+ *
+ * @param ctx runtime context the parser task is run in
+ * @param filename path of the file to read
+ * @returns the parser result (either a parsed file or an error result)
+ */
+async function readFile(ctx: RuntimeContext, filename: string): Promise<ReaderResult<CifFile>> {
+    const isGz = /\.gz$/i.test(filename);
+    // Anchor the check to the extension so that a path which merely contains
+    // ".bcif" somewhere (e.g. in a directory name) is not parsed as binary.
+    const isBinary = /\.bcif(\.gz)?$/i.test(filename);
+    if (isBinary) {
+        let input = await readFileAsync(filename)
+        if (isGz) input = await unzipAsync(input);
+        return await CIF.parseBinary(new Uint8Array(input)).runInContext(ctx);
+    } else {
+        let str: string;
+        if (isGz) {
+            const data = await unzipAsync(await readFileAsync(filename));
+            str = data.toString('utf8');
+        } else {
+            str = await readFileAsync(filename, 'utf8');
+        }
+        return await CIF.parseText(str).runInContext(ctx);
+    }
+}
+
+/**
+ * Parse `filename` and return the parsed file.
+ *
+ * @throws Error carrying the string form of the parser result when parsing failed
+ */
+async function getCIF(ctx: RuntimeContext, filename: string) {
+    const parsed = await readFile(ctx, filename);
+    if (parsed.isError) {
+        throw new Error(parsed.toString());
+    }
+    return parsed.result;
+}
+
+/**
+ * Wrap a parsed category and its writer fields into a writer category whose
+ * instance is created lazily, using the category itself as row data.
+ */
+function getCategoryInstanceProvider(cat: CifCategory, fields: CifWriter.Field[]): CifWriter.Category {
+    return {
+        name: cat.name,
+        instance: () => CifWriter.categoryInstance(fields, { data: cat, rowCount: cat.rowCount })
+    };
+}
+
+/**
+ * Build the writer field for one parsed field.
+ *
+ * The value type reported by `getCifFieldType` selects the writer field:
+ * 'str' yields a string field, 'float' a Float64Array-backed field and
+ * everything else an Int32Array-backed field. For the numeric cases the
+ * encoder is chosen by classifying the field's actual values.
+ */
+function classify(name: string, field: CifField): CifWriter.Field {
+    const type = getCifFieldType(field);
+    if (type['@type'] === 'str') {
+        return { name, type: CifWriter.Field.Type.Str, value: field.str, valueKind: field.valueKind };
+    } else if (type['@type'] === 'float') {
+        const encoder = classifyFloatArray(field.toFloatArray({ array: Float64Array }));
+        return CifWriter.Field.float(name, field.float, { valueKind: field.valueKind, encoder, typedArray: Float64Array });
+    } else {
+        const encoder = classifyIntArray(field.toIntArray({ array: Int32Array }));
+        return CifWriter.Field.int(name, field.int, { valueKind: field.valueKind, encoder, typedArray: Int32Array });
+    }
+}
+
+/**
+ * Re-encode the CIF file at `path`.
+ *
+ * @param path source file; see `readFile` for how the input format is detected
+ * @param asText when true the encoder is created with `binary: false`
+ * @param hints optional encoding strategy hints; without them an encoding
+ *     provider that never returns an encoding is used
+ * @param filter optional category filter text handed to `Category.filterOf`
+ * @returns promise of the encoder output; progress is reported through
+ *     `showProgress` while the task runs
+ */
+export default function convert(path: string, asText = false, hints?: EncodingStrategyHint[], filter?: string) {
+    return Task.create<Uint8Array>('BinaryCIF', async ctx => {
+        const encodingProvider: BinaryEncodingProvider = hints
+            ? CifWriter.createEncodingProviderFromJsonConfig(hints)
+            : { get: (c, f) => void 0 };
+        const cif = await getCIF(ctx, path);
+
+        const encoder = CifWriter.createEncoder({
+            binary: !asText,
+            encoderName: 'mol*/ciftools cif2bcif',
+            binaryAutoClassifyEncoding: true,
+            binaryEncodingPovider: encodingProvider
+        });
+
+        if (filter) {
+            encoder.setFilter(Category.filterOf(filter));
+        }
+
+        // One progress step per category plus one per field.
+        let maxProgress = 0;
+        for (const b of cif.blocks) {
+            maxProgress += b.categoryNames.length;
+            for (const c of b.categoryNames) maxProgress += b.categories[c].fieldNames.length;
+        }
+
+        let current = 0;
+        for (const b of cif.blocks) {
+            encoder.startDataBlock(b.header);
+            for (const c of b.categoryNames) {
+                const cat = b.categories[c];
+                const fields: CifWriter.Field[] = [];
+                for (const f of cat.fieldNames) {
+                    fields.push(classify(f, cat.getField(f)!))
+                    current++;
+                    if (ctx.shouldUpdate) await ctx.update({ message: 'Encoding...', current, max: maxProgress });
+                }
+
+                encoder.writeCategory(getCategoryInstanceProvider(cat, fields));
+                current++;
+                if (ctx.shouldUpdate) await ctx.update({ message: 'Encoding...', current, max: maxProgress });
+            }
+        }
+        await ctx.update('Exporting...');
+        const ret = encoder.getData() as Uint8Array;
+        await ctx.update('Done.\n');
+        return ret;
+    }).run(showProgress, 250);
+}
\ No newline at end of file
diff --git a/src/apps/cif2bcif/index.ts b/src/apps/cif2bcif/index.ts
new file mode 100644
index 0000000000000000000000000000000000000000..e6987c9cf51da9e1839b1f7d6f0d6fefd42176ce
--- /dev/null
+++ b/src/apps/cif2bcif/index.ts
@@ -0,0 +1,67 @@
+/**
+ * Copyright (c) 2017-2019 mol* contributors, licensed under MIT, See LICENSE file for more info.
+ *
+ * @author David Sehnal <david.sehnal@gmail.com>
+ * @author Alexander Rose <alexander.rose@weirdbyte.de>
+ */
+
+import * as argparse from 'argparse'
+import * as util from 'util'
+import * as fs from 'fs'
+import * as zlib from 'zlib'
+import convert from './converter'
+
+require('util.promisify').shim();
+
+/**
+ * Convert `srcPath` and write the result to `outPath`.
+ *
+ * Deliberately not named `process`: a module-level function of that name
+ * would hide Node's global `process` object for the whole file.
+ *
+ * @param srcPath source CIF path
+ * @param outPath destination path (see `write` for gzip handling)
+ * @param configPath optional JSON file whose parsed content is passed to the converter as encoding hints
+ * @param filterPath optional file whose text is passed to the converter as the category filter
+ */
+async function convertFile(srcPath: string, outPath: string, configPath?: string, filterPath?: string) {
+    const config = configPath ? JSON.parse(fs.readFileSync(configPath, 'utf8')) : void 0;
+    const filter = filterPath ? fs.readFileSync(filterPath, 'utf8') : void 0;
+
+    const res = await convert(srcPath, false, config, filter);
+    await write(outPath, res);
+}
+
+const zipAsync = util.promisify<zlib.InputType, Buffer>(zlib.gzip);
+
+/**
+ * Write `res` to `outPath`, gzip-compressing it first when the path ends
+ * in `.gz` (case-insensitive).
+ */
+async function write(outPath: string, res: Uint8Array) {
+    const isGz = /\.gz$/i.test(outPath);
+    if (isGz) {
+        res = await zipAsync(res);
+    }
+    fs.writeFileSync(outPath, res);
+}
+
+/**
+ * Command line entry point: start the conversion described by `args`.
+ * A failure is printed and reflected in the exit status instead of being
+ * left as an unhandled promise rejection.
+ */
+function run(args: Args) {
+    convertFile(args.src, args.out, args.config, args.filter).catch(e => {
+        console.error(e)
+        process.exitCode = 1
+    })
+}
+
+const parser = new argparse.ArgumentParser({
+    addHelp: true,
+    description: 'Convert any CIF file to a BCIF file'
+});
+parser.addArgument([ 'src' ], {
+    help: 'Source CIF path'
+});
+parser.addArgument([ 'out' ], {
+    help: 'Output BCIF path'
+});
+parser.addArgument([ '-c', '--config' ], {
+    help: 'Optional encoding strategy/precision config path',
+    required: false
+});
+parser.addArgument([ '-f', '--filter' ], {
+    help: 'Optional filter whitelist/blacklist path',
+    required: false
+});
+
+/** Parsed command line arguments. */
+interface Args {
+    src: string
+    out: string
+    config?: string
+    filter?: string
+}
+const args: Args = parser.parseArgs();
+
+if (args) {
+    run(args)
+}
\ No newline at end of file
diff --git a/src/apps/cifschema/index.ts b/src/apps/cifschema/index.ts
new file mode 100644
index 0000000000000000000000000000000000000000..38602d919e917f21b660a37f6171afc65008f6e4
--- /dev/null
+++ b/src/apps/cifschema/index.ts
@@ -0,0 +1,274 @@
+/**
+ * Copyright (c) 2017-2020 mol*
contributors, licensed under MIT, See LICENSE file for more info.
+ *
+ * @author Alexander Rose <alexander.rose@weirdbyte.de>
+ */
+
+import * as argparse from 'argparse'
+import * as fs from 'fs'
+import * as path from 'path'
+import fetch from 'node-fetch'
+
+import { parseCsv } from '../../mol-io/reader/csv/parser'
+import { CifFrame, CifBlock } from '../../mol-io/reader/cif'
+import parseText from '../../mol-io/reader/cif/text/parser'
+import { generateSchema } from './util/cif-dic'
+import { generate } from './util/generate'
+import { Filter, Database } from './util/schema'
+import { parseImportGet } from './util/helper'
+
+/** Read the first `version` value of the block's `dictionary` category. */
+function getDicVersion(block: CifBlock) {
+    return block.categories.dictionary.getField('version')!.str(0)
+}
+
+/** Read the first `namespace` value of the block's `dictionary` category. */
+function getDicNamespace(block: CifBlock) {
+    return block.categories.dictionary.getField('namespace')!.str(0)
+}
+
+/**
+ * Generate a schema from the mmCIF dictionary combined with the IHM and the
+ * two carbohydrate extension dictionaries. Each dictionary is downloaded
+ * first when it is not available locally; the save frames of the first block
+ * of every dictionary are merged into one schema.
+ *
+ * @throws the parser result of the first dictionary that fails to parse
+ */
+async function runGenerateSchemaMmcif(name: string, fieldNamesPath: string, typescript = false, out: string, moldbImportPath: string, addAliases: boolean) {
+    await ensureMmcifDicAvailable()
+    const mmcifDic = await parseText(fs.readFileSync(MMCIF_DIC_PATH, 'utf8')).run();
+    if (mmcifDic.isError) throw mmcifDic
+
+    await ensureIhmDicAvailable()
+    const ihmDic = await parseText(fs.readFileSync(IHM_DIC_PATH, 'utf8')).run();
+    if (ihmDic.isError) throw ihmDic
+
+    await ensureCarbBranchDicAvailable()
+    const carbBranchDic = await parseText(fs.readFileSync(CARB_BRANCH_DIC_PATH, 'utf8')).run();
+    if (carbBranchDic.isError) throw carbBranchDic
+
+    await ensureCarbCompDicAvailable()
+    const carbCompDic = await parseText(fs.readFileSync(CARB_COMP_DIC_PATH, 'utf8')).run();
+    if (carbCompDic.isError) throw carbCompDic
+
+    const mmcifDicVersion = getDicVersion(mmcifDic.result.blocks[0])
+    const ihmDicVersion = getDicVersion(ihmDic.result.blocks[0])
+    // the carbohydrate dictionaries are reported with a fixed version label
+    const carbDicVersion = 'draft'
+    const version = `Dictionary versions: mmCIF ${mmcifDicVersion}, IHM ${ihmDicVersion}, CARB ${carbDicVersion}.`
+
+    const frames: CifFrame[] = [...mmcifDic.result.blocks[0].saveFrames, ...ihmDic.result.blocks[0].saveFrames, ...carbBranchDic.result.blocks[0].saveFrames, ...carbCompDic.result.blocks[0].saveFrames]
+    const schema = generateSchema(frames)
+
+    await runGenerateSchema(name, version, schema, fieldNamesPath, typescript, out, moldbImportPath, addAliases)
+}
+
+/**
+ * Generate a schema from the CIF core dictionary (downloaded first when not
+ * available locally); files it imports are resolved relative to `DIC_DIR`.
+ *
+ * @throws the parser result when a dictionary fails to parse
+ */
+async function runGenerateSchemaCifCore(name: string, fieldNamesPath: string, typescript = false, out: string, moldbImportPath: string, addAliases: boolean) {
+    await ensureCifCoreDicAvailable()
+    const cifCoreDic = await parseText(fs.readFileSync(CIF_CORE_DIC_PATH, 'utf8')).run();
+    if (cifCoreDic.isError) throw cifCoreDic
+
+    const cifCoreDicVersion = getDicVersion(cifCoreDic.result.blocks[0])
+    const version = `Dictionary versions: CifCore ${cifCoreDicVersion}.`
+
+    const frames: CifFrame[] = [...cifCoreDic.result.blocks[0].saveFrames]
+    const imports = await resolveImports(frames, DIC_DIR)
+    const schema = generateSchema(frames, imports)
+
+    await runGenerateSchema(name, version, schema, fieldNamesPath, typescript, out, moldbImportPath, addAliases)
+}
+
+/**
+ * Load every file referenced from an `import.get` entry of the given frames.
+ *
+ * @param frames save frames to scan for an `import` category
+ * @param baseDir directory the referenced file names are joined onto
+ * @returns map from referenced file name to the save frames of its first
+ *     block; each file is read and parsed only once
+ * @throws the parser result when a referenced file fails to parse
+ */
+async function resolveImports(frames: CifFrame[], baseDir: string): Promise<Map<string, CifFrame[]>> {
+    const imports = new Map<string, CifFrame[]>()
+
+    for (const d of frames) {
+        if ('import' in d.categories) {
+            const importGet = parseImportGet(d.categories['import'].getField('get')!.str(0))
+            for (const g of importGet) {
+                const { file } = g
+                if (!file) continue
+                if (imports.has(file)) continue
+
+                const dic = await parseText(fs.readFileSync(path.join(baseDir, file), 'utf8')).run();
+                if (dic.isError) throw dic
+
+                imports.set(file, [...dic.result.blocks[0].saveFrames])
+            }
+        }
+    }
+
+    return imports
+}
+
+/**
+ * Generate a schema from the dictionary file at `dicPath`; files it imports
+ * are resolved relative to the directory containing that file.
+ *
+ * @throws the parser result when a dictionary fails to parse
+ */
+async function runGenerateSchemaDic(name: string, dicPath: string, fieldNamesPath: string, typescript = false, out: string, moldbImportPath: string, addAliases: boolean) {
+    const dic = await parseText(fs.readFileSync(dicPath, 'utf8')).run();
+    if (dic.isError) throw dic
+
+    const dicVersion = getDicVersion(dic.result.blocks[0])
+    const dicName = getDicNamespace(dic.result.blocks[0])
+    const version = `Dictionary versions: ${dicName} ${dicVersion}.`
+
+    const frames: CifFrame[] = [...dic.result.blocks[0].saveFrames]
+    const imports = await resolveImports(frames, path.dirname(dicPath))
+    const schema = generateSchema(frames, imports)
+
+    await runGenerateSchema(name, version, schema, fieldNamesPath, typescript, out, moldbImportPath, addAliases)
+}
+
+/**
+ * Serialize `schema` and emit it.
+ *
+ * @param fieldNamesPath when non-empty, a filter is loaded from this file and handed to the typescript generator
+ * @param typescript true: run the typescript generator, false: dump the schema as JSON (indent 4)
+ * @param out output file; when empty the result is printed with `console.log`
+ */
+async function runGenerateSchema(name: string, version: string, schema: Database, fieldNamesPath: string, typescript = false, out: string, moldbImportPath: string, addAliases: boolean) {
+    const filter = fieldNamesPath ? await getFieldNamesFilter(fieldNamesPath) : undefined
+    const output = typescript ? generate(name, version, schema, filter, moldbImportPath, addAliases) : JSON.stringify(schema, undefined, 4)
+
+    if (out) {
+        fs.writeFileSync(out, output)
+    } else {
+        console.log(output)
+    }
+}
+
+/**
+ * Build a filter from a file listing one `category.field` name per row.
+ *
+ * The file is parsed as CSV without column names and only its first column
+ * (named '0') is used.
+ *
+ * @returns nested lookup `filter[category][field] === true` for every listed name
+ * @throws the CSV parser result when parsing fails
+ * @throws Error when the first column is missing
+ */
+async function getFieldNamesFilter(fieldNamesPath: string): Promise<Filter> {
+    const fieldNamesStr = fs.readFileSync(fieldNamesPath, 'utf8')
+    const parsed = await parseCsv(fieldNamesStr, { noColumnNames: true }).run();
+    // throw the failed result itself; `parser` is the unrelated command line parser
+    if (parsed.isError) throw parsed
+    const csvFile = parsed.result;
+
+    const fieldNamesCol = csvFile.table.getColumn('0')
+    if (!fieldNamesCol) throw new Error('error getting fields columns')
+    const fieldNames = fieldNamesCol.toStringArray()
+
+    const filter: Filter = {}
+    fieldNames.forEach(name => {
+        const [ category, field ] = name.split('.')
+        // console.log(category, field)
+        if (!filter[ category ]) filter[ category ] = {}
+        filter[ category ][ field ] = true
+    })
+    return filter
+}
+
+async function ensureMmcifDicAvailable() { await ensureDicAvailable(MMCIF_DIC_PATH, MMCIF_DIC_URL) }
+async function ensureIhmDicAvailable() { await ensureDicAvailable(IHM_DIC_PATH, IHM_DIC_URL) }
+async function ensureCarbBranchDicAvailable() { await ensureDicAvailable(CARB_BRANCH_DIC_PATH, CARB_BRANCH_DIC_URL) }
+async function ensureCarbCompDicAvailable() { await ensureDicAvailable(CARB_COMP_DIC_PATH, CARB_COMP_DIC_URL) }
+/** The CIF core dictionary is fetched together with its enum and attribute template files. */
+async function ensureCifCoreDicAvailable() {
+    await ensureDicAvailable(CIF_CORE_DIC_PATH, CIF_CORE_DIC_URL)
+    await ensureDicAvailable(CIF_CORE_ENUM_PATH, CIF_CORE_ENUM_URL)
+    await ensureDicAvailable(CIF_CORE_ATTR_PATH, CIF_CORE_ATTR_URL)
+}
+
+/**
+ * Download `dicUrl` to `dicPath` unless the file already exists; with
+ * `FORCE_DIC_DOWNLOAD` set the download always happens. `DIC_DIR` is created
+ * when missing.
+ *
+ * @throws Error when the server answers with a non-success status, so that an
+ *     error page is never stored (and later reused) as a dictionary
+ */
+async function ensureDicAvailable(dicPath: string, dicUrl: string) {
+    if (FORCE_DIC_DOWNLOAD || !fs.existsSync(dicPath)) {
+        const name = dicUrl.substring(dicUrl.lastIndexOf('/') + 1)
+        console.log(`downloading ${name}...`)
+        const data = await fetch(dicUrl)
+        if (!data.ok) {
+            throw new Error(`failed to download ${name}: ${data.status} ${data.statusText}`)
+        }
+        if (!fs.existsSync(DIC_DIR)) {
+            fs.mkdirSync(DIC_DIR);
+        }
+        fs.writeFileSync(dicPath, await data.text())
+        console.log(`done downloading ${name}`)
+    }
+}
+
+// Local cache location and download URL of every supported dictionary.
+const DIC_DIR = path.resolve(__dirname, '../dics/')
+const MMCIF_DIC_PATH = `${DIC_DIR}/mmcif_pdbx_v50.dic`
+const MMCIF_DIC_URL = 'http://mmcif.wwpdb.org/dictionaries/ascii/mmcif_pdbx_v50.dic'
+const IHM_DIC_PATH = `${DIC_DIR}/ihm-extension.dic`
+const IHM_DIC_URL = 'https://raw.githubusercontent.com/ihmwg/IHM-dictionary/master/ihm-extension.dic'
+const CARB_BRANCH_DIC_PATH = `${DIC_DIR}/entity_branch-extension.dic`
+const CARB_BRANCH_DIC_URL = 'https://raw.githubusercontent.com/pdbxmmcifwg/carbohydrate-extension/master/dict/entity_branch-extension.dic'
+const CARB_COMP_DIC_PATH = `${DIC_DIR}/chem_comp-extension.dic`
+const CARB_COMP_DIC_URL = 'https://raw.githubusercontent.com/pdbxmmcifwg/carbohydrate-extension/master/dict/chem_comp-extension.dic'
+
+const CIF_CORE_DIC_PATH = `${DIC_DIR}/cif_core.dic`
+const CIF_CORE_DIC_URL = 'https://raw.githubusercontent.com/COMCIFS/cif_core/master/cif_core.dic'
+const CIF_CORE_ENUM_PATH = `${DIC_DIR}/templ_enum.cif`
+const CIF_CORE_ENUM_URL = 'https://raw.githubusercontent.com/COMCIFS/cif_core/master/templ_enum.cif'
+const CIF_CORE_ATTR_PATH = `${DIC_DIR}/templ_attr.cif`
+const CIF_CORE_ATTR_URL = 'https://raw.githubusercontent.com/COMCIFS/cif_core/master/templ_attr.cif'
+
+const parser = new argparse.ArgumentParser({
+    addHelp: true,
+    description: 'Create schema from mmcif dictionary (v50 plus IHM and entity_branch extensions, downloaded from wwPDB)'
+});
+parser.addArgument([ '--preset', '-p' ], {
+    defaultValue: '',
+    choices: ['', 'mmCIF', 'CCD', 'BIRD', 'CifCore'],
+    help: 'Preset name'
+});
+parser.addArgument([ '--name', '-n' ], {
+    defaultValue: '',
+    help: 'Schema name'
+});
+parser.addArgument([ '--out', '-o' ], {
+    help: 'Generated schema output path, if not given printed to stdout'
+});
+parser.addArgument([ '--targetFormat', '-tf' ], {
+    defaultValue: 'typescript-molstar',
+    choices: ['typescript-molstar', 'json-internal'],
+    help: 'Target format'
+});
+parser.addArgument([ '--dicPath', '-d' ], {
+    defaultValue: '',
+    help: 'Path to dictionary'
+});
+parser.addArgument([ '--fieldNamesPath', '-fn' ], {
+    defaultValue: '',
+    help: 'Field names to include'
+});
+parser.addArgument([ '--forceDicDownload', '-f' ], {
+    action: 'storeTrue',
+    help: 'Force download of dictionaries'
+});
+parser.addArgument([ '--moldataImportPath', '-mip' ], {
+    defaultValue: 'molstar/lib/mol-data',
+    help: 'mol-data import path (for typescript target only)'
+});
+parser.addArgument([ '--addAliases', '-aa' ], {
+    action: 'storeTrue',
+    help: 'Add field name/path aliases'
+});
+/**
+ * Parsed command line arguments. `dic` has no command line option of its
+ * own; it is only assigned by the preset handling below.
+ */
+interface Args {
+    name: string
+    preset: '' | 'mmCIF' | 'CCD' | 'BIRD' | 'CifCore'
+    forceDicDownload: boolean
+    dic: '' | 'mmCIF' | 'CifCore'
+    dicPath: string,
+    fieldNamesPath: string
+    targetFormat: 'typescript-molstar' | 'json-internal'
+    out: string,
+    moldataImportPath: string
+    addAliases: boolean
+}
+const args: Args = parser.parseArgs();
+
+const FORCE_DIC_DOWNLOAD = args.forceDicDownload
+
+// A preset fixes the schema name, the dictionary and the field name list.
+switch (args.preset) {
+    case 'mmCIF':
+        args.name = 'mmCIF'
+        args.dic = 'mmCIF'
+        args.fieldNamesPath = path.resolve(__dirname, '../../../data/cif-field-names/mmcif-field-names.csv')
+        break
+    case 'CCD':
+        args.name = 'CCD'
+        args.dic = 'mmCIF'
+        args.fieldNamesPath = path.resolve(__dirname, '../../../data/cif-field-names/ccd-field-names.csv')
+        break
+    case 'BIRD':
+        args.name = 'BIRD'
+        args.dic = 'mmCIF'
+        args.fieldNamesPath = path.resolve(__dirname, '../../../data/cif-field-names/bird-field-names.csv')
+        break
+    case 'CifCore':
+        args.name = 'CifCore'
+        args.dic = 'CifCore'
+        args.fieldNamesPath = path.resolve(__dirname, '../../../data/cif-field-names/cif-core-field-names.csv')
+        break
+}
+
+/** Print a generation failure and make the process exit with a failure status. */
+const reportFailure = (e: unknown) => {
+    console.error(e)
+    process.exitCode = 1
+}
+
+if (args.name) {
+    const typescript = args.targetFormat === 'typescript-molstar'
+    // an explicit dictionary path takes precedence over the preset dictionary
+    if (args.dicPath) {
+        runGenerateSchemaDic(args.name, args.dicPath, args.fieldNamesPath, typescript, args.out, args.moldataImportPath, args.addAliases).catch(reportFailure)
+    } else if (args.dic === 'mmCIF') {
+        runGenerateSchemaMmcif(args.name, args.fieldNamesPath, typescript, args.out, args.moldataImportPath, args.addAliases).catch(reportFailure)
+    } else if (args.dic === 'CifCore') {
+        runGenerateSchemaCifCore(args.name, args.fieldNamesPath, typescript, args.out, args.moldataImportPath, args.addAliases).catch(reportFailure)
+    }
+}
diff --git a/src/apps/cifschema/util/cif-dic.ts b/src/apps/cifschema/util/cif-dic.ts
new file mode 100644
index 0000000000000000000000000000000000000000..2f43e75a00009ae4a2ab604c52c1fd401e6112db
--- /dev/null
+++ b/src/apps/cifschema/util/cif-dic.ts
@@ -0,0 +1,446 @@
+/**
+ * Copyright (c) 2017-2018 mol* contributors, licensed under MIT, See LICENSE file for more info.
+ *
+ * @author Alexander Rose <alexander.rose@weirdbyte.de>
+ */
+
+import { Database, Column, EnumCol, StrCol, IntCol, ListCol, FloatCol, CoordCol, MatrixCol, VectorCol } from './schema'
+import { parseImportGet } from './helper'
+import * as Data from '../../../mol-io/reader/cif/data-model'
+import { CifFrame } from '../../../mol-io/reader/cif/data-model';
+
+/**
+ * Map a dictionary type code to a schema column.
+ *
+ * @param type type code as found in the dictionary (mmCIF style lower-case codes or cif style capitalized ones)
+ * @param description column description stored in the returned column
+ * @param values enumeration values; when non-empty, the string-like and int-like mmCIF codes yield an enum column
+ * @param container container kind; only consulted for the cif style codes, where 'List' yields a list column
+ * @returns the column; an unknown code is logged and treated as a string column
+ */
+export function getFieldType(type: string, description: string, values?: string[], container?: string): Column {
+    switch (type) {
+        // mmCIF
+        case 'code':
+        case 'ucode':
+        case 'line':
+        case 'uline':
+        case 'text':
+        case 'char':
+        case 'uchar3':
+        case 'uchar1':
+        case 'boolean':
+            return values && values.length ? EnumCol(values, 'str', description) : StrCol(description)
+        case 'aliasname':
+        case 'name':
+        case 'idname':
+        case 'any':
+        case 'atcode':
+        case 'fax':
+        case 'phone':
+        case 'email':
+        case 'code30':
+        case 'seq-one-letter-code':
+        case 'author':
+        case 'orcid_id':
+        case 'sequence_dep':
+        case 'pdb_id':
+        case 'emd_id':
+        // todo, consider adding specialised fields
+        case 'yyyy-mm-dd':
+        case 'yyyy-mm-dd:hh:mm':
+        case 'yyyy-mm-dd:hh:mm-flex':
+        case 'int-range':
+        case 'float-range':
+        case 'binary':
+        case 'operation_expression':
+        case 'point_symmetry':
+        case '4x3_matrix':
+        case '3x4_matrices':
+        case 'point_group':
+        case 'point_group_helical':
+        case 'symmetry_operation':
+        case 'date_dep':
+        case 'url':
+        case 'symop':
+        case 'exp_data_doi':
+        case 'asym_id':
+            return StrCol(description)
+        case 'int':
+        case 'non_negative_int':
+        case 'positive_int':
+            return values && values.length ? EnumCol(values, 'int', description) : IntCol(description)
+        case 'float':
+            return FloatCol(description)
+        case 'ec-type':
+        case 'ucode-alphanum-csv':
+        case 'id_list':
+            return ListCol('str', ',', description)
+        case 'id_list_spc':
+            return ListCol('str', ' ', description)
+
+        // cif
+        case 'Text':
+        case 'Code':
+        case 'Complex':
+        case 'Symop':
+        case 'List':
+        case 'List(Real,Real)':
+        case 'List(Real,Real,Real,Real)':
+        case 'Date':
+        case 'Datetime':
+        case 'Tag':
+        case 'Implied':
+            return wrapContainer('str', ',', description, container)
+        case 'Real':
+            return wrapContainer('float', ',', description, container)
+        case 'Integer':
+            return wrapContainer('int', ',', description, container)
+
+    }
+    console.log(`unknown type '${type}'`)
+    return StrCol(description)
+}
+
+/** Create the plain (non-list) column for a primitive type name. */
+function ColFromType(type: 'int' | 'str' | 'float' | 'coord', description: string): Column {
+    switch (type) {
+        case 'int': return IntCol(description)
+        case 'str': return StrCol(description)
+        case 'float': return FloatCol(description)
+        case 'coord': return CoordCol(description)
+    }
+}
+
+/** Create a list column when `container` is 'List', otherwise the plain column for `type`. */
+function wrapContainer(type: 'int' | 'str' | 'float' | 'coord', separator: string, description: string, container?: string) {
+    return container && container === 'List' ? ListCol(type, separator, description) : ColFromType(type, description)
+}
+
+type FrameCategories = { [category: string]: Data.CifFrame }
+type FrameLinks = { [k: string]: string }
+
+/** Lookup tables shared by the field getters: save frames by header and child-to-parent item links. */
+interface FrameData {
+    categories: FrameCategories
+    links: FrameLinks
+}
+
+type Imports = Map<string, CifFrame[]>
+
+/**
+ * Resolve the save frames referenced by the `import.get` entry of `d`.
+ *
+ * Every reference needs both a `file` and a `save` name; the save frame is
+ * looked up by header, ignoring case, among the frames registered for that
+ * file in `imports`. References that cannot be resolved are reported with
+ * `console.warn` and skipped.
+ *
+ * @returns the resolved frames, empty when `d` has no `import` category
+ */
+function getImportFrames(d: Data.CifFrame, imports: Imports) {
+    const frames: Data.CifFrame[] = []
+    if (!('import' in d.categories)) return frames
+
+    const importGet = parseImportGet(d.categories['import'].getField('get')!.str(0))
+    for (const g of importGet) {
+        const { file, save } = g
+        if (!file || !save) {
+            console.warn(`missing 'save' or 'file' for import in '${d.header}'`)
+            continue
+        }
+        const importFrames = imports.get(file)
+        if (!importFrames) {
+            console.warn(`missing '${file}' entry in imports`)
+            continue
+        }
+        const importSave = importFrames.find(id => id.header.toLowerCase() === save.toLowerCase())
+        if (!importSave) {
+            console.warn(`missing '${save}' save frame in '${file}'`)
+            continue
+        }
+
+        frames.push(importSave)
+    }
+
+    return frames
+}
+
+/**
+ * get field from given or linked category
+ *
+ * Lookup order: the category of `d` itself; otherwise, when `d.header` has a
+ * link, the linked frame (recursively); otherwise the frames imported by `d`,
+ * in order, where the first import that provides the field wins.
+ *
+ * @returns the field, or undefined when none of the candidates provides it
+ */
+function getField(category: string, field: string, d: Data.CifFrame, imports: Imports, ctx: FrameData): Data.CifField|undefined {
+    const { categories, links } = ctx
+    const cat = d.categories[category]
+    if (cat) {
+        return cat.getField(field)
+    } else if (d.header in links) {
+        const linkName = links[d.header]
+        if (linkName in categories) {
+            return getField(category, field, categories[linkName], imports, ctx)
+        } else {
+            // console.log(`link '${linkName}' not found`)
+        }
+    } else {
+        const importFrames = getImportFrames(d, imports)
+        for (const idf of importFrames) {
+            // keep looking when an import does not provide the field instead
+            // of giving up after the first imported frame
+            const imported = getField(category, field, idf, imports, ctx)
+            if (imported) return imported
+        }
+    }
+    return undefined
+}
+
+/** Collect all `item_enumeration.value` entries; undefined when the field is not found. */
+function getEnums(d: Data.CifFrame, imports: Imports, ctx: FrameData) {
+    const value = getField('item_enumeration', 'value', d, imports, ctx)
+    const enums: string[] = []
+    if (value) {
+        for (let i = 0; i < value.rowCount; ++i) {
+            enums.push(value.str(i))
+            // console.log(value.str(i))
+        }
+        return enums
+    } else {
+        // console.log(`item_enumeration.value not found for '${d.header}'`)
+    }
+}
+
+/** First `type.container` value, undefined when the field is not found. */
+function getContainer(d: Data.CifFrame, imports: Imports, ctx: FrameData) {
+    const value = getField('type', 'container', d, imports, ctx)
+    return value ? value.str(0) : undefined
+}
+
+/**
+ * Type information of an item as `[code, enum values, container]`, with the
+ * code taken from `item_type.code` or, failing that, `type.contents`.
+ * When neither exists this is logged and undefined is returned.
+ */
+function getCode(d: Data.CifFrame, imports: Imports, ctx: FrameData): [string, string[] | undefined, string | undefined ] | undefined {
+    const code = getField('item_type', 'code', d, imports, ctx) || getField('type', 'contents', d, imports, ctx)
+    if (code) {
+        return [ code.str(0), getEnums(d, imports, ctx), getContainer(d, imports, ctx) ]
+    } else {
+        console.log(`item_type.code or type.contents not found for '${d.header}'`)
+    }
+}
+
+/** First `item_sub_category.id` value, undefined when the field is not found. */
+function getSubCategory(d: Data.CifFrame, imports: Imports, ctx: FrameData): string | undefined {
+    const value = getField('item_sub_category', 'id', d, imports, ctx)
+    if (value) {
+        return value.str(0)
+    }
+}
+
+/**
+ * Cleaned-up description of an item, taken from `item_description.description`
+ * or, failing that, `description.text`; undefined when neither exists.
+ */
+function getDescription(d: Data.CifFrame, imports: Imports, ctx: FrameData): string | undefined {
+    const value = getField('item_description', 'description', d, imports, ctx) || getField('description', 'text', d, imports, ctx)
+    if (value) {
+        // trim (after newlines) and remove references to square brackets
+        return value.str(0).trim()
+            .replace(/(\r\n|\r|\n)([ \t]+)/g, '\n')
+            .replace(/(\[[1-3]\])+ element/, 'elements')
+            .replace(/(\[[1-3]\])+/, '')
+    }
+}
+
+/**
+ * Alias names of an item, from `item_aliases.alias_name` or, failing that,
+ * `alias.definition_id`. The first character of every alias is dropped
+ * (presumably the leading '_' of the item name - confirm against the dictionaries).
+ */
+function getAliases(d: Data.CifFrame, imports: Imports, ctx: FrameData): string[] | undefined {
+    const value = getField('item_aliases', 'alias_name', d, imports, ctx) || getField('alias', 'definition_id', d, imports, ctx)
+    return value ? value.toStringArray().map(v => v.substring(1)) : undefined
+}
+
+// item name suffixes addressing a matrix ([i][j]) or vector ([i]) element, indices 1 to 3
+const reMatrixField = /\[[1-3]\]\[[1-3]\]/
+const reVectorField = /\[[1-3]\]/
+
+const FORCE_INT_FIELDS = [
+    '_atom_site.id',
+    '_atom_site.auth_seq_id',
+    '_atom_site_anisotrop.id',
+    '_pdbx_struct_mod_residue.auth_seq_id',
+    '_struct_conf.beg_auth_seq_id',
+    '_struct_conf.end_auth_seq_id',
+    '_struct_conn.ptnr1_auth_seq_id',
+    '_struct_conn.ptnr2_auth_seq_id',
+    '_struct_sheet_range.beg_auth_seq_id',
+    '_struct_sheet_range.end_auth_seq_id',
+];
+
+const COMMA_SEPARATED_LIST_FIELDS = [
+    '_atom_site.pdbx_struct_group_id',
+    '_chem_comp.mon_nstd_parent_comp_id',
+    '_diffrn_radiation.pdbx_wavelength_list',
+    '_diffrn_source.pdbx_wavelength_list',
+    '_em_diffraction.tilt_angle_list', // 20,40,50,55
+    '_em_entity_assembly.entity_id_list',
+    '_entity.pdbx_description', // Endolysin,Beta-2 adrenergic receptor
+    '_entity.pdbx_ec',
+    '_entity_poly.pdbx_strand_id', // A,B
+    '_entity_src_gen.pdbx_gene_src_gene', // ADRB2, ADRB2R, B2AR
+    '_pdbx_depui_entry_details.experimental_methods',
+    '_pdbx_depui_entry_details.requested_accession_types',
+    '_pdbx_soln_scatter_model.software_list', // INSIGHT II, HOMOLOGY, DISCOVERY, BIOPOLYMER, DELPHI
+    '_pdbx_soln_scatter_model.software_author_list', // MSI
+    '_pdbx_soln_scatter_model.entry_fitting_list', // Odd example: 'PDB CODE 1HFI, 1HCC, 1HFH, 1VCC'
+    '_pdbx_struct_assembly_gen.entity_inst_id',
+    '_pdbx_struct_assembly_gen.asym_id_list',
+    '_pdbx_struct_assembly_gen.auth_asym_id_list',
+    '_pdbx_struct_assembly_gen_depositor_info.asym_id_list',
+    '_pdbx_struct_assembly_gen_depositor_info.chain_id_list',
+    '_pdbx_struct_group_list.group_enumeration_type',
+    '_reflns.pdbx_diffrn_id',
+    '_refine.pdbx_diffrn_id',
+    '_reflns_shell.pdbx_diffrn_id',
+    '_struct_keywords.text',
+];
+
+const SPACE_SEPARATED_LIST_FIELDS = [
+    '_chem_comp.pdbx_subcomponent_list', // TSM DPH HIS CHF EMR
+    '_pdbx_soln_scatter.data_reduction_software_list', // OTOKO
'_pdbx_soln_scatter.data_analysis_software_list', // SCTPL5 GNOM +]; + +const SEMICOLON_SEPARATED_LIST_FIELDS = [ + '_chem_comp.pdbx_synonyms' // GLYCERIN; PROPANE-1,2,3-TRIOL +] + +/** + * Useful when a dictionary extension will add enum values to an existing dictionary. + * By adding them here, the dictionary extension can be tested before the added enum + * values are available in the existing dictionary. + */ +const EXTRA_ENUM_VALUES: { [k: string]: string[] } = { + +} + +export function generateSchema(frames: CifFrame[], imports: Imports = new Map()): Database { + + const tables: Database['tables'] = {} + const aliases: Database['aliases'] = {} + + const categories: FrameCategories = {} + const links: FrameLinks = {} + const ctx = { categories, links } + + // get category metadata + frames.forEach(d => { + // category definitions in mmCIF start with '_' and don't include a '.' + // category definitions in cif don't include a '.' + if (d.header[0] === '_' || d.header.includes('.')) return + const categoryName = d.header.toLowerCase() + // console.log(d.header, d.categoryNames, d.categories) + let descriptionField: Data.CifField | undefined + const categoryKeyNames = new Set<string>() + + if ('category' in d.categories && 'category_key' in d.categories) { + const category = d.categories['category'] + const categoryKey = d.categories['category_key'] + if (categoryKey) { + const categoryKey_names = categoryKey.getField('name') + if (categoryKey_names) { + for (let i = 0, il = categoryKey_names.rowCount; i < il; ++i) { + categoryKeyNames.add(categoryKey_names.str(i)) + } + } + } + + descriptionField = category.getField('description') + + if (categoryKeyNames.size === 0) { + console.log(`no key given for category '${categoryName}'`) + } + } + + if ('description' in d.categories) { + descriptionField = d.categories['description'].getField('text') + } + + let description = '' + if (descriptionField) { + description = descriptionField.str(0).trim() + 
.replace(/(\r\n|\r|\n)([ \t]+)/g, '\n') // remove padding after newlines + } else { + console.log(`no description given for category '${categoryName}'`) + } + + tables[categoryName] = { description, key: categoryKeyNames, columns: {} } + + // console.log('++++++++++++++++++++++++++++++++++++++++++') + // console.log('name', categoryName) + // console.log('desc', description) + // console.log('key', categoryKeyNames) + }) + + // build list of links between categories + frames.forEach(d => { + if (d.header[0] !== '_' && !d.header.includes('.')) return + categories[d.header] = d + const item_linked = d.categories['item_linked'] + if (item_linked) { + const child_name = item_linked.getField('child_name') + const parent_name = item_linked.getField('parent_name') + if (child_name && parent_name) { + for (let i = 0; i < item_linked.rowCount; ++i) { + const childName = child_name.str(i) + const parentName = parent_name.str(i) + if (childName in links && links[childName] !== parentName) { + console.log(`${childName} linked to ${links[childName]}, ignoring link to ${parentName}`) + } + links[childName] = parentName + } + } + } + }) + + // get field data + Object.keys(categories).forEach(fullName => { + const d = categories[fullName] + if (!d) { + console.log(`'${fullName}' not found, moving on`) + return + } + + const categoryName = d.header.substring(d.header[0] === '_' ? 
1 : 0, d.header.indexOf('.')) + const itemName = d.header.substring(d.header.indexOf('.') + 1) + let fields: { [k: string]: Column } + if (categoryName in tables) { + fields = tables[categoryName].columns + tables[categoryName].key.add(itemName) + } else if (categoryName.toLowerCase() in tables) { + // take case from category name in 'field' data as it is better if data is from cif dictionaries + tables[categoryName] = tables[categoryName.toLowerCase()] + fields = tables[categoryName].columns + } else { + console.log(`category '${categoryName}' has no metadata`) + fields = {} + tables[categoryName] = { + description: '', + key: new Set(), + columns: fields + } + } + + const itemAliases = getAliases(d, imports, ctx) + if (itemAliases) aliases[`${categoryName}.${itemName}`] = itemAliases + + const description = getDescription(d, imports, ctx) || '' + + // need to use regex to check for matrix or vector items + // as sub_category assignment is missing for some entries + const subCategory = getSubCategory(d, imports, ctx) + if (subCategory === 'cartesian_coordinate' || subCategory === 'fractional_coordinate') { + fields[itemName] = CoordCol(description) + } else if (FORCE_INT_FIELDS.includes(d.header)) { + fields[itemName] = IntCol(description) + console.log(`forcing int: ${d.header}`) + } else if (subCategory === 'matrix') { + fields[itemName.replace(reMatrixField, '')] = MatrixCol(3, 3, description) + } else if (subCategory === 'vector') { + fields[itemName.replace(reVectorField, '')] = VectorCol(3, description) + } else { + if (itemName.match(reMatrixField)) { + fields[itemName.replace(reMatrixField, '')] = MatrixCol(3, 3, description) + console.log(`${d.header} should have 'matrix' _item_sub_category.id`) + } else if (itemName.match(reVectorField)) { + fields[itemName.replace(reVectorField, '')] = VectorCol(3, description) + console.log(`${d.header} should have 'vector' _item_sub_category.id`) + } else { + const code = getCode(d, imports, ctx) + if (code) { + let 
fieldType = getFieldType(code[0], description, code[1], code[2]); + if (fieldType.type === 'str') { + if (COMMA_SEPARATED_LIST_FIELDS.includes(d.header)) { + fieldType = ListCol('str', ',', description) + console.log(`forcing comma separated: ${d.header}`) + } else if (SPACE_SEPARATED_LIST_FIELDS.includes(d.header)) { + fieldType = ListCol('str', ' ', description) + console.log(`forcing space separated: ${d.header}`) + } else if (SEMICOLON_SEPARATED_LIST_FIELDS.includes(d.header)) { + fieldType = ListCol('str', ';', description) + console.log(`forcing space separated: ${d.header}`) + } + } + if (d.header in EXTRA_ENUM_VALUES) { + if (fieldType.type === 'enum') { + fieldType.values.push(...EXTRA_ENUM_VALUES[d.header]) + } else { + console.warn(`expected enum: ${d.header}`) + } + } + fields[itemName] = fieldType + } else { + fields[itemName] = StrCol(description) + // console.log(`could not determine code for '${d.header}'`) + } + } + } + }) + + return { tables, aliases } +} diff --git a/src/apps/cifschema/util/generate.ts b/src/apps/cifschema/util/generate.ts new file mode 100644 index 0000000000000000000000000000000000000000..5024811cd9ab0bfc05f359388a745d886f8f02ee --- /dev/null +++ b/src/apps/cifschema/util/generate.ts @@ -0,0 +1,151 @@ +/** + * Copyright (c) 2017-2019 mol* contributors, licensed under MIT, See LICENSE file for more info. + * + * @author Alexander Rose <alexander.rose@weirdbyte.de> + */ + +import { Database, Filter, Column } from './schema' +import { indentString } from '../../../mol-util/string' +import { FieldPath } from '../../../mol-io/reader/cif/schema'; + +function header (name: string, info: string, moldataImportPath: string) { + return `/** + * Copyright (c) 2017-2020 mol* contributors, licensed under MIT, See LICENSE file for more info. + * + * Code-generated '${name}' schema file. 
/**
 * Text placed at the top of a generated schema file: license comment,
 * the import of `Database`/`Column` from `moldataImportPath` and the
 * `Schema` shorthand import.
 */
function header (name: string, info: string, moldataImportPath: string) {
    return `/**
 * Copyright (c) 2017-2020 mol* contributors, licensed under MIT, See LICENSE file for more info.
 *
 * Code-generated '${name}' schema file. ${info}
 *
 * @author molstar/ciftools package
 */

import { Database, Column } from '${moldataImportPath}/db'

import Schema = Column.Schema`
}

/** Text placed at the end of a generated schema file: the `_Schema` type and `_Database` interface exports. */
function footer (name: string) {
    return `
export type ${name}_Schema = typeof ${name}_Schema;
export interface ${name}_Database extends Database<${name}_Schema> {}`
}

/**
 * Emit one `const x = Schema.x;` line for every column type that occurs in
 * the (optionally filtered) schema, in order of first occurrence.
 */
function getTypeShorthands(schema: Database, fields?: Filter) {
    const types = new Set<string>()
    Object.keys(schema.tables).forEach(table => {
        if (fields && !fields[table]) return
        const { columns } = schema.tables[table]
        Object.keys(columns).forEach(columnName => {
            if (fields && !fields[table][columnName]) return
            types.add(schema.tables[table].columns[columnName].type)
        })
    })
    const shorthands: string[] = []
    types.forEach(type => {
        switch (type) {
            case 'str': shorthands.push('const str = Schema.str;'); break
            case 'int': shorthands.push('const int = Schema.int;'); break
            case 'float': shorthands.push('const float = Schema.float;'); break
            case 'coord': shorthands.push('const coord = Schema.coord;'); break
            case 'enum': shorthands.push('const Aliased = Schema.Aliased;'); break
            case 'matrix': shorthands.push('const Matrix = Schema.Matrix;'); break
            case 'vector': shorthands.push('const Vector = Schema.Vector;'); break
            case 'list': shorthands.push('const List = Schema.List;'); break
        }
    })
    return shorthands.join('\n')
}

/**
 * Escape text for use between single quotes in the generated source.
 * Backslashes are escaped first so the backslashes added for quotes are
 * not escaped a second time.
 */
function escapeSingleQuoted(text: string) {
    return text.replace(/\\/g, '\\\\').replace(/'/g, '\\\'')
}

/** Source text of the schema expression for a single column. */
function getTypeDef(c: Column): string {
    switch (c.type) {
        case 'str': return 'str'
        case 'int': return 'int'
        case 'float': return 'float'
        case 'coord': return 'coord'
        case 'enum':
            return `Aliased<'${c.values.map(escapeSingleQuoted).join(`' | '`)}'>(${c.subType})`
        case 'matrix':
            return `Matrix(${c.rows}, ${c.columns})`
        case 'vector':
            return `Vector(${c.length})`
        case 'list': {
            const separator = escapeSingleQuoted(c.separator)
            if (c.subType === 'int') {
                return `List('${separator}', x => parseInt(x, 10))`
            } else if (c.subType === 'float' || c.subType === 'coord') {
                return `List('${separator}', x => parseFloat(x))`
            } else {
                return `List('${separator}', x => x)`
            }
        }
    }
}

const reSafePropertyName = /^[a-zA-Z_$][0-9a-zA-Z_$]*$/
/** Return `name` unchanged when it can be written as a bare property key, otherwise as a single-quoted string literal. */
function safePropertyString(name: string) {
    return name.match(reSafePropertyName) ? name : `'${escapeSingleQuoted(name)}'`
}

/** Wrap `description` in a doc comment block indented by `spacesCount` spaces; trailing spaces before newlines are removed. */
function doc(description: string, spacesCount: number) {
    const spaces = ' '.repeat(spacesCount)
    return [
        `${spaces}/**`,
        `${indentString(description, 1, `${spaces} * `)}`.replace(/ +\n/g, '\n'),
        `${spaces} */`
    ].join('\n')
}

/**
 * Generate the source text of a schema file.
 *
 * @param name prefix of the exported `_Schema`, `_Aliases` and `_Database` identifiers
 * @param info free text appended to the "Code-generated" line of the file header
 * @param schema tables, columns and aliases to emit
 * @param fields optional filter; when given, only the listed tables/columns are
 *     emitted and entries not present in `schema` are reported on the console
 * @param moldataImportPath import path prefix used in the generated `import` line
 * @param addAliases whether to also emit the `_Aliases` object
 * @returns header, type shorthands, schema (and aliases) object literals and footer
 */
export function generate (name: string, info: string, schema: Database, fields: Filter | undefined, moldataImportPath: string, addAliases: boolean) {
    const codeLines: string[] = []

    // report filter entries that do not exist in the schema
    if (fields) {
        Object.keys(fields).forEach(table => {
            if (table in schema.tables) {
                const schemaTable = schema.tables[table]
                Object.keys(fields[table]).forEach(column => {
                    if (!(column in schemaTable.columns)) {
                        console.log(`filter field '${table}.${column}' not found in schema`)
                    }
                })
            } else {
                console.log(`filter category '${table}' not found in schema`)
            }
        })
    }

    codeLines.push(`export const ${name}_Schema = {`)
    Object.keys(schema.tables).forEach(table => {
        if (fields && !fields[table]) return
        const { description, columns } = schema.tables[table]
        if (description) codeLines.push(doc(description, 4))
        codeLines.push(`    ${safePropertyString(table)}: {`)
        Object.keys(columns).forEach(columnName => {
            if (fields && !fields[table][columnName]) return
            const c = columns[columnName]
            const typeDef = getTypeDef(c)
            if (c.description) codeLines.push(doc(c.description, 8))
            codeLines.push(`        ${safePropertyString(columnName)}: ${typeDef},`)
        })
        codeLines.push('    },')
    })
    codeLines.push('}')

    if (addAliases) {
        codeLines.push('')
        codeLines.push(`export const ${name}_Aliases = {`)
        Object.keys(schema.aliases).forEach(path => {
            const [ table, columnName ] = path.split('.')
            if (fields && !fields[table]) return
            if (fields && !fields[table][columnName]) return

            // drop aliases equal to the path itself, de-duplicate the rest in canonical form
            const filteredAliases = new Set<string>()
            schema.aliases[path].forEach(p => {
                if (!FieldPath.equal(p, path)) filteredAliases.add(FieldPath.canonical(p))
            })

            if (filteredAliases.size === 0) return
            codeLines.push(`    ${safePropertyString(path)}: [`)
            filteredAliases.forEach(alias => {
                codeLines.push(`        '${escapeSingleQuoted(alias)}',`)
            })
            codeLines.push('    ],')
        })
        codeLines.push('}')
    }

    return `${header(name, info, moldataImportPath)}\n\n${getTypeShorthands(schema, fields)}\n\n${codeLines.join('\n')}\n${footer(name)}`
}
// ---- src/apps/cifschema/util/helper.ts (new file in this diff) ----

/**
 * Copyright (c) 2019 mol* contributors, licensed under MIT, See LICENSE file for more info.
 *
 * @author Alexander Rose <alexander.rose@weirdbyte.de>
 */

/** One entry of an import list: the save frame name and the file it comes from, when given. */
export type Import = { save?: string, file?: string }

/**
 * Parse the text of an import list into its entries.
 *
 * Accepted forms (quotes around keys and values may be single or double):
 *
 *     [{'save':hi_ang_Fox_coeffs 'file':templ_attr.cif} {'save':hi_ang_Fox_c0 'file':templ_enum.cif}]
 *     [{"file":'templ_enum.cif' "save":'H_M_ref'}]
 *
 * Whitespace around the list is ignored.
 *
 * @param s list text, expected to start with `[{` and end with `}]`
 * @returns one `Import` per `{...}` group; `save`/`file` are `undefined` when
 *     the key is missing from the group
 */
export function parseImportGet(s: string): Import[] {
    const trimmed = s.trim()
    // strip the enclosing '[{' and '}]'; the end index must be taken from the
    // trimmed text, otherwise surrounding whitespace leaves '}]' in the last value
    const inner = trimmed.substring(2, trimmed.length - 2)
    return inner.split(/}[ \n\t]*{/g).map(item => {
        const save = item.match(/('save'|"save"):([^ \t\n]+)/)
        const file = item.match(/('file'|"file"):([^ \t\n]+)/)
        return {
            // group 2 is the value after the colon; drop any quotes around it
            save: save ? save[2].replace(/['"]/g, '') : undefined,
            file: file ? file[2].replace(/['"]/g, '') : undefined
        }
    })
}

// ---- src/apps/cifschema/util/schema.ts (new file in this diff) ----

/**
 * Copyright (c) 2017-2019 mol* contributors, licensed under MIT, See LICENSE file for more info.
 */
/**
 * @author Alexander Rose <alexander.rose@weirdbyte.de>
 */

/** A schema: its tables by name, and lists of alias paths by path. */
export interface Database {
    tables: { [ tableName: string ]: Table }
    aliases: { [ path: string ]: string[] }
}
/** A table: its description, the set of key names and its columns by name. */
export interface Table {
    description: string
    key: Set<string>
    columns: { [ columnName: string ]: Column }
}
/** Any column kind; discriminated by the `type` property. */
export type Column = IntCol | StrCol | FloatCol | CoordCol | EnumCol | VectorCol | MatrixCol | ListCol

/** Properties shared by all column kinds. */
type BaseCol = { description: string }

export type IntCol = { type: 'int' } & BaseCol
export function IntCol(description: string): IntCol { return { type: 'int', description } }

export type StrCol = { type: 'str' } & BaseCol
export function StrCol(description: string): StrCol { return { type: 'str', description } }

export type FloatCol = { type: 'float' } & BaseCol
export function FloatCol(description: string): FloatCol { return { type: 'float', description } }

export type CoordCol = { type: 'coord' } & BaseCol
export function CoordCol(description: string): CoordCol { return { type: 'coord', description } }

/** Column restricted to the listed `values`, which are of kind `subType`. */
export type EnumCol = { type: 'enum', subType: 'int' | 'str', values: string[] } & BaseCol
export function EnumCol(values: string[], subType: 'int' | 'str', description: string): EnumCol {
    return { type: 'enum', description, values, subType }
}

export type VectorCol = { type: 'vector', length: number } & BaseCol
export function VectorCol(length: number, description: string): VectorCol {
    return { type: 'vector', description, length }
}

export type MatrixCol = { type: 'matrix', rows: number, columns: number } & BaseCol
/** Note the parameter order: `columns` comes before `rows`. */
export function MatrixCol(columns: number, rows: number, description: string): MatrixCol {
    return { type: 'matrix', description, columns, rows }
}

/** Column holding a list of `subType` values joined by `separator`. */
export type ListCol = { type: 'list', subType: 'int' | 'str' | 'float' | 'coord', separator: string } & BaseCol
export function ListCol(subType: 'int' | 'str' | 'float' | 'coord', separator: string, description: string): ListCol {
    return { type: 'list', description, separator, subType }
}

/** Selection of columns: `filter[table][column] === true` for every selected column. */
export type Filter = { [ table: string ]: { [ column: string ]: true } }

/**
 * Intersect filters: the result selects exactly the `table.column` entries
 * that are present in every given filter.
 *
 * @param filters filters to intersect
 * @returns a new filter; empty when no entry is shared by all filters or when
 *     no filters are given
 */
export function mergeFilters (...filters: Filter[]) {
    const n = filters.length
    const mergedFilter: Filter = {}
    // number of filters in which each 'category.field' occurs
    const fields: Map<string, number> = new Map()
    filters.forEach(filter => {
        Object.keys(filter).forEach(category => {
            Object.keys(filter[ category ]).forEach(field => {
                const key = `${category}.${field}`
                const value = fields.get(key) || 0
                fields.set(key, value + 1)
            })
        })
    })
    fields.forEach((v, k) => {
        // keep only entries seen in all filters
        if (v !== n) return
        const [categoryName, fieldName] = k.split('.')
        if (categoryName in mergedFilter) {
            mergedFilter[categoryName][fieldName] = true
        } else {
            // computed key: the property must be named after the field, not literally 'fieldName'
            mergedFilter[categoryName] = { [fieldName]: true }
        }
    })
    return mergedFilter
}