diff --git a/CHANGELOG.md b/CHANGELOG.md index c99340dfe21efba671b2309e6a25e9da529326f3..42a2f159d670f750df3737abfd58d71b91bb8ebf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ Note that since we don't clearly distinguish between a public and private interf - Fix aromatic rings assignment: do not mix flags and planarity test - Improve bonds assignment of coarse grained models: check for IndexPairBonds and exhaustive StructConn - Fix unit mapping in bondedAtomicPairs MolScript query +- Improve pdb parsing: handle non-unique atom and chain names (fixes #156) ## [v3.5.0] - 2022-03-25 diff --git a/src/mol-model-formats/structure/pdb/atom-site.ts b/src/mol-model-formats/structure/pdb/atom-site.ts index 7fcc6389101767f11bcf71164ecfaff54a2ba2b1..7de7aeed37eb9c36cd14b717f05b7b6b02f6b3f9 100644 --- a/src/mol-model-formats/structure/pdb/atom-site.ts +++ b/src/mol-model-formats/structure/pdb/atom-site.ts @@ -1,5 +1,5 @@ /** - * Copyright (c) 2019-2021 mol* contributors, licensed under MIT, See LICENSE file for more info. + * Copyright (c) 2019-2022 mol* contributors, licensed under MIT, See LICENSE file for more info. 
* * @author David Sehnal <david.sehnal@gmail.com> * @author Alexander Rose <alexander.rose@weirdbyte.de> @@ -39,27 +39,99 @@ export function getAtomSiteTemplate(data: string, count: number) { }; } -export function getAtomSite(sites: AtomSiteTemplate): { [K in keyof mmCIF_Schema['atom_site'] | 'partial_charge']?: CifField } { +export function getAtomSite(sites: AtomSiteTemplate, hasTer: boolean): { [K in keyof mmCIF_Schema['atom_site'] | 'partial_charge']?: CifField } { + const pdbx_PDB_model_num = CifField.ofStrings(sites.pdbx_PDB_model_num); const auth_asym_id = CifField.ofTokens(sites.auth_asym_id); + const auth_seq_id = CifField.ofTokens(sites.auth_seq_id); const auth_atom_id = CifField.ofTokens(sites.auth_atom_id); const auth_comp_id = CifField.ofTokens(sites.auth_comp_id); + const id = CifField.ofStrings(sites.id); + + // + + let currModelNum = pdbx_PDB_model_num.str(0); + let currAsymId = auth_asym_id.str(0); + let currSeqId = auth_seq_id.int(0); + let currLabelAsymId = currAsymId; + + const asymIdCounts = new Map<string, number>(); + const atomIdCounts = new Map<string, number>(); + + const labelAsymIds: string[] = []; + const labelAtomIds: string[] = []; + + // ensure unique asym ids per model and unique atom ids per seq id + for (let i = 0, il = id.rowCount; i < il; ++i) { + const modelNum = pdbx_PDB_model_num.str(i); + const asymId = auth_asym_id.str(i); + const seqId = auth_seq_id.int(i); + let atomId = auth_atom_id.str(i); + + let asymIdChanged = false; + + if (modelNum !== currModelNum) { + asymIdCounts.clear(); + atomIdCounts.clear(); + currModelNum = modelNum; + currAsymId = asymId; + currSeqId = seqId; + asymIdChanged = true; + currLabelAsymId = asymId; + } else if (currAsymId !== asymId) { + atomIdCounts.clear(); + currAsymId = asymId; + currSeqId = seqId; + asymIdChanged = true; + currLabelAsymId = asymId; + } else if (currSeqId !== seqId) { + atomIdCounts.clear(); + currSeqId = seqId; + } + + if (asymIdCounts.has(asymId)) { + // only change the 
chain's name if there are TER records + // otherwise assume repeated chain name use is from interleaved chains + if (hasTer && asymIdChanged) { + const asymIdCount = asymIdCounts.get(asymId)! + 1; + asymIdCounts.set(asymId, asymIdCount); + currLabelAsymId = `${asymId}_${asymIdCount}`; + } + } else { + asymIdCounts.set(asymId, 0); + } + labelAsymIds[i] = currLabelAsymId; + + if (atomIdCounts.has(atomId)) { + const atomIdCount = atomIdCounts.get(atomId)! + 1; + atomIdCounts.set(atomId, atomIdCount); + atomId = `${atomId}_${atomIdCount}`; + } else { + atomIdCounts.set(atomId, 0); + } + labelAtomIds[i] = atomId; + } + + const labelAsymId = Column.ofStringArray(labelAsymIds); + const labelAtomId = Column.ofStringArray(labelAtomIds); + + // return { auth_asym_id, auth_atom_id, auth_comp_id, - auth_seq_id: CifField.ofTokens(sites.auth_seq_id), + auth_seq_id, B_iso_or_equiv: CifField.ofTokens(sites.B_iso_or_equiv), Cartn_x: CifField.ofTokens(sites.Cartn_x), Cartn_y: CifField.ofTokens(sites.Cartn_y), Cartn_z: CifField.ofTokens(sites.Cartn_z), group_PDB: CifField.ofTokens(sites.group_PDB), - id: CifField.ofStrings(sites.id), + id, label_alt_id: CifField.ofTokens(sites.label_alt_id), - label_asym_id: auth_asym_id, - label_atom_id: auth_atom_id, + label_asym_id: CifField.ofColumn(labelAsymId), + label_atom_id: CifField.ofColumn(labelAtomId), label_comp_id: auth_comp_id, label_seq_id: CifField.ofUndefined(sites.index, Column.Schema.int), label_entity_id: CifField.ofStrings(sites.label_entity_id), @@ -68,7 +140,7 @@ export function getAtomSite(sites: AtomSiteTemplate): { [K in keyof mmCIF_Schema type_symbol: CifField.ofTokens(sites.type_symbol), pdbx_PDB_ins_code: CifField.ofTokens(sites.pdbx_PDB_ins_code), - pdbx_PDB_model_num: CifField.ofStrings(sites.pdbx_PDB_model_num), + pdbx_PDB_model_num, partial_charge: CifField.ofTokens(sites.partial_charge) }; diff --git a/src/mol-model-formats/structure/pdb/to-cif.ts b/src/mol-model-formats/structure/pdb/to-cif.ts index 
ff90a4da280c5bf95efa0a8c2e38af33d335da44..21a2a68f0593c6aeecf3d549600b0836290d167d 100644 --- a/src/mol-model-formats/structure/pdb/to-cif.ts +++ b/src/mol-model-formats/structure/pdb/to-cif.ts @@ -51,6 +51,7 @@ export async function pdbToMmCif(pdb: PdbFile): Promise<CifFrame> { let modelNum = 0, modelStr = ''; let conectRange: [number, number] | undefined = undefined; + let hasTer = false; for (let i = 0, _i = lines.count; i < _i; i++) { let s = indices[2 * i], e = indices[2 * i + 1]; @@ -161,6 +162,10 @@ export async function pdbToMmCif(pdb: PdbFile): Promise<CifFrame> { } // TODO: SCALE record => cif.atom_sites.fract_transf_matrix, cif.atom_sites.fract_transf_vector break; + case 'T': + if (substringStartsWith(data, s, e, 'TER')) { + hasTer = true; + } } } @@ -178,7 +183,7 @@ export async function pdbToMmCif(pdb: PdbFile): Promise<CifFrame> { atomSite.label_entity_id[i] = entityBuilder.getEntityId(compId, moleculeType, asymIds.value(i)); } - const atom_site = getAtomSite(atomSite); + const atom_site = getAtomSite(atomSite, hasTer); if (!isPdbqt) delete atom_site.partial_charge; if (conectRange) {