import { ParserOptionsSpec } from "./ParserOptionsSpec";
import { computeSecondaryStructure } from "./utils/computeSecondaryStructure";
import { processSymmetries } from "./utils/processSymmetries";
import { conversionMatrix3, Matrix3, Matrix4, Vector3 } from "../WebGL";
import { assignPDBBonds } from "./utils/assignPDBBonds";
import { AtomSpec } from "specs";
/**
* Puts atoms specified in mmCIF fromat in str into atoms
*
* @param {string} str
* @param {ParserOptionsSpec} options
* @category Parsers
*/
//coordinate conversion
const fractionalToCartesian = function (
cmat: Matrix3,
x: number,
y: number,
z: number
) {
return new Vector3(x, y, z).applyMatrix3(cmat);
};
export function CIF(str: string, options: ParserOptionsSpec = {}) {
const atoms: Array<AtomSpec[]> & { modelData?: unknown } = [];
const noAssembly = !options.doAssembly; // don't assemble by default
const modelData = (atoms.modelData = []);
const assignbonds =
options.assignBonds === undefined ? true : options.assignBonds;
// Used to handle quotes correctly
function splitRespectingQuotes(string: string, separator: string) {
const sections: string[] = [];
let sectionStart = 0;
let sectionEnd = 0;
while (sectionEnd < string.length) {
while (
string.substring(sectionEnd, sectionEnd + separator.length) !==
separator &&
sectionEnd < string.length
) {
// currently does not support escaping quotes
if (string[sectionEnd] === "'") {
sectionEnd++;
while (sectionEnd < string.length && string[sectionEnd] !== "'") {
sectionEnd++;
}
//biopython apparently generates invalid string literals so if we think we are done but aren't at a separator keep going
while (
string.substring(sectionEnd, sectionEnd + separator.length) !==
separator &&
sectionEnd < string.length
) {
sectionEnd++;
}
} else if (string[sectionEnd] === '"') {
sectionEnd++;
while (sectionEnd < string.length && string[sectionEnd] !== '"') {
sectionEnd++;
}
sectionEnd++;
} else {
sectionEnd++;
}
}
sections.push(string.substring(sectionStart, sectionEnd));
sectionStart = sectionEnd = sectionEnd + separator.length;
}
return sections;
}
const lines = str.split(/\r?\n|\r/);
// Filter text to remove comments, trailing spaces, and empty lines
const linesFiltered: string[] = [];
let trimDisabled = false;
for (let lineNum = 0; lineNum < lines.length; lineNum++) {
// first remove comments
// incorrect if #'s are allowed in strings
// comments might only be allowed at beginning of line, not sure
var line = lines[lineNum].split("#")[0];
// inside data blocks, the string must be left verbatim
// datablocks are started with a ';' at the beginning of a line
// and ended with a ';' on its own line.
if (trimDisabled) {
if (line[0] === ";") {
trimDisabled = false;
}
} else {
if (line[0] === ";") {
trimDisabled = true;
}
}
if (trimDisabled || line !== "") {
if (!trimDisabled) {
line = line.trim();
if (line[0] === "_") {
// Replace dot separating category from data item with underscore. Dots aren't guarenteed, to makes
// files consistent.
const dot = line.split(/\s/)[0].indexOf(".");
if (dot > -1) {
let lineArr = line.split("");
lineArr[dot] = "_";
line = lineArr.join("");
line = line.substring(0, dot) + "_" + line.substring(dot + 1);
}
}
}
linesFiltered.push(line);
}
}
let lineNum = 0;
while (lineNum < linesFiltered.length) {
while (
!linesFiltered[lineNum].startsWith("data_") ||
linesFiltered[lineNum] === "data_global"
) {
lineNum++;
}
lineNum++;
// Process the lines and puts all of the data into an object.
const mmCIF: Record<string, any> = {};
while (
lineNum < linesFiltered.length &&
!linesFiltered[lineNum].startsWith("data_")
) {
if (linesFiltered[lineNum][0] === undefined) {
lineNum++;
} else if (linesFiltered[lineNum][0] === "_") {
const dataItemName = linesFiltered[lineNum]
.split(/\s/)[0]
.toLowerCase();
const dataItem = (mmCIF[dataItemName] = mmCIF[dataItemName] || []);
// if nothing left on the line go to the next one
const restOfLine = linesFiltered[lineNum].substring(
linesFiltered[lineNum].indexOf(dataItemName) + dataItemName.length
);
if (restOfLine === "") {
lineNum++;
if (linesFiltered[lineNum][0] === ";") {
let dataBlock = linesFiltered[lineNum].substring(1);
lineNum++;
while (linesFiltered[lineNum] !== ";") {
dataBlock = dataBlock + "\n" + linesFiltered[lineNum];
lineNum++;
}
dataItem.push(dataBlock);
} else {
dataItem.push(linesFiltered[lineNum]);
}
} else {
dataItem.push(restOfLine.trim());
}
lineNum++;
} else if (linesFiltered[lineNum].substring(0, 5) === "loop_") {
lineNum++;
const dataItems = [];
while (
linesFiltered[lineNum] === "" ||
linesFiltered[lineNum][0] === "_"
) {
if (linesFiltered[lineNum] !== "") {
let dataItemName = linesFiltered[lineNum]
.split(/\s/)[0]
.toLowerCase();
let dataItem = (mmCIF[dataItemName] = mmCIF[dataItemName] || []);
dataItems.push(dataItem);
}
lineNum++;
}
let currentDataItem = 0;
while (
lineNum < linesFiltered.length &&
linesFiltered[lineNum][0] !== "_" &&
!linesFiltered[lineNum].startsWith("loop_") &&
!linesFiltered[lineNum].startsWith("data_")
) {
let line = splitRespectingQuotes(linesFiltered[lineNum], " ");
for (let field = 0; field < line.length; field++) {
if (line[field] !== "") {
dataItems[currentDataItem].push(line[field]);
currentDataItem = (currentDataItem + 1) % dataItems.length;
}
}
lineNum++;
}
} else {
lineNum++;
}
}
modelData.push({ symmetries: [] });
// Pulls atom information out of the data
atoms.push([]);
const atomCount =
mmCIF._atom_site_id !== undefined
? mmCIF._atom_site_id.length
: mmCIF._atom_site_label.length;
let conversionMatrix: Matrix3;
if (mmCIF._cell_length_a !== undefined) {
const a = parseFloat(mmCIF._cell_length_a);
const b = parseFloat(mmCIF._cell_length_b);
const c = parseFloat(mmCIF._cell_length_c);
const alpha_deg = parseFloat(mmCIF._cell_angle_alpha) || 90;
const beta_deg = parseFloat(mmCIF._cell_angle_beta) || 90;
const gamma_deg = parseFloat(mmCIF._cell_angle_gamma) || 90;
conversionMatrix = conversionMatrix3(
a,
b,
c,
alpha_deg,
beta_deg,
gamma_deg
);
modelData[modelData.length - 1].cryst = {
a: a,
b: b,
c: c,
alpha: alpha_deg,
beta: beta_deg,
gamma: gamma_deg,
};
}
for (let i = 0; i < atomCount; i++) {
if (
mmCIF._atom_site_group_pdb !== undefined &&
mmCIF._atom_site_group_pdb[i] === "TER"
)
continue;
const atom: AtomSpec = {};
if (mmCIF._atom_site_cartn_x !== undefined) {
atom.x = parseFloat(mmCIF._atom_site_cartn_x[i]);
atom.y = parseFloat(mmCIF._atom_site_cartn_y[i]);
atom.z = parseFloat(mmCIF._atom_site_cartn_z[i]);
} else {
const coords = fractionalToCartesian(
conversionMatrix,
parseFloat(mmCIF._atom_site_fract_x[i]),
parseFloat(mmCIF._atom_site_fract_y[i]),
parseFloat(mmCIF._atom_site_fract_z[i])
);
atom.x = coords.x;
atom.y = coords.y;
atom.z = coords.z;
}
atom.chain = mmCIF._atom_site_auth_asym_id
? mmCIF._atom_site_auth_asym_id[i]
: mmCIF._atom_site_label_asym_id
? mmCIF._atom_site_label_asym_id[i]
: undefined;
atom.resi = mmCIF._atom_site_auth_seq_id
? parseInt(mmCIF._atom_site_auth_seq_id[i])
: mmCIF._atom_site_label_seq_id
? mmCIF._atom_site_label_seq_id[i]
: undefined;
atom.resn = mmCIF._atom_site_auth_comp_id
? mmCIF._atom_site_auth_comp_id[i].trim()
: mmCIF._atom_site_label_comp_id
? mmCIF._atom_site_label_comp_id[i].trim()
: undefined;
atom.atom = mmCIF._atom_site_auth_atom_id
? mmCIF._atom_site_auth_atom_id[i].replace(/"/gm, "")
: mmCIF._atom_site_label_atom_id
? mmCIF._atom_site_label_atom_id[i].replace(/"/gm, "")
: undefined; //"primed" names are in quotes
atom.hetflag =
!mmCIF._atom_site_group_pdb ||
mmCIF._atom_site_group_pdb[i] === "HETA" ||
mmCIF._atom_site_group_pdb[i] === "HETATM";
if(mmCIF._atom_site_b_iso_or_equiv ) {
atom.b = parseFloat(mmCIF._atom_site_b_iso_or_equiv[i]);
}
let elem = "X";
if (mmCIF._atom_site_type_symbol) {
elem = mmCIF._atom_site_type_symbol[i].replace(/\(?\+?\d+.*/, "");
} else if (mmCIF._atom_site_label) {
//first two components are concatenated, then separated by underscore
//best I can do is assume second component, if present, starts with a number
elem = mmCIF._atom_site_label[i].split("_")[0].replace(/\(?\d+.*/, "");
}
atom.elem = elem[0].toUpperCase() + elem.substring(1, 2).toLowerCase();
atom.bonds = [];
atom.ss = "c";
atom.serial = i;
atom.bondOrder = [];
atom.properties = {};
atoms[atoms.length - 1].push(atom);
}
if (mmCIF._pdbx_struct_oper_list_id !== undefined && !noAssembly) {
for (let i = 0; i < mmCIF._pdbx_struct_oper_list_id.length; i++) {
const matrix11 = parseFloat(
mmCIF["_pdbx_struct_oper_list_matrix[1][1]"][i]
);
const matrix12 = parseFloat(
mmCIF["_pdbx_struct_oper_list_matrix[1][2]"][i]
);
const matrix13 = parseFloat(
mmCIF["_pdbx_struct_oper_list_matrix[1][3]"][i]
);
const vector1 = parseFloat(
mmCIF["_pdbx_struct_oper_list_vector[1]"][i]
);
const matrix21 = parseFloat(
mmCIF["_pdbx_struct_oper_list_matrix[2][1]"][i]
);
const matrix22 = parseFloat(
mmCIF["_pdbx_struct_oper_list_matrix[2][2]"][i]
);
const matrix23 = parseFloat(
mmCIF["_pdbx_struct_oper_list_matrix[2][3]"][i]
);
const vector2 = parseFloat(
mmCIF["_pdbx_struct_oper_list_vector[2]"][i]
);
const matrix31 = parseFloat(
mmCIF["_pdbx_struct_oper_list_matrix[3][1]"][i]
);
const matrix32 = parseFloat(
mmCIF["_pdbx_struct_oper_list_matrix[3][2]"][i]
);
const matrix33 = parseFloat(
mmCIF["_pdbx_struct_oper_list_matrix[3][3]"][i]
);
const vector3 = parseFloat(
mmCIF["_pdbx_struct_oper_list_vector[3]"][i]
);
const matrix = new Matrix4(
matrix11,
matrix12,
matrix13,
vector1,
matrix21,
matrix22,
matrix23,
vector2,
matrix31,
matrix32,
matrix33,
vector3
);
modelData[modelData.length - 1].symmetries.push(matrix);
}
}
const parseTerm = function (term: string) {
const negative = term.match("-");
term = term.replace(/[-xyz]/g, "");
const fractionParts = term.split("/");
let numerator: number, denominator: number;
if (fractionParts[1] === undefined) {
denominator = 1;
} else {
denominator = parseInt(fractionParts[1]);
}
if (fractionParts[0] === "") {
numerator = 1;
} else {
numerator = parseInt(fractionParts[0]);
}
return (numerator / denominator) * (negative ? -1 : 1);
};
if (mmCIF._symmetry_equiv_pos_as_xyz !== undefined && !noAssembly) {
for (let sym = 0; sym < mmCIF._symmetry_equiv_pos_as_xyz.length; sym++) {
const transform = mmCIF._symmetry_equiv_pos_as_xyz[sym].replace(
/["' ]/g,
""
);
const componentStrings = transform
.split(",")
.map(function (val: string) {
return val.replace(/-/g, "+-");
});
let matrix = new Matrix4(
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
0,
1
);
for (let coord = 0; coord < 3; coord++) {
const terms = componentStrings[coord].split("+");
for (let t = 0; t < terms.length; t++) {
const term = terms[t];
if (term === "") continue;
const coefficient = parseTerm(term);
if (term.match("x")) {
matrix.elements[coord + 0] = coefficient;
} else if (term.match("y")) {
matrix.elements[coord + 4] = coefficient;
} else if (term.match("z")) {
matrix.elements[coord + 8] = coefficient;
} else {
matrix.elements[coord + 12] = coefficient;
}
}
}
const conversionMatrix4 = conversionMatrix.getMatrix4();
const conversionInverse = new Matrix4().getInverse(
conversionMatrix4,
true
);
matrix = new Matrix4().multiplyMatrices(matrix, conversionInverse);
matrix = new Matrix4().multiplyMatrices(conversionMatrix4, matrix);
modelData[modelData.length - 1].symmetries.push(matrix);
}
}
}
for (let i = 0; i < atoms.length; i++) {
if (
assignbonds &&
!(options.duplicateAssemblyAtoms && !options.dontConnectDuplicatedAtoms)
) {
assignPDBBonds(atoms[i], options);
}
computeSecondaryStructure(atoms[i], options.hbondCutoff);
processSymmetries(
modelData[i].symmetries,
atoms[i],
options,
modelData[i].cryst
);
if (
options.duplicateAssemblyAtoms &&
!options.dontConnectDuplicatedAtoms &&
assignbonds
)
assignPDBBonds(atoms[i], options);
}
return atoms;
}