The plan was to merge the hits in MID2 in order to inspire the robotic synthesis.
I aligned all protein chains to the reference, which I chose to be 7QRZ
.
That is because there may be density in chain A and not B and viceversa.
Some crystals have two molecules or alternative locations.
import pymol2
import os
import re, operator
import chempy
from typing import List, Set, Dict, Tuple
from IPython.display import clear_output
# ---------- get filenames ------------------------
aligned_folder = 'aligned'
filepaths = []
for folder in os.listdir(aligned_folder):
path: str = os.path.join(aligned_folder, folder)
if not os.path.isdir(path):
continue
filepaths.extend([os.path.join(path, filename) for filename in os.listdir(path)])
is_pdb = lambda filename: os.path.splitext(filename)[1] == '.pdb'
filepaths = list(filter(is_pdb, filepaths))
filepaths = sorted(filepaths)
# ---------- extract ------------------------
extract_name = lambda filepath: os.path.split(filepath)[-1].replace('.pdb', '').replace('MID2A-', '').strip()
with pymol2.PyMOL() as pymol:
pymol.cmd.fetch('7QRZ', '7QRZ')
# load
names: List[str] = []
for filepath in filepaths:
name = extract_name(filepath)
pymol.cmd.load(filepath, name)
names.append(name)
# ## align twice
neonames: List[str] = []
for name in names:
for chain in ('A', 'B'):
pymol.cmd.align(f'%{name} and polymer.protein and chain {chain}',
'7QRZ and polymer.protein and chain A')
neoname = f'{name}_{chain.lower()}'
pymol.cmd.create(neoname, f'{name} and resn LIG')
neonames.append(neoname)
pymol.cmd.delete(name)
names = neonames
# ## split double chain mols
neonames: List[str] = []
for name in names:
lig_atoms: List[chempy.Atom] = pymol.cmd.get_model(f'{name} and resn LIG').atom
chains: Set[str] = {atom.chain for atom in lig_atoms}
if len(chains) == 1:
neonames.append(name)
continue
for chain in chains:
neoname = name+chain.lower()
pymol.cmd.create(neoname, f'{name} and resn LIG and chain {chain}')
neonames.append(neoname)
pymol.cmd.delete(name)
names = neonames
# ## split partial occupancy molecules
# altLoc is now used.
neonames: List[str] = []
for name in names:
lig_atoms: List[chempy.Atom] = pymol.cmd.get_model(f'{name} and resn LIG').atom
occupancies: Dict[Tuple[str, str, str], List[float]] = {}
alt_occupancy = False
for atom in lig_atoms:
atom_handle = atom.resi, atom.chain, atom.name
if atom_handle in occupancies:
alt_occupancy = True
occupancies[atom_handle].append(atom.q)
else:
occupancies[atom_handle] = [atom.q]
if not alt_occupancy:
neonames.append(name)
continue
print('alt occupancy: ', name)
qs = {q for qs in occupancies.values() for q in qs}
assert len(qs) == 2, occupancies
for q in qs:
neoname = f'{name}q{q*100:.0f}'
pymol.cmd.create(neoname, f'{name} and resn LIG and q = {q}')
pymol.cmd.alter(neoname, 'alt=""')
neonames.append(neoname)
pymol.cmd.delete(name)
names = neonames
for name in names:
assert pymol.cmd.select(f'{name} and resn LIG'), f'{name} lacks residue LIG'
# ## remove all distant molecules
for mortiturus in pymol.cmd.get_object_list('resn LIG and not byres %7QRZ around 3'):
pymol.cmd.delete(mortiturus)
pymol.cmd.sort()
pymol.cmd.save('combined.pse')
Then I extracted the molecules from the PyMOL object into individual PDBs:
import os, pymol2
ref = '7QRZ'
if not os.path.exists('aligned_mols'):
os.mkdir('aligned_mols')
with pymol2.PyMOL() as pymol:
pymol.cmd.load('combined.pse')
pymol.cmd.delete(ref)
for name in pymol.cmd.get_object_list():
assert pymol.cmd.select(f'{name} and resn LIG')
pymol.cmd.save(f'aligned_mols/{name}.pdb', name)
Then I saved them as a sdf with extra properties:
import re, json
from fragmenstein import Victor
import os
import pandas as pd
metadata = pd.read_csv('metadata.csv', index_col=0)
mapping = metadata.set_index('crystal_name').smiles.to_dict()
imetadata = metadata.set_index('crystal_name')
filenames = [os.path.join('aligned_mols', filename) for filename in os.listdir('aligned_mols') if os.path.splitext(filename)[1] == '.pdb']
mols = {}
for filename in filenames:
name = re.search(r'(x\d+_\d[AB])', filename).group(1)
uname = re.search(r'(x\d+_\d[AB].*)\.pdb', filename).group(1)
#print(uname)
metadata_name = 'MID2A-'+name[:-1]+name[-1].upper()
if metadata_name not in mapping:
raise ValueError(name)
mol = Victor.extract_mol(ligand_resn='LIG', filepath=filename,
removeHs=True, smiles=mapping[metadata_name],
name=name
)
mol.SetProp('Expected_SMILES', mapping[metadata_name])
mol.SetProp('Short_name', name)
mol.SetProp('Occupancy', json.dumps([a.GetPDBResidueInfo().GetOccupancy() for a in mol.GetAtoms()]) )
mol.SetProp('TempFactor', json.dumps([a.GetPDBResidueInfo().GetTempFactor() for a in mol.GetAtoms()]) )
for k in ('alternate_name', 'site_name', 'pdb_entry'):
v = imetadata.loc[metadata_name][k]
if str(v) == 'nan':
continue
mol.SetProp(k, str(v))
mols[uname] = mol
# Save
with Chem.SDWriter('hits.sdf') as sdf:
for name, mol in mols.items():
sdf.write(mol)
Template minimisation
This is optional but nice to do:
import pyrosetta
import pyrosetta_help as ph
from types import ModuleType
from IPython.display import display, HTML
from gist_import import GistImporter
retrieve_giphy = GistImporter('25a53d54d3ba65919610bb34b188ac67')['retrieve_giphy']
prc: ModuleType = pyrosetta.rosetta.core
prcc: ModuleType = pyrosetta.rosetta.core.conformation
logger = ph.configure_logger()
pyrosetta.distributed.maybe_init(extra_options=ph.make_option_string(no_optH=False,
ex1=None,
ex2=None,
#mute='all',
ignore_unrecognized_res=True,
load_PDB_components=True,
ignore_waters=False)
)
retrieve_giphy('super saiyan')
Then:
map_filename = ph.download_map('7qrz')
pdb_filename = ph.download_pdb('7qrz')
pose: pyrosetta.Pose = pyrosetta.pose_from_file(pdb_filename)
from collections import Counter
residue: prcc.Residue
Counter([residue.annotated_name() for residue in pose.residues if not residue.is_protein()]).most_common()
[('w[HOH]', 123), ('v[HOH_V]', 11), ('Z[pdb_EDO]', 8), ('Z[pdb_SO4]', 1)]
# here is discuss how one sets up ED
#https://blog.matteoferla.com/2020/04/how-to-set-up-electron-density.html
ed = ph.prep_ED(pose, map_filename)
print(f'Pose fits to {ed.matchPose(pose):%}')
ph.do_local_relax(pose)
print(f'Pose fits to {ed.matchPose(pose):%}')
pose.dump_pdb('7qrz.minimised.pdb')
Not too shabby:
Pose fits to 80.922183%
Pose fits to 76.618978%
scorefxn = pyrosetta.get_fa_scorefxn()
scorefxn.set_weight(pyrosetta.rosetta.core.scoring.ScoreType.elec_dens_fast, 30)
relax = pyrosetta.rosetta.protocols.relax.FastRelax(scorefxn, 5)
relax.apply(pose)
print(f'Pose fits to {ed.matchPose(pose):%}')
pose.dump_pdb('7qrz.minimised2.pdb')
Pose fits to 76.979639%
Analysis
import logging, os
from rdkit import Chem
from typing import List
from fragmenstein import Victor, Laboratory, Igor
Igor.init_pyrosetta()
Victor.work_path = 'combined'
Victor.monster_throw_on_discard = True # stop this merger if a fragment cannot be used.
Victor.monster_joining_cutoff = 7 # Å
Victor.quick_reanimation = False # for the impatient
Victor.error_to_catch = Exception # stop the whole laboratory otherwise
#Victor.enable_stdout(logging.ERROR)
Victor.enable_logfile('combine.log', logging.INFO)
hits: List[Chem.Mol] = list(Chem.SDMolSupplier('hits.sdf'))
print(f'There are {len(hits)} hits')
template = open('7qrz.minimised.mod.pdb').read()
assert len(template)
# calculate !
lab = Laboratory(pdbblock=template, covalent_resi='531A')
combinations:pd.DataFrame = lab.combine(hits, n_cores=28)