Analysis by Matteo (v.1)

The plan was to merge the hits in MID2 in order to inspire the robotic synthesis.

I aligned all protein chains to the reference, which I chose to be 7QRZ.
That is because there may be density in chain A and not B and viceversa.
Some crystals have two molecules or alternative locations.

import pymol2
import os
import re, operator
import chempy
from typing import List, Set, Dict, Tuple
from IPython.display import clear_output

# ---------- get filenames ------------------------

aligned_folder = 'aligned'

filepaths = []
for folder in os.listdir(aligned_folder):
    path: str = os.path.join(aligned_folder, folder)
    if not os.path.isdir(path):
        continue
    filepaths.extend([os.path.join(path, filename) for filename in os.listdir(path)])

is_pdb = lambda filename: os.path.splitext(filename)[1] == '.pdb'

filepaths = list(filter(is_pdb, filepaths))
filepaths = sorted(filepaths)

# ---------- extract ------------------------
extract_name = lambda filepath: os.path.split(filepath)[-1].replace('.pdb', '').replace('MID2A-', '').strip()

with pymol2.PyMOL() as pymol: 
    pymol.cmd.fetch('7QRZ', '7QRZ')
    
    # load
    names: List[str] = []
    for filepath in filepaths:
        name = extract_name(filepath)
        pymol.cmd.load(filepath, name)
        names.append(name)
        
    # ## align twice
    neonames: List[str] = []
    for name in names:
        for chain in ('A', 'B'):
            pymol.cmd.align(f'%{name} and polymer.protein and chain {chain}',
                             '7QRZ and polymer.protein and chain A')
            neoname = f'{name}_{chain.lower()}'
            pymol.cmd.create(neoname, f'{name} and resn LIG')
            neonames.append(neoname)
        pymol.cmd.delete(name)    
    names = neonames
    
    # ## split double chain mols
    neonames: List[str] = []
    for name in names:
        lig_atoms: List[chempy.Atom] = pymol.cmd.get_model(f'{name} and resn LIG').atom
        chains: Set[str] = {atom.chain for atom in lig_atoms}
        if len(chains) == 1:
            neonames.append(name)
            continue
        for chain in chains:
            neoname = name+chain.lower()
            pymol.cmd.create(neoname, f'{name} and resn LIG and chain {chain}')
            neonames.append(neoname)
        pymol.cmd.delete(name)
    names = neonames
    
    # ## split partial occupancy molecules
    # altLoc is now used.
    neonames: List[str] = []
    for name in names:
        lig_atoms: List[chempy.Atom] = pymol.cmd.get_model(f'{name} and resn LIG').atom
        occupancies: Dict[Tuple[str, str, str], List[float]] = {}
        alt_occupancy = False
        for atom in lig_atoms:
            atom_handle = atom.resi, atom.chain, atom.name
            if atom_handle in occupancies:
                alt_occupancy = True
                occupancies[atom_handle].append(atom.q)
            else:
                occupancies[atom_handle] = [atom.q]
        if not alt_occupancy:
            neonames.append(name)
            continue
        print('alt occupancy: ', name)
        qs = {q for qs in occupancies.values() for q in qs}
        assert len(qs) == 2, occupancies
        for q in qs:
            neoname = f'{name}q{q*100:.0f}'
            pymol.cmd.create(neoname, f'{name} and resn LIG and q = {q}')
            pymol.cmd.alter(neoname, 'alt=""')
            neonames.append(neoname)
        pymol.cmd.delete(name)
    names = neonames
    
    for name in names:
        assert pymol.cmd.select(f'{name} and resn LIG'), f'{name} lacks residue LIG'

    # ## remove all distant molecules
    for mortiturus in pymol.cmd.get_object_list('resn LIG and not byres %7QRZ around 3'):
        pymol.cmd.delete(mortiturus)
    pymol.cmd.sort()
    pymol.cmd.save('combined.pse')

Then I extracted the molecules from the PyMOL object into individual PDBs:

import os, pymol2
ref = '7QRZ'

if not os.path.exists('aligned_mols'):
    os.mkdir('aligned_mols')

with pymol2.PyMOL() as pymol:
    pymol.cmd.load('combined.pse')
    pymol.cmd.delete(ref)
    for name in pymol.cmd.get_object_list():
        assert pymol.cmd.select(f'{name} and resn LIG')
        pymol.cmd.save(f'aligned_mols/{name}.pdb', name)     

Then I saved them as a sdf with extra properties:

import re, json
from fragmenstein import Victor
import os
import pandas as pd



metadata = pd.read_csv('metadata.csv', index_col=0)
mapping = metadata.set_index('crystal_name').smiles.to_dict()
imetadata = metadata.set_index('crystal_name')

filenames = [os.path.join('aligned_mols', filename) for filename in os.listdir('aligned_mols') if os.path.splitext(filename)[1] == '.pdb']
mols = {}
for filename in filenames:
    name = re.search(r'(x\d+_\d[AB])', filename).group(1)
    uname = re.search(r'(x\d+_\d[AB].*)\.pdb', filename).group(1)
    #print(uname)
    metadata_name = 'MID2A-'+name[:-1]+name[-1].upper()
    if metadata_name not in mapping:
        raise ValueError(name)
    mol =  Victor.extract_mol(ligand_resn='LIG', filepath=filename,
                              removeHs=True, smiles=mapping[metadata_name], 
                              name=name
                             )
    mol.SetProp('Expected_SMILES', mapping[metadata_name])
    mol.SetProp('Short_name', name)
    mol.SetProp('Occupancy', json.dumps([a.GetPDBResidueInfo().GetOccupancy() for a in mol.GetAtoms()]) )
    mol.SetProp('TempFactor', json.dumps([a.GetPDBResidueInfo().GetTempFactor() for a in mol.GetAtoms()]) )
    for k in ('alternate_name', 'site_name', 'pdb_entry'):
        v = imetadata.loc[metadata_name][k]
        if str(v) == 'nan':
            continue
        mol.SetProp(k, str(v))
    mols[uname] = mol

# Save

with Chem.SDWriter('hits.sdf') as sdf:
    for name, mol in mols.items():
        sdf.write(mol)

Template minimisation

This is optional but nice to do:

import pyrosetta
import pyrosetta_help as ph
from types import ModuleType
from IPython.display import display, HTML
from gist_import import GistImporter

retrieve_giphy = GistImporter('25a53d54d3ba65919610bb34b188ac67')['retrieve_giphy']

prc: ModuleType = pyrosetta.rosetta.core
prcc: ModuleType = pyrosetta.rosetta.core.conformation

logger = ph.configure_logger()
pyrosetta.distributed.maybe_init(extra_options=ph.make_option_string(no_optH=False,
                                                ex1=None,
                                                ex2=None,
                                                #mute='all',
                                                ignore_unrecognized_res=True,
                                                load_PDB_components=True,
                                                ignore_waters=False)
                               )
retrieve_giphy('super saiyan')

Then:

map_filename = ph.download_map('7qrz')
pdb_filename = ph.download_pdb('7qrz')
pose: pyrosetta.Pose = pyrosetta.pose_from_file(pdb_filename)
from collections import Counter
residue: prcc.Residue
Counter([residue.annotated_name() for residue in pose.residues if not residue.is_protein()]).most_common()
 [('w[HOH]', 123), ('v[HOH_V]', 11), ('Z[pdb_EDO]', 8), ('Z[pdb_SO4]', 1)]
# here is discuss how one sets up ED
#https://blog.matteoferla.com/2020/04/how-to-set-up-electron-density.html
ed = ph.prep_ED(pose, map_filename)
print(f'Pose fits to {ed.matchPose(pose):%}')
ph.do_local_relax(pose)
print(f'Pose fits to {ed.matchPose(pose):%}')
pose.dump_pdb('7qrz.minimised.pdb')

Not too shabby:

Pose fits to 80.922183% 
Pose fits to 76.618978%
scorefxn = pyrosetta.get_fa_scorefxn()
scorefxn.set_weight(pyrosetta.rosetta.core.scoring.ScoreType.elec_dens_fast, 30)
relax = pyrosetta.rosetta.protocols.relax.FastRelax(scorefxn, 5)
relax.apply(pose)
print(f'Pose fits to {ed.matchPose(pose):%}')
pose.dump_pdb('7qrz.minimised2.pdb')
Pose fits to 76.979639%

Analysis

import logging, os
from rdkit import Chem
from typing import List
from fragmenstein import Victor, Laboratory, Igor

Igor.init_pyrosetta()

Victor.work_path = 'combined'
Victor.monster_throw_on_discard = True  # stop this merger if a fragment cannot be used.
Victor.monster_joining_cutoff = 7  # Å
Victor.quick_reanimation = False  # for the impatient
Victor.error_to_catch = Exception  # stop the whole laboratory otherwise
#Victor.enable_stdout(logging.ERROR)
Victor.enable_logfile('combine.log', logging.INFO)



hits: List[Chem.Mol] = list(Chem.SDMolSupplier('hits.sdf'))
print(f'There are {len(hits)} hits')
template = open('7qrz.minimised.mod.pdb').read()

assert len(template)

# calculate !
lab = Laboratory(pdbblock=template, covalent_resi='531A')
combinations:pd.DataFrame = lab.combine(hits, n_cores=28)