Datasets uploaded by me for EV D68 3C protease (Matteo Ferla)

The datasets uploaded to Fragalysis by me for EV D68 3C protease are:

  • Manual-Submission
  • ROCS&PyRosetta-filtering
  • ArthorianQuest-sulfonamide
  • Fragmenstein-no_curation

The Manual-Submission dataset holds my manual submissions, made by picking 3-5 hits from the other three datasets and altering groups where needed, which happened for several. These were ordered.

The Fragmenstein-no_curation dataset was regenerated on Monday 24 July.
It is filtered and sorted, but not subjected to human curation.

This was done with the script fragmenstein_merge_sw_place.py:

```bash
python fragmenstein_merge_sw_place.py \
  --template reference.pdb \
  --hits D68EV3CPROA.corrected.sdf \
  --suffix '_v1' \
  --sw_databases Enamine-SC-Stock-Mar2022.smi.anon Enamine-BB-Stock-Mar2022.smi.anon REAL-Database-22Q1.smi.anon
```

Then in a notebook:

```python
import pandas as pd
from rdkit import Chem

with Chem.SDMolSupplier('D68EV3CPROA.corrected.sdf') as sdf_r:
    hits = [mol for mol in sdf_r if mol is not None]  # guard against unparsable records

placements: pd.DataFrame = pd.read_pickle('fragmenstein_placed_v1.pkl.gz')
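# `placements` has one row per Fragmenstein placement: the minimised pose
# (minimized_mol), its inspiration hits (hit_mols), scores such as ∆∆G and
# outcome, plus one tuple-named column per protein–ligand interaction.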

```

Added extras:

```python
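# Recover the parent-hit names for each placement from its inspiration Mol objects.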

placements['hit_names'] = placements.hit_mols\
                                    .apply(lambda v: v if isinstance(v, list) else [])\
                                    .apply(lambda v: [m.GetProp('_Name') for m in v])

from rdkit import DataStructs
from rdkit.Chem import AllChem
from rdkit.Chem import rdFingerprintGenerator as rdfpg

# Fingerprint each parent hit once, keyed by _Name, for the similarity scoring below.
fpgen = rdfpg.GetRDKitFPGenerator()
hit2fp = {h.GetProp('_Name'): fpgen.GetFingerprint(h) for h in hits}

def get_similarity(row):
    # Best Tanimoto similarity between the placement and any of its parent hits.
    if not isinstance(row.minimized_mol, Chem.Mol):
        return float('nan')
    fp = fpgen.GetFingerprint(AllChem.RemoveHs(row.minimized_mol))
    return max((DataStructs.TanimotoSimilarity(fp, hit2fp[name]) for name in row.hit_names),
               default=float('nan'))  # `default` guards against rows with no parent hits

placements['max_hit_Tanimoto'] = placements.apply(get_similarity, axis=1)
# Size of the largest ring in each placement (0 when acyclic); failed
# placements are swapped for an empty Mol placeholder first.
m = placements.minimized_mol.apply(lambda m: m if isinstance(m, Chem.Mol) else Chem.Mol())
placements['largest_ring'] = m.apply(lambda mol: max([0] + list(map(len, mol.GetRingInfo().AtomRings()))))
import operator
from typing import Dict, List

hit_intxns = pd.read_csv('../D68/hit-intxns.csv', index_col=0)
name2intxns: Dict[str, List[tuple]] = {
    'D68EV3CPROA-' + n.split('§')[0]: [
        (xn.split(':')[0], xn.split(':')[1], int(xn.split(':')[2]))
        for xn, xc in xs.items()
        if xc and xn != 'N_interactions'
    ]
    for n, xs in hit_intxns.to_dict(orient='index').items()
}
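# Assumed layout of hit-intxns.csv: index = hit name (optionally '§'-suffixed),
# columns = 'type:residue:number' interaction names plus an 'N_interactions'
# total, and cell values = counts of each interaction seen in that hit.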

def tally_hit_intxns(row: pd.Series):
    # Count how many of the parent hits' interactions the placement keeps
    # or loses, tallied across all of its parent hits.
    if not isinstance(row.minimized_mol, Chem.Mol):
        return float('nan'), float('nan')
    present_tally = 0
    absent_tally = 0
    for hit_name in row.hit_names:
        for intxn_name in name2intxns[hit_name]:
            if intxn_name not in row.index:
                print(f'missing {intxn_name}')
                absent_tally += 1
            elif row[intxn_name] == 0:
                absent_tally += 1
            else:
                present_tally += 1
    return (present_tally, absent_tally)
                

hit_checks = placements.apply(tally_hit_intxns, axis=1)
placements['N_interactions_kept'] = hit_checks.apply(operator.itemgetter(0)).fillna(0).astype(int)
placements['N_interactions_lost'] = hit_checks.apply(operator.itemgetter(1)).fillna(99).astype(int)
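# Placements that failed get NaN tallies; these default to 0 interactions
# kept and 99 lost, so they sink in any ranking on these columns.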

intxn_names = [c for c in placements.columns if isinstance(c, tuple)]
tallies = placements[intxn_names].sum()
def ratioed(row, k=.5):
    # Each interaction contributes (count / dataset-wide total)**k, so rarer
    # interactions count for more; a higher score means a more unusual profile.
    return sum((row[name] / tallies[name])**k for name in intxn_names)

placements['interaction_uniqueness_metric'] = placements.apply(ratioed, axis=1)

from rdkit.Chem import PandasTools, Draw
from rdkit import DataStructs
from rdkit.ML.Cluster import Butina
from rdkit.Chem import rdMolDescriptors as rdmd
from rdkit.Chem import Descriptors
from IPython.display import HTML

def butina_cluster(mol_list, cutoff=0.35):
    # Taylor-Butina clustering on 2048-bit Morgan (radius 3) fingerprints;
    # `cutoff` is a Tanimoto distance.
    # https://github.com/PatWalters/workshop/blob/master/clustering/taylor_butina.ipynb
    fp_list = [rdmd.GetMorganFingerprintAsBitVect(AllChem.RemoveAllHs(m), 3, nBits=2048) for m in mol_list]
    dists = []
    nfps = len(fp_list)
    for i in range(1,nfps):
        sims = DataStructs.BulkTanimotoSimilarity(fp_list[i],fp_list[:i])
        dists.extend([1-x for x in sims])
    mol_clusters = Butina.ClusterData(dists,nfps,cutoff,isDistData=True)
    cluster_id_list = [0]*nfps
    for idx,cluster in enumerate(mol_clusters,1):
        for member in cluster:
            cluster_id_list[member] = idx
    return cluster_id_list

m = placements.minimized_mol.apply(lambda m: m if isinstance(m, Chem.Mol) else Chem.Mol())
placements['cluster'] = butina_cluster(m.to_list())

import numpy as np
from scipy.spatial import distance
from sklearn.cluster import KMeans

data = placements[[c for c in placements.columns if isinstance(c, tuple)]].values
kmeans = KMeans(n_clusters=50, random_state=42)
kmeans.fit(data)
placements['interaction_cluster'] = kmeans.predict(data)
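# `interaction_cluster`: k-means grouping of placements by their raw
# interaction-column pattern (n_clusters=50).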

```

Finally, I filtered by weights pulled out of a hat…

```python

import plotly.express as px

weights = {'N_rotatable_bonds': 2,
           '∆∆G': +3,
           'interaction_uniqueness_metric': -10,
           'N_unconstrained_atoms': 1,
           'N_constrained_atoms': -0.5,
           #'N_HA': 
           #'LE': -2,
           'N_interactions': -5,
           'N_interactions_lost': +5,  # zero would raise error ...
           'max_hit_Tanimoto': -1
          }
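# Sign convention: positive weights penalise a property, negative ones reward
# it; lower ad_hoc_penalty is better (the final table is sorted ascending).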

def weight_split(row):
    return [round(row[col] / w, 1) for col, w in weights.items()]

def penalize(row):
    penalty = 0
    if row.outcome != 'acceptable':
        return float('nan')
    for col, w in weights.items():
        penalty += row[col] * w
    return penalty

placements['ad_hoc_penalty'] = placements.apply(penalize, axis=1)

fig = px.density_heatmap(placements.loc[placements.outcome == 'acceptable'], 
                         'N_HA', 'ad_hoc_penalty',
                         marginal_x="histogram", marginal_y="histogram")
fig.show()

placements\
.loc[placements.outcome == 'acceptable']\
.sort_values('ad_hoc_penalty')\
[['name', 'ad_hoc_penalty', 'smiles', 'N_HA'] + list(weights.keys())]
```
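For completeness, the top-ranked placements can be written out with PandasTools (imported above); a minimal sketch, where the output file name, the 100-row cut-off and the exported properties are illustrative choices of mine rather than the recorded upload format:

```python
from rdkit.Chem import PandasTools

top = (placements
       .loc[placements.outcome == 'acceptable']
       .sort_values('ad_hoc_penalty')
       .head(100)  # illustrative cut-off
       .copy())
# Writes one SDF record per row, using the minimised pose as the structure.
PandasTools.WriteSDF(top, 'fragmenstein_top100.sdf',  # hypothetical file name
                     molColName='minimized_mol', idName='name',
                     properties=['smiles', 'ad_hoc_penalty'])
```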

Iteration 3

As above, but as iteration 1 yielded only 3 of the ~20 compounds (twenty or fewer), there was not much extra to go by. So it was a merger → search → place run on both the full structures and the synthons of the new hits plus the previous hits (a sketch of assembling the inputs follows the hit list below).

New hits:

```python
new_hits = [
    'D68EV3CPROA-x0147_0A', 'D68EV3CPROA-x0232_1B', 'D68EV3CPROA-x0771_0A',
    'D68EV3CPROA-x0789_0A', 'D68EV3CPROA-x0980_0B', 'D68EV3CPROA-x1020_0A',
    'D68EV3CPROA-x1052_1A', 'D68EV3CPROA-x1083_0A', 'D68EV3CPROA-x1140_0A',
    'D68EV3CPROA-x1285_0B', 'D68EV3CPROA-x1305_0B', 'D68EV3CPROA-x1453_0B',
    'D68EV3CPROA-x1454_0B', 'D68EV3CPROA-x1498_0A', 'D68EV3CPROA-x1594_0A',
    'D68EV3CPROA-x1604_0A', 'D68EV3CPROA-x1537_0A', 'D68EV3CPROA-x1247_0A',
]
```
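To assemble the inputs for this round, the new hits can be pulled out of the corrected hit SDF by name; a minimal sketch, where iteration3_hits.sdf is a hypothetical file name and the combination with the previous hits is implied rather than shown:

```python
from rdkit import Chem

wanted = set(new_hits)
with Chem.SDMolSupplier('D68EV3CPROA.corrected.sdf') as sdf_r:
    selection = [mol for mol in sdf_r
                 if mol is not None and mol.GetProp('_Name') in wanted]

# Hypothetical output, passed to fragmenstein_merge_sw_place.py via --hits.
with Chem.SDWriter('iteration3_hits.sdf') as sdf_w:
    for mol in selection:
        sdf_w.write(mol)
```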

The ranking weights (penalty-based) were changed to:

```python
weights = {'N_rotatable_bonds': 3,
           '∆∆G': 5,
           'interaction_uniqueness_metric': -20,
           'N_unconstrained_atoms': 0.25,
           'N_constrained_atoms': -0.5,
           'N_interactions': -5,
           'N_interactions_lost': 10,
           'max_hit_Tanimoto': -1,
           'N_PAINS': 20,
           'strain_per_HA': 1}
```
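Since penalize() reads the module-level weights, rebinding the dictionary above and re-applying it is all that is needed to refresh the ranking; a minimal sketch, assuming the new N_PAINS and strain_per_HA columns are already present in placements:

```python
# Re-rank with the iteration-3 weights; penalize() picks up the rebound dict.
placements['ad_hoc_penalty'] = placements.apply(penalize, axis=1)
placements.loc[placements.outcome == 'acceptable'].sort_values('ad_hoc_penalty')
```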

The P1-ridge subset comprises analogues suggested to interact with (161, 142) and (164, 162).