Skip to content

datamol

All the below functions are accessible under datamol.FUNCTION_NAME.

datamol._viz

datamol.viz._viz.to_image(mols, legends=None, n_cols=4, use_svg=False, mol_size=(200, 200), highlight_atom=None, highlight_bond=None, outfile=None, max_mols=32, copy=False, indices=False)

Generate an image out of a molecule or a list of molecules.

Parameters:

Name Type Description Default
mols Union[List[rdkit.Chem.rdchem.Mol], rdkit.Chem.rdchem.Mol]

one or a list of molecules.

required
legends Union[List[Optional[str]], str]

a string or a list of string as legend for every molecules.

None
n_cols int

number of molecules per row.

4
use_svg bool

whether to output an SVG (or a PNG).

False
mol_size Union[Tuple[int, int], int]

a int or a tuple of int defining the size per molecule.

(200, 200)
highlight_atom List[List[int]]

atom to highlight.

None
highlight_bond List[List[int]]

bonds to highlight.

None
outfile str

path where to save the image (local or remote path).

None
max_mols int

the maximum number of molecules to display.

32
copy bool

whether to copy the molecules or not.

False
indices bool

Whether to draw the atom indices.

False
Source code in datamol/viz/_viz.py
def to_image(
    mols: Union[List[Chem.rdchem.Mol], Chem.rdchem.Mol],
    legends: Union[List[Union[str, None]], str, None] = None,
    n_cols: int = 4,
    use_svg: bool = False,
    mol_size: Union[Tuple[int, int], int] = (200, 200),
    highlight_atom: List[List[int]] = None,
    highlight_bond: List[List[int]] = None,
    outfile: str = None,
    max_mols: int = 32,
    copy: bool = False,
    indices: bool = False,
):
    """Generate an image out of a molecule or a list of molecules.

    Args:
        mols: one or a list of molecules.
        legends: a string or a list of strings used as legends for the molecules.
        n_cols: number of molecules per row of the grid (forwarded as `molsPerRow`).
            It is reduced to the number of molecules when fewer are drawn.
        use_svg: whether to output an SVG (or a PNG).
        mol_size: an int or a tuple of int defining the size per molecule.
        highlight_atom: atoms to highlight, as one list of atom indices per
            molecule. A flat list of ints is wrapped into a one-element list.
        highlight_bond: bonds to highlight, as one list of bond indices per
            molecule. A flat list of ints is wrapped into a one-element list.
        outfile: path where to save the image (local or remote path).
        max_mols: the maximum number of molecules to display. `None` disables
            the truncation.
        copy: whether to copy the molecules or not.
        indices: Whether to draw the atom indices.

    Returns:
        The image object returned by `Draw.MolsToGridImage`.
    """

    # Normalize the scalar forms of the arguments to their sequence forms.
    if isinstance(mol_size, int):
        mol_size = (mol_size, mol_size)

    if isinstance(mols, Chem.rdchem.Mol):
        mols = [mols]

    if isinstance(legends, str):
        legends = [legends]

    if copy:
        mols = [dm.copy_mol(mol) for mol in mols]

    if max_mols is not None:
        mols = mols[:max_mols]

        if legends is not None:
            legends = legends[:max_mols]

    if indices is True:
        # The return value is discarded: presumably the helper annotates the
        # molecule in place (use `copy=True` to protect the inputs).
        for mol in mols:
            dm.atom_indices_to_mol(mol)

    # A flat list of indices is interpreted as the highlight of a single molecule.
    # The length check avoids an IndexError on an empty list.
    _highlight_atom = highlight_atom
    if (
        highlight_atom is not None
        and len(highlight_atom) > 0
        and isinstance(highlight_atom[0], int)
    ):
        _highlight_atom = [highlight_atom]

    _highlight_bond = highlight_bond
    if (
        highlight_bond is not None
        and len(highlight_bond) > 0
        and isinstance(highlight_bond[0], int)
    ):
        _highlight_bond = [highlight_bond]

    # Don't make the image wider than the number of molecules to draw
    if len(mols) < n_cols:
        n_cols = len(mols)

    image = Draw.MolsToGridImage(
        mols,
        legends=legends,
        molsPerRow=n_cols,
        useSVG=use_svg,
        subImgSize=mol_size,
        highlightAtomLists=_highlight_atom,
        highlightBondLists=_highlight_bond,
    )

    if outfile is not None:
        with fsspec.open(outfile, "wb") as f:
            if use_svg:
                if isinstance(image, str):
                    # in a terminal process
                    f.write(image.encode())
                else:
                    # in a jupyter kernel process
                    f.write(image.data.encode())  # type: ignore
            else:
                if isinstance(image, PIL.PngImagePlugin.PngImageFile):  # type: ignore
                    # in a terminal process
                    image.save(f)
                else:
                    # in a jupyter kernel process
                    f.write(image.data)  # type: ignore

    return image

datamol.cluster

datamol.cluster.assign_to_centroids(mols, centroids, feature_fn=None, dist_fn=None, n_jobs=1)

Assign molecules to centroids. Each molecule will be assigned to the closest centroid.

Parameters:

Name Type Description Default
mols List[rdkit.Chem.rdchem.Mol]

a list of molecules to assign to centroids

required
centroids List[rdkit.Chem.rdchem.Mol]

list of molecules to use as centroid

required
feature_fn Callable

A feature function that takes a Chem.rdchem.Mol object and return molecular features. By default, the dm.to_fp() is used. Default to None.

None
dist_fn Callable

A function that takes two indexes (i,j) and return the distance between them. You might use partial to set the fingerprints as input. By default, the Tanimoto similarity will be used. Default to None.

None
n_jobs Optional[int]

Number of jobs for parallelization. Set to 1 for no parallelization. Set to None to use all available cores.

1

Returns:

Type Description
clusters_map

dict of index mapping each centroid index to the molecule index in the cluster clusters_list: list of all molecules in each cluster. The cluster index follows the index of the centroid. Note that the centroid molecule is not added to the cluster.

Source code in datamol/cluster.py
def assign_to_centroids(
    mols: List[Chem.rdchem.Mol],
    centroids: List[Chem.rdchem.Mol],
    feature_fn: Callable = None,
    dist_fn: Callable = None,
    n_jobs: Optional[int] = 1,
):
    r"""Assign molecules to centroids. Each molecule will be assigned to the closest centroid.

    Args:
        mols: a list of molecules to assign to centroids
        centroids: list of molecules to use as centroid
        feature_fn: A feature function that takes a Chem.rdchem.Mol object
            and return molecular features. By default, the `dm.to_fp()` is used.
            Default to None.
        dist_fn: A function that takes two indexes (i,j) and return the
            distance between them. The indexes refer to positions in the
            concatenation `mols + centroids` (centroid `k` has index `len(mols) + k`).
            You might use partial to set the fingerprints as input.
            By default, the Tanimoto similarity will be used. Default to None.
        n_jobs: Number of jobs for parallelization. Set to 1 for no
            parallelization. Set to None to use all available cores.

    Returns:
        clusters_map: dict of index mapping each centroid index to the molecule index in the cluster
        clusters_list: list of all molecules in each cluster. The cluster index follows the index of the centroid.
            Note that the centroid molecule is not added to the cluster.
    """

    if feature_fn is None:
        feature_fn = functools.partial(dm.to_fp, as_array=False)

    # Features are computed once for the queries followed by the centroids.
    all_mols = list(mols) + list(centroids)
    features = dm.parallelized(feature_fn, all_mols, n_jobs=n_jobs)

    def distij(i, j, features=features):
        return 1.0 - DataStructs.TanimotoSimilarity(features[int(i)], features[int(j)])

    if dist_fn is None:
        dist_fn = distij

    def _index_metric(u, v):
        # `cdist` hands each row over as a 1-D array holding a single index:
        # unwrap it so `dist_fn` receives the two plain integer indexes it documents.
        return dist_fn(int(u[0]), int(v[0]))

    clusters_map = ddict(list)
    clusters_list = [[] for _ in centroids]
    query_inds = np.expand_dims(np.arange(len(mols), dtype=int), axis=1)
    centroid_inds = np.expand_dims(np.arange(len(centroids), dtype=int), axis=1) + len(mols)

    # Use the selected distance function (previously a custom `dist_fn` was ignored).
    dist_mat = distance.cdist(query_inds, centroid_inds, metric=_index_metric)
    closest = np.argmin(dist_mat, axis=1)
    for ind, cluster_ind in enumerate(closest):  # type: ignore
        clusters_map[cluster_ind].append(ind)
        clusters_list[cluster_ind].append(mols[ind])
    return clusters_map, clusters_list

datamol.cluster.cluster_mols(mols, cutoff=0.2, feature_fn=None, n_jobs=1)

Cluster a set of molecules using the butina clustering algorithm and a given threshold.

Parameters:

Name Type Description Default
mols List[rdkit.Chem.rdchem.Mol]

a list of molecules.

required
cutoff float

Cutoff for the clustering. Default to 0.2.

0.2
feature_fn Callable

A feature function that takes a Chem.rdchem.Mol object and return molecular features. By default, the dm.to_fp() is used. Default to None.

None
n_jobs Optional[int]

Number of jobs for parallelization. Set to 1 for no parallelization. Set to None to use all available cores.

1
Source code in datamol/cluster.py
def cluster_mols(
    mols: List[Chem.rdchem.Mol],
    cutoff: float = 0.2,
    feature_fn: Callable = None,
    n_jobs: Optional[int] = 1,
):
    """Cluster a set of molecules using the butina clustering algorithm and a given threshold.

    Args:
        mols: a list of molecules.
        cutoff: Cutoff for the clustering. Default to 0.2.
        feature_fn: A feature function that takes a Chem.rdchem.Mol object
            and return molecular features. By default, the `dm.to_fp()` is used.
            Default to None.
        n_jobs: Number of jobs for parallelization. Set to 1 for no
            parallelization. Set to None to use all available cores.

    Returns:
        cluster_indices: the clusters returned by `Butina.ClusterData`, expressed
            as indexes into `mols`.
        cluster_mols: for every cluster, the list of the corresponding elements of `mols`.
    """

    if feature_fn is None:
        feature_fn = functools.partial(dm.to_fp, as_array=False)

    features = dm.parallelized(feature_fn, mols, n_jobs=n_jobs)

    # Flattened lower-triangular distance matrix: the chunk added at step `i`
    # holds the distances between element `i` and elements `0..i-1`.
    dists = []
    n_mols = len(mols)

    for i in range(1, n_mols):
        dists.extend(
            DataStructs.BulkTanimotoSimilarity(features[i], features[:i], returnDistance=True)
        )

    # now cluster the data
    cluster_indices = Butina.ClusterData(dists, n_mols, cutoff, isDistData=True)

    # Every cluster is materialized as a list, whatever its size and whatever
    # the type of the elements of `mols` (no special case for singletons).
    clusters = [[mols[idx] for idx in cluster] for cluster in cluster_indices]

    return cluster_indices, clusters

datamol.cluster.pick_centroids(mols, npick=0, initial_picks=None, threshold=0.5, feature_fn=None, dist_fn=None, seed=42, method='sphere', n_jobs=1)

Pick a set of npick centroids from a list of molecules.

Parameters:

Name Type Description Default
mols List[rdkit.Chem.rdchem.Mol]

a list of molecules.

required
npick int

Number of element to pick from mols, including the preselection.

0
threshold float

Minimum distance between centroids for maxmin and sphere exclusion (sphere) methods.

0.5
initial_picks List[int]

Starting list of index for molecules that should be in the set of picked molecules. Default to None.

None
feature_fn Callable

A feature function that takes a Chem.rdchem.Mol object and return molecular features. By default, the dm.to_fp() is used. Default to None.

None
dist_fn Callable

A function that takes two indexes (i,j) and return the distance between them. You might use partial to set the fingerprints as input. By default, the Tanimoto similarity will be used. Default to None.

None
seed int

seed for reproducibility

42
method str

Picking method to use. One of sphere, maxmin or any supported rdkit hierarchical clustering method such as centroid, clink, upgma

'sphere'
n_jobs Optional[int]

Number of jobs for parallelization. Set to 1 for no parallelization. Set to None to use all available cores.

1

Returns:

Type Description
picked_inds

index of the molecule that have been selected as centroids mols: molecules that have been picked

Source code in datamol/cluster.py
def pick_centroids(
    mols: List[Chem.rdchem.Mol],
    npick: int = 0,
    initial_picks: List[int] = None,
    threshold: float = 0.5,
    feature_fn: Callable = None,
    dist_fn: Callable = None,
    seed: int = 42,
    method: str = "sphere",
    n_jobs: Optional[int] = 1,
):
    r"""Pick a set of `npick` centroids from a list of molecules.

    Args:
        mols: a list of molecules.
        npick: Number of element to pick from mols, including the preselection.
        threshold: Minimum distance between centroids for `maxmin` and sphere exclusion (`sphere`) methods.
        initial_picks: Starting list of index for molecules that should be in the
            set of picked molecules. Default to None.
        feature_fn: A feature function that takes a Chem.rdchem.Mol object
            and return molecular features. By default, the `dm.to_fp()` is used.
            Default to None.
        dist_fn: A function that takes two indexes (i,j) and return the
            distance between them. You might use partial to set the fingerprints as input.
            By default, the Tanimoto similarity will be used. Default to None.
        seed: seed for reproducibility
        method: Picking method to use. One of  `sphere`, `maxmin` or any
            supported rdkit hierarchical clustering method such as `centroid`, `clink`, `upgma`
        n_jobs: Number of jobs for parallelization. Set to 1 for no
            parallelization. Set to None to use all available cores.

    Returns:
        picked_inds: index of the molecule that have been selected as centroids
        mols: molecules that have been picked

    Raises:
        ValueError: if `method` is not supported, or if a hierarchical method
            is requested with `npick` equal to 0.
    """

    n_mols = len(mols)
    if feature_fn is None:
        feature_fn = functools.partial(dm.to_fp, as_array=False)

    features = dm.parallelized(feature_fn, mols, n_jobs=n_jobs)

    def distij(i, j, features=features):
        return 1.0 - DataStructs.TanimotoSimilarity(features[i], features[j])

    if dist_fn is None:
        dist_fn = distij

    initial_picks = [] if initial_picks is None else initial_picks

    if method == "maxmin":
        picker = MaxMinPicker()
        picked_inds, _ = picker.LazyPickWithThreshold(
            dist_fn,
            n_mols,
            pickSize=npick,
            threshold=threshold,
            firstPicks=initial_picks,
            seed=seed,
        )

    elif method == "sphere":
        picker = LeaderPicker()
        picked_inds = picker.LazyPick(
            dist_fn, n_mols, threshold=threshold, pickSize=npick, firstPicks=initial_picks
        )

    elif method.upper() in ClusterMethod.names.keys() and npick:
        if initial_picks:
            logger.warning(
                "Initial picks is not supported by hierarchical clustering. You pick has been discarded."
            )

        # Lower-triangular (i > j) index pairs, converted to plain ints so a
        # custom `dist_fn` receives the same kind of indexes as in the other branches.
        index_pairs = [(int(i), int(j)) for i, j in zip(*np.tril_indices(n_mols, k=-1))]

        # Use the selected distance function (previously a custom `dist_fn`
        # was ignored by the hierarchical methods).
        dist_mat = dm.parallelized(dist_fn, index_pairs, arg_type="args")
        dist_mat = np.asarray(dist_mat)
        picker = HierarchicalClusterPicker(ClusterMethod.names[method.upper()])
        picked_inds = picker.Pick(dist_mat, n_mols, npick)
    else:
        raise ValueError(f"Picking method {method} with {npick} elements to pick is not supported.")
    picked_inds = np.array(picked_inds)
    picked_mols = [mols[x] for x in picked_inds]

    return picked_inds, picked_mols

datamol.cluster.pick_diverse(mols, npick, initial_picks=None, feature_fn=None, dist_fn=None, seed=42, n_jobs=1)

Pick a set of diverse molecules based on their fingerprint.

Parameters:

Name Type Description Default
mols List[rdkit.Chem.rdchem.Mol]

a list of molecules.

required
npick int

Number of element to pick from mols, including the preselection.

required
initial_picks List[int]

Starting list of index for molecules that should be in the set of picked molecules. Default to None.

None
feature_fn Callable

A feature function that takes a Chem.rdchem.Mol object and return molecular features. By default, the dm.to_fp() is used. Default to None.

None
dist_fn Callable

A function that takes two indexes (i,j) and return the distance between them. You might use partial to set the fingerprints as input. By default, the Tanimoto similarity will be used. Default to None.

None
seed int

seed for reproducibility

42
n_jobs Optional[int]

Number of jobs for parallelization. Set to 1 for no parallelization. Set to None to use all available cores.

1

Returns:

Type Description
picked_inds

index of the molecule that have been picked mols: molecules that have been picked

Source code in datamol/cluster.py
def pick_diverse(
    mols: List[Chem.rdchem.Mol],
    npick: int,
    initial_picks: List[int] = None,
    feature_fn: Callable = None,
    dist_fn: Callable = None,
    seed: int = 42,
    n_jobs: Optional[int] = 1,
):
    r"""Select a diverse subset of molecules with the MaxMin picker.

    Args:
        mols: a list of molecules.
        npick: number of elements to pick from `mols`, preselection included.
        initial_picks: indexes of the molecules that must be part of the
            selection. `None` means no preselection.
        feature_fn: function computing the features of a single molecule.
            When `None`, `dm.to_fp()` (with `as_array=False`) is used.
        dist_fn: function taking two indexes (i, j) and returning the distance
            between the corresponding molecules. When `None`, one minus the
            Tanimoto similarity of the computed features is used.
        seed: seed for reproducibility.
        n_jobs: number of jobs used to compute the features. Set to 1 for no
            parallelization. Set to None to use all available cores.

    Returns:
        picked_inds: indexes of the molecules that have been picked.
        mols: molecules that have been picked.
    """

    if feature_fn is None:
        feature_fn = functools.partial(dm.to_fp, as_array=False)

    fingerprints = dm.parallelized(feature_fn, mols, n_jobs=n_jobs)

    def _tanimoto_distance(a, b):
        return 1.0 - DataStructs.TanimotoSimilarity(fingerprints[a], fingerprints[b])

    distance_fn = _tanimoto_distance if dist_fn is None else dist_fn

    if initial_picks is None:
        initial_picks = []

    selection = MaxMinPicker().LazyPick(
        distance_fn, len(mols), npick, firstPicks=initial_picks, seed=seed
    )
    picked_inds = np.array(selection)

    return picked_inds, [mols[idx] for idx in picked_inds]

datamol.convert

datamol.convert.from_df(df, smiles_column='smiles', mol_column=None, conserve_smiles=False, sanitize=True)

Convert a dataframe to a list of mols.

Note

If smiles_column is used to build the molecules, this property is removed from the molecules' properties. You can decide to conserve the SMILES column by setting conserve_smiles to True.

Parameters:

Name Type Description Default
df DataFrame

a dataframe.

required
smiles_column Optional[str]

Column name to extract the molecule.

'smiles'
mol_column str

Column name to extract the molecule. It takes precedence over smiles_column.

None
conserve_smiles bool

Whether to conserve the SMILES in the mols' props.

False
sanitize bool

Whether to sanitize if smiles_column is not None.

True
Source code in datamol/convert.py
def from_df(
    df: pd.DataFrame,
    smiles_column: Optional[str] = "smiles",
    mol_column: str = None,
    conserve_smiles: bool = False,
    sanitize: bool = True,
) -> List[Chem.rdchem.Mol]:
    """Build a list of molecules from the rows of a dataframe.

    Every remaining column of a row is attached to its molecule as a property.

    Note:
        When the molecules are built from `smiles_column`, that column is not
        kept in the molecules' properties unless `conserve_smiles` is True.

    Args:
        df: a dataframe.
        smiles_column: name of the column holding the SMILES.
        mol_column: name of the column holding the molecules. It takes
            precedence over `smiles_column`. When None, a column whose first
            value is a molecule is looked up automatically.
        conserve_smiles: whether to keep the SMILES in the mols' props.
        sanitize: whether to sanitize the molecules built from `smiles_column`.

    Returns:
        One entry per row; an entry is None when no molecule could be obtained.

    Raises:
        ValueError: if both `smiles_column` and `mol_column` are None.
    """

    if mol_column is None and smiles_column is None:
        raise ValueError("Either `smiles_column` or `mol_column` must be not None.")

    if df.shape[0] == 0:
        return []

    # Auto-detect the molecule column: when several columns qualify, the last one wins.
    if mol_column is None:
        detected = [col for col in df.columns if isinstance(df[col].iloc[0], Chem.rdchem.Mol)]
        if detected:
            mol_column = detected[-1]

    def _convert(row):
        row_props = row.to_dict()

        if mol_column is not None:
            molecule = row_props.pop(mol_column)
        else:
            # The SMILES stays in the properties only on request.
            if conserve_smiles:
                smi = row_props[smiles_column]
            else:
                smi = row_props.pop(smiles_column)
            molecule = dm.to_mol(smi, sanitize=sanitize)

        if molecule is not None:
            dm.set_mol_props(molecule, row_props)
        return molecule

    return df.apply(_convert, axis=1).tolist()

datamol.convert.from_inchi(inchi, sanitize=True, remove_hs=True)

Convert an InChi to a mol.

Parameters:

Name Type Description Default
inchi Optional[str]

an inchi string.

required
sanitize bool

do sanitize.

True
remove_hs bool

do remove hs.

True

Returns:

Type Description
Optional[rdkit.Chem.rdchem.Mol]

mol

Source code in datamol/convert.py
def from_inchi(
    inchi: Optional[str],
    sanitize: bool = True,
    remove_hs: bool = True,
) -> Optional[Chem.rdchem.Mol]:
    """Build a molecule from an InChI string.

    Args:
        inchi: an InChI string. None is passed through as None.
        sanitize: forwarded to `Chem.MolFromInchi` as `sanitize`.
        remove_hs: forwarded to `Chem.MolFromInchi` as `removeHs`.

    Returns:
        The result of `Chem.MolFromInchi`, or None when `inchi` is None.
    """
    if inchi is not None:
        return Chem.MolFromInchi(inchi, sanitize=sanitize, removeHs=remove_hs)

    return None

datamol.convert.from_selfies(selfies, as_mol=False)

Convert a SELFIES to a smiles or a mol.

Parameters:

Name Type Description Default
selfies str

a selfies.

required
as_mol bool

whether to return a mol or a smiles.

False

Returns:

Type Description
Union[str, rdkit.Chem.rdchem.Mol]

smiles or mol.

Source code in datamol/convert.py
def from_selfies(selfies: str, as_mol: bool = False) -> Optional[Union[str, Chem.rdchem.Mol]]:
    """Decode a SELFIES string into a SMILES or a molecule.

    Args:
        selfies: a SELFIES string. None is passed through as None.
        as_mol: whether to return a mol (True) or a SMILES (False).

    Returns:
        The decoded SMILES, or the molecule built from it when `as_mol` is
        True and the decoder returned a SMILES.
    """
    if selfies is None:
        return None

    decoded = sf.decoder(selfies)

    # Only convert when a mol is requested and the decoder produced something.
    wants_mol = as_mol and decoded is not None
    return dm.to_mol(decoded) if wants_mol else decoded

datamol.convert.to_df(mols, smiles_column='smiles', mol_column=None, include_private=False, include_computed=False, render_df_mol=True, render_all_df_mol=False)

Convert a list of mols to a dataframe using each mol properties as a column.

Parameters:

Name Type Description Default
mols List[rdkit.Chem.rdchem.Mol]

a list of molecules.

required
smiles_column Optional[str]

name of the SMILES column.

'smiles'
mol_column str

Name of the column. If not None, rdkit.Chem.PandaTools is used to add a molecule column.

None
include_private bool

Include private properties in the columns.

False
include_computed bool

Include computed properties in the columns.

False
render_df_mol bool

whether to render the molecule in the dataframe to images. If called once, it will be applied for the newly created dataframe with mol in it.

True
render_all_df_mol bool

Whether to render all pandas dataframe mol column as images.

False
Source code in datamol/convert.py
def to_df(
    mols: List[Chem.rdchem.Mol],
    smiles_column: Optional[str] = "smiles",
    mol_column: str = None,
    include_private: bool = False,
    include_computed: bool = False,
    render_df_mol: bool = True,
    render_all_df_mol: bool = False,
) -> Optional[pd.DataFrame]:
    """Build a dataframe from a list of mols, one row per molecule and one
    column per molecule property.

    Args:
        mols: a list of molecules.
        smiles_column: name of the SMILES column. When None, no SMILES column is added.
        mol_column: name of the column holding the molecules themselves. When
            None, no molecule column is added.
        include_private: include private properties in the columns.
        include_computed: include computed properties in the columns.
        render_df_mol: whether to render the molecules of the returned dataframe
            as images. Only applies when `mol_column` is not None.
        render_all_df_mol: whether to render all pandas dataframe mol columns as
            images. Only applies when the rendering above is performed.

    Returns:
        The dataframe with the optional SMILES and molecule columns first,
        followed by the property columns.
    """

    frame = pd.DataFrame()

    # Optional leading columns: the SMILES, then the molecules.
    if smiles_column is not None:
        frame[smiles_column] = [dm.to_smiles(mol) for mol in mols]

    if mol_column is not None:
        frame[mol_column] = mols

    # One dict of properties per molecule.
    props_frame = pd.DataFrame(
        [
            mol.GetPropsAsDict(includePrivate=include_private, includeComputed=include_computed)
            for mol in mols
        ]
    )

    name_clash = smiles_column is not None and smiles_column in props_frame.columns
    if name_clash:
        logger.warning(
            f"The SMILES column name provided ('{smiles_column}') is already present in the properties"
            " of the molecules. THe returned dataframe will two columns with the same name."
        )

    frame = pd.concat([frame, props_frame], axis=1)

    should_render = render_df_mol is True and mol_column is not None
    if should_render:
        # NOTE(hadim): to be replaced by `PandaTools.ChangeMoleculeRendering` once
        # https://github.com/rdkit/rdkit/issues/3563 is fixed.
        _ChangeMoleculeRendering(frame)

        if render_all_df_mol:
            PandasTools.RenderImagesInAllDataFrames()

    return frame

datamol.convert.to_inchi(mol)

Convert a mol to Inchi.

Parameters:

Name Type Description Default
mol Union[str, rdkit.Chem.rdchem.Mol]

a molecule.

required
Source code in datamol/convert.py
def to_inchi(mol: Union[str, Chem.rdchem.Mol]) -> Optional[str]:
    """Convert a mol to Inchi.

    Args:
        mol: a molecule, or a string that is first converted with `dm.to_mol`
            (e.g. a SMILES).

    Returns:
        The InChI string, or None when `mol` is None or when the string could
        not be converted to a molecule.
    """

    if isinstance(mol, str):
        mol = dm.to_mol(mol)

    # None either comes from the caller or from a failed string conversion:
    # in both cases there is nothing to hand over to RDKit.
    if mol is None:
        return None

    return Chem.MolToInchi(mol)

datamol.convert.to_inchikey(mol)

Convert a mol to Inchi key.

Parameters:

Name Type Description Default
mol Union[str, rdkit.Chem.rdchem.Mol]

a molecule

required
Source code in datamol/convert.py
def to_inchikey(mol: Union[str, Chem.rdchem.Mol]) -> Optional[str]:
    """Convert a mol to Inchi key.

    Args:
        mol: a molecule, or a string that is first converted with `dm.to_mol`
            (e.g. a SMILES).

    Returns:
        The InChI key, or None when `mol` is None or when the string could
        not be converted to a molecule.
    """

    if isinstance(mol, str):
        mol = dm.to_mol(mol)

    # None either comes from the caller or from a failed string conversion:
    # in both cases there is nothing to hand over to RDKit.
    if mol is None:
        return None

    return Chem.MolToInchiKey(mol)

datamol.convert.to_selfies(mol)

Convert a mol to SELFIES.

Parameters:

Name Type Description Default
mol Union[str, rdkit.Chem.rdchem.Mol]

a molecule or a SMILES.

required

Returns:

Type Description
Optional[str]

selfies: SELFIES string.

Source code in datamol/convert.py
def to_selfies(mol: Union[str, Chem.rdchem.Mol]) -> Optional[str]:
    """Convert a mol to SELFIES.

    Args:
        mol: a molecule or a SMILES.

    Returns:
        selfies: SELFIES string, or None when `mol` is None, when the molecule
            could not be converted to a SMILES or when the encoder reports a
            failure with `-1`.
    """
    if mol is None:
        return None

    if isinstance(mol, Chem.rdchem.Mol):
        mol = to_smiles(mol)

        # `to_smiles` returns None when the conversion fails: do not feed it
        # to the encoder.
        if mol is None:
            return None

    selfies = sf.encoder(mol)  # type: ignore

    # The encoder signals a failure with -1.
    if selfies == -1:
        return None

    return selfies

datamol.convert.to_smarts(mol, keep_hs=True)

Convert a molecule to a smarts.

Parameters:

Name Type Description Default
mol Union[str, rdkit.Chem.rdchem.Mol]

a molecule.

required
keep_hs bool

Whether to keep hydrogen. This will increase the count of H atoms for atoms with attached hydrogens to create a valid smarts. e.g. [H]-[CH2]-[*] -> [H]-[CH3]-[*]

True

Returns:

Type Description
Optional[str]

smarts of the molecule

Source code in datamol/convert.py
def to_smarts(mol: Union[str, Chem.rdchem.Mol], keep_hs: bool = True) -> Optional[str]:
    """Convert a molecule to a smarts.

    The input molecule is left untouched: the atom modifications needed to
    produce the smarts are applied to a copy.

    Args:
        mol: a molecule, or a string that is first converted with `dm.to_mol`
            (e.g. a SMILES).
        keep_hs: Whether to keep hydrogen. This will increase the count of H atoms
            for atoms with attached hydrogens to create a valid smarts.
            e.g. [H]-[CH2]-[*] -> [H]-[CH3]-[*]

    Returns:
        smarts of the molecule, or None when `mol` is None, when the string
        could not be converted to a molecule or when the SMILES writer failed.
    """

    if mol is None:
        return None

    if isinstance(mol, str):
        mol = dm.to_mol(mol)
        if mol is None:
            # The string could not be converted to a molecule.
            return None
    else:
        # The atoms are modified below: work on a copy so the caller's
        # molecule keeps its isotopes and hydrogen counts.
        mol = dm.copy_mol(mol)

    # Change the isotope to 42
    for atom in mol.GetAtoms():  # type: ignore
        if keep_hs:
            # Number of neighbors that are explicit hydrogen atoms
            s = sum(na.GetAtomicNum() == 1 for na in atom.GetNeighbors())
            if s:
                atom.SetNumExplicitHs(atom.GetTotalNumHs() + s)
        atom.SetIsotope(42)

    # Print out the smiles, all the atom attributes will be fully specified
    smarts = to_smiles(mol, isomeric=True, explicit_bonds=True)

    if smarts is None:
        return None

    # Remove the 42 isotope labels
    smarts = re.sub(r"\[42", "[", smarts)
    return smarts

datamol.convert.to_smiles(mol, canonical=True, isomeric=True, ordered=False, explicit_bonds=False, explicit_hs=False, randomize=False, cxsmiles=False, allow_to_fail=False)

Convert a mol to a SMILES.

Parameters:

Name Type Description Default
mol Mol

a molecule.

required
canonical bool

if false no attempt will be made to canonicalize the molecule.

True
isomeric bool

whether to include information about stereochemistry in the SMILES.

True
ordered bool

whether to force reordering of the atoms first.

False
explicit_bonds bool

if true, all bond orders will be explicitly indicated in the output SMILES.

False
explicit_hs bool

if true, all H counts will be explicitly indicated in the output SMILES.

False
randomize bool

whether to randomize the generated smiles. Override canonical.

False
cxsmiles bool

Whether to return a CXSMILES instead of a SMILES.

False
allow_to_fail bool

Raise an error if the conversion to SMILES fails. Return None otherwise.

False
Source code in datamol/convert.py
def to_smiles(
    mol: Chem.rdchem.Mol,
    canonical: bool = True,
    isomeric: bool = True,
    ordered: bool = False,
    explicit_bonds: bool = False,
    explicit_hs: bool = False,
    randomize: bool = False,
    cxsmiles: bool = False,
    allow_to_fail: bool = False,
) -> Optional[str]:
    """Write a molecule as a SMILES (or CXSMILES) string.

    Args:
        mol: a molecule.
        canonical: if false no attempt will be made to canonicalize the molecule.
        isomeric: whether to include information about stereochemistry in the SMILES.
        ordered: whether to force reordering of the atoms first. Only applied
            when `canonical` is False.
        explicit_bonds: if true, all bond orders will be explicitly indicated in the output SMILES.
        explicit_hs: if true, all H counts will be explicitly indicated in the output SMILES.
        randomize: whether to randomize the generated smiles. Override `canonical`.
        cxsmiles: whether to return a CXSMILES instead of a SMILES.
        allow_to_fail: when True, an error raised by the conversion is
            propagated; when False, None is returned instead.

    Returns:
        The SMILES string, or None when the conversion failed and
        `allow_to_fail` is False.
    """
    if canonical is False and ordered:
        mol = dm.reorder_atoms(mol)

    if randomize:
        # A randomized atom order must not be canonicalized away.
        mol = dm.randomize_atoms(mol)
        canonical = False

    try:
        writer = Chem.MolToCXSmiles if cxsmiles else Chem.MolToSmiles
        return writer(  # type: ignore
            mol,
            isomericSmiles=isomeric,
            canonical=canonical,
            allBondsExplicit=explicit_bonds,
            allHsExplicit=explicit_hs,
        )
    except Exception:
        if allow_to_fail:
            raise
        return None

datamol.data

datamol.data.freesolv()

Source code in datamol/data.py
def freesolv():
    """Load the `data/freesolv.csv` file packaged with `datamol` as a dataframe.

    Returns:
        The content of the CSV file, as parsed by `pd.read_csv`.
    """
    with pkg_resources.resource_stream("datamol", "data/freesolv.csv") as stream:
        return pd.read_csv(stream)

datamol.fp

datamol.fp.fp_to_array(fp, dtype=<class 'int'>)

Convert rdkit fingerprint to numpy array.

Note

This implementation has shown to be faster than using DataStructs.ConvertToNumpyArray by a factor of ~4.

Source code in datamol/fp.py
def fp_to_array(fp: DataStructs.ExplicitBitVect, dtype: type = int) -> np.ndarray:
    """Convert rdkit fingerprint to numpy array.

    Note:
        This implementation has shown to be faster than using `DataStructs.ConvertToNumpyArray`
        by a factor of ~4.

    Args:
        fp: a fingerprint exposing `ToBitString()`. A numpy array is returned
            unchanged (no copy, no cast).
        dtype: data type of the array built from the fingerprint.

    Returns:
        A 1D array holding one 0/1 value per bit of the fingerprint.
    """
    if isinstance(fp, np.ndarray):
        return fp

    # The bit string only contains the characters "0" and "1": reading its bytes
    # as uint8 and subtracting the code of "0" yields the bit values directly.
    bits = np.frombuffer(fp.ToBitString().encode(), "u1") - ord("0")

    # Honour the requested `dtype` (previously the argument was silently ignored).
    return bits.astype(dtype)

datamol.fp.to_fp(mol, fp_size=2048, radius=3, use_features=True, as_array=True)

Transform a molecule from smiles to morgan fingerprint.

Note

That function should be expanded to compute more type of fingerprints.

Parameters:

Name Type Description Default
mol Union[str, rdkit.Chem.rdchem.Mol]

a molecule or a SMILES.

required
fp_size int

Size of morgan fingerprint. Default to 2048.

2048
radius int

Radius of the morgan fingerprints. Default to 3.

3
use_features bool

Whether to use atom features. Default to True.

True
as_array bool

Whether to return a numpy array or an RDKit vec. Default to True.

True

Returns:

Type Description
Union[numpy.ndarray, rdkit.DataStructs.cDataStructs.ExplicitBitVect]

A fingerprint vector or None

Source code in datamol/fp.py
def to_fp(
    mol: Union[str, Chem.rdchem.Mol],
    fp_size: int = 2048,
    radius: int = 3,
    use_features: bool = True,
    as_array: bool = True,
) -> Optional[Union[np.ndarray, DataStructs.ExplicitBitVect]]:
    """Transform a molecule from smiles to morgan fingerprint.

    Note:
        That function should be expanded to compute more type of fingerprints.

    Args:
        mol: a molecule or a SMILES.
        fp_size: Size of morgan fingerprint. Default to 2048.
        radius: Radius of the morgan fingerprints. Default to 3.
        use_features: Whether to use atom features. Default to True.
        as_array: Whether to return a numpy array or an RDKit vec. Default to True.

    Returns:
        A fingerprint vector, or None when there is no molecule to fingerprint
        (`mol` is None, or the SMILES conversion produced None).
    """

    if isinstance(mol, str):
        mol = dm.to_mol(mol)

    # Honour the documented "or None" contract instead of handing None to RDKit.
    if mol is None:
        return None

    fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(
        mol,
        radius,
        nBits=fp_size,
        useFeatures=use_features,
    )

    if as_array:
        return fp_to_array(fp)

    return fp

datamol.graph

datamol.graph.get_all_path_between(mol, atom_idx_1, atom_idx_2, ignore_cycle_basis=False)

Get all simple path between two atoms of a molecule

Parameters:

Name Type Description Default
mol Mol

a molecule

required
atom_idx_1 int

Atom index 1.

required
atom_idx_2 int

Atom index 2.

required
ignore_cycle_basis bool

Whether to ignore cycle basis. Defaults to False.

False

Returns:

Type Description
list

A list of paths between the two atoms, each path being a list of atom indices.

Source code in datamol/graph.py
def get_all_path_between(
    mol: Chem.Mol,
    atom_idx_1: int,
    atom_idx_2: int,
    ignore_cycle_basis: bool = False,
):
    """Get all simple paths between two atoms of a molecule.

    The molecule's adjacency matrix is turned into a networkx graph and the
    paths are enumerated with `nx.all_simple_paths`.

    Args:
        mol: a molecule
        atom_idx_1: Atom index 1 (source of the paths).
        atom_idx_2: Atom index 2 (target of the paths).
        ignore_cycle_basis: When True, discard every path that contains all
            the atoms of at least one ring of the molecule. Defaults to False.

    Returns:
        A list with the paths that were kept.
    """

    nx = _get_networkx()

    graph = nx.Graph(Chem.rdmolops.GetAdjacencyMatrix(mol))
    candidates = nx.all_simple_paths(graph, source=atom_idx_1, target=atom_idx_2)

    if not ignore_cycle_basis:
        return list(candidates)

    ring_atom_sets = [set(ring) for ring in mol.GetRingInfo().AtomRings()]

    def _covers_a_ring(candidate) -> bool:
        # A path is rejected as soon as one ring is entirely included in it.
        visited = set(candidate)
        return any(ring.issubset(visited) for ring in ring_atom_sets)

    return [candidate for candidate in candidates if not _covers_a_ring(candidate)]

datamol.graph.to_graph(mol)

Convert a molecule to a network x graph. A list of properties are added to every nodes and edges.

Parameters:

Name Type Description Default
mol Mol

a molecule.

required

Returns:

Type Description
mol_graph (networkx.Graph)

a graph representing the molecule.

Source code in datamol/graph.py
def to_graph(mol: Chem.Mol):
    """Convert a molecule to a networkx graph.

    One node is created per atom (keyed by the atom index) and one edge per
    bond (between the begin and end atom indices). Atom and bond properties
    are attached as node and edge attributes.

    Args:
        mol: a molecule.

    Returns:
        mol_graph (networkx.Graph): a graph representing the molecule.
    """

    nx = _get_networkx()

    graph = nx.Graph()

    for atom in mol.GetAtoms():
        node_id = atom.GetIdx()
        node_attrs = dict(
            atomic_num=atom.GetAtomicNum(),
            formal_charge=atom.GetFormalCharge(),
            chiral_tag=atom.GetChiralTag(),
            hybridization=atom.GetHybridization(),
            num_explicit_hs=atom.GetNumExplicitHs(),
            implicit_valence=atom.GetImplicitValence(),
            degree=atom.GetDegree(),
            symbol=atom.GetSymbol(),
            ring_atom=atom.IsInRing(),
            is_aromatic=atom.GetIsAromatic(),
        )
        graph.add_node(node_id, **node_attrs)

    for bond in mol.GetBonds():
        begin_idx = bond.GetBeginAtomIdx()
        end_idx = bond.GetEndAtomIdx()
        graph.add_edge(begin_idx, end_idx, bond_type=bond.GetBondType())

    return graph

datamol.io

datamol.io.read_csv(urlpath, smiles_column=None, mol_column='mol', **kwargs)

Read a CSV file.

Parameters:

Name Type Description Default
urlpath Union[str, os.PathLike, TextIO]

Path to a file or a file-like object. Path can be remote or local.

required
smiles_column str

Use this column to build a mol column.

None
mol_column str

Name to give to the mol column. If not None a mol column will be build. Avoid when loading a very large file.

'mol'
kwargs

Arguments to pass to pd.read_csv().

{}

Returns:

Type Description
DataFrame

df: a pandas.DataFrame

Source code in datamol/io.py
def read_csv(
    urlpath: Union[str, os.PathLike, TextIO],
    smiles_column: str = None,
    mol_column: str = "mol",
    **kwargs,
) -> pd.DataFrame:
    """Read a CSV file into a dataframe, optionally adding a molecule column.

    Args:
        urlpath: Path to a file or a file-like object. Path can be remote or local.
        smiles_column: Column holding the SMILES. When given, a mol column is
            built from it. Avoid when loading a very large file.
        mol_column: Name to give to the mol column. Only used when
            `smiles_column` is not None.
        kwargs: Arguments to pass to `pd.read_csv()`.

    Returns:
        df: a `pandas.DataFrame`
    """

    frame: pd.DataFrame = pd.read_csv(urlpath, **kwargs)  # type: ignore

    # Without a SMILES column there is nothing to build molecules from.
    if smiles_column is None:
        return frame

    PandasTools.AddMoleculeColumnToFrame(frame, smiles_column, mol_column)
    return frame

datamol.io.read_excel(urlpath, sheet_name=0, smiles_column=None, mol_column='mol', **kwargs)

Read an excel file.

Parameters:

Name Type Description Default
urlpath Union[str, os.PathLike, TextIO]

Path to a file or a file-like object. Path can be remote or local.

required
sheet_name Union[str, int, list]

see pandas.read_excel() doc.

0
smiles_column str

Use this column to build a mol column.

None
mol_column str

Name to give to the mol column. If not None a mol column will be built. Avoid when loading a very large file.

'mol'
kwargs

Arguments to pass to pd.read_excel().

{}

Returns:

Type Description
DataFrame

df: a pandas.DataFrame

Source code in datamol/io.py
def read_excel(
    urlpath: Union[str, os.PathLike, TextIO],
    sheet_name: Optional[Union[str, int, list]] = 0,
    smiles_column: str = None,
    mol_column: str = "mol",
    **kwargs,
) -> pd.DataFrame:
    """Read an excel file, optionally adding a molecule column.

    Args:
        urlpath: Path to a file or a file-like object. Path can be remote or local.
        sheet_name: see `pandas.read_excel()` doc.
        smiles_column: Column holding the SMILES. When given, a mol column is
            built from it. Avoid when loading a very large file.
        mol_column: Name to give to the mol column. Only used when
            `smiles_column` is not None.
        kwargs: Arguments to pass to `pd.read_excel()`.

    Returns:
        df: a `pandas.DataFrame`
    """

    frame = pd.read_excel(urlpath, sheet_name=sheet_name, **kwargs)  # type: ignore

    # Without a SMILES column there is nothing to build molecules from.
    if smiles_column is None:
        return frame

    PandasTools.AddMoleculeColumnToFrame(frame, smiles_column, mol_column)
    return frame

datamol.io.read_sdf(urlpath, sanitize=True, as_df=False, smiles_column='smiles', mol_column=None, include_private=False, include_computed=False, strict_parsing=True)

Read an SDF file.

Note: This function is meant to be used with dataset that fit in-memory. For a more advanced usage we suggest you to use directly Chem.ForwardSDMolSupplier.

Parameters:

Name Type Description Default
urlpath Union[str, os.PathLike, TextIO]

Path to a file or a file-like object. Path can be remote or local.

required
sanitize bool

Whether to sanitize the molecules.

True
as_df bool

Whether to return a list mol or a pandas DataFrame.

False
smiles_column Optional[str]

Name of the SMILES column. Only relevant if as_df is True.

'smiles'
mol_column str

Name of the mol column. Only relevant if as_df is True.

None
include_private bool

Include private properties in the columns. Only relevant if as_df is True.

False
include_computed bool

Include computed properties in the columns. Only relevant if as_df is True.

False
strict_parsing bool

If set to false, the parser is more lax about correctness of the contents.

True
Source code in datamol/io.py
def read_sdf(
    urlpath: Union[str, os.PathLike, TextIO],
    sanitize: bool = True,
    as_df: bool = False,
    smiles_column: Optional[str] = "smiles",
    mol_column: str = None,
    include_private: bool = False,
    include_computed: bool = False,
    strict_parsing: bool = True,
) -> Union[List[Chem.rdchem.Mol], pd.DataFrame]:
    """Read an SDF file.

    Note: This function is meant to be used with dataset that fit _in-memory_.
    For a more advanced usage we suggest you to use directly `Chem.ForwardSDMolSupplier`.

    Args:
        urlpath: Path to a file or a file-like object. Path can be remote or local.
            Paths ending with `.gz` or `.gzip` are read through `gzip`.
        sanitize: Whether to sanitize the molecules.
        as_df: Whether to return a list mol or a pandas DataFrame.
        smiles_column: Name of the SMILES column. Only relevant if `as_df` is True.
        mol_column: Name of the mol column. Only relevant if `as_df` is True.
        include_private: Include private properties in the columns.  Only relevant if
            `as_df` is True.
        include_computed: Include computed properties in the columns.  Only relevant if
            `as_df` is True.
        strict_parsing: If set to false, the parser is more lax about correctness of the contents.

    Returns:
        The molecules read from the file (entries read as None are dropped),
        either as a list or converted with `dm.to_df()` when `as_df` is True.
    """

    def _load_all(handle):
        # Exhaust the supplier while `handle` is still open.
        supplier = Chem.ForwardSDMolSupplier(
            handle,
            sanitize=sanitize,
            strictParsing=strict_parsing,
        )
        return list(supplier)

    if isinstance(urlpath, io.IOBase):
        # File-like object
        loaded = _load_all(urlpath)
    else:
        # Regular local or remote paths
        with fsspec.open(urlpath) as raw:
            # Handle gzip file if needed
            is_gzipped = str(urlpath).endswith(".gz") or str(urlpath).endswith(".gzip")
            stream = gzip.open(raw) if is_gzipped else raw
            loaded = _load_all(stream)

    # Discard None values
    mols = [entry for entry in loaded if entry is not None]

    if not as_df:
        return mols

    # Convert to dataframe
    return dm.to_df(
        mols,
        smiles_column=smiles_column,
        mol_column=mol_column,
        include_private=include_private,
        include_computed=include_computed,
    )  # type: ignore

datamol.io.read_smi(urlpath)

Read a list of smiles from a .smi file.

Parameters:

Name Type Description Default
urlpath Union[str, os.PathLike]

Path to a file or a file-like object. Path can be remote or local. Note: file-like object are not supported yet.

required
Source code in datamol/io.py
def read_smi(
    urlpath: Union[str, os.PathLike],
) -> Sequence[Chem.rdchem.Mol]:
    """Read a list of smiles from a `.smi` file.

    Args:
        urlpath: Path to a file. Path can be remote or local.
            Note: file-like object are not supported yet.

    Returns:
        The molecules read from the file; entries read as None are dropped.
    """

    # NOTE(hadim): the temporary local file copy
    # is because `SmilesMolSupplier` does not support
    # using file-like object, only path.
    needs_local_copy = not fsspec.utils.can_be_local(str(urlpath))

    if not needs_local_copy:
        supplier = Chem.SmilesMolSupplier(str(urlpath), titleLine=0)
        return [mol for mol in supplier if mol is not None]

    # `mkstemp()` returns an already-open file descriptor together with the path.
    # Only the path is needed, so close the descriptor instead of leaking it.
    fd, tmp_name = tempfile.mkstemp()
    os.close(fd)
    active_path = pathlib.Path(tmp_name)

    try:
        # Copy to a local temporary path since the path is a remote one.
        dm.utils.fs.copy_file(urlpath, active_path)

        # Read the molecules
        supplier = Chem.SmilesMolSupplier(str(active_path), titleLine=0)
        return [mol for mol in supplier if mol is not None]
    finally:
        # Delete the local temporary path, even if copying or parsing failed.
        active_path.unlink()

datamol.io.to_sdf(mols, urlpath, smiles_column='smiles', mol_column=None)

Write molecules to a file.

Parameters:

Name Type Description Default
mols Union[rdkit.Chem.rdchem.Mol, Sequence[rdkit.Chem.rdchem.Mol], pandas.core.frame.DataFrame]

a dataframe, a molecule or a list of molecule.

required
urlpath Union[str, os.PathLike, TextIO]

Path to a file or a file-like object. Path can be remote or local.

required
smiles_column Optional[str]

Column name to extract the molecule.

'smiles'
mol_column str

Column name to extract the molecule. It takes precedence over smiles_column.

None
Source code in datamol/io.py
def to_sdf(
    mols: Union[Chem.rdchem.Mol, Sequence[Chem.rdchem.Mol], pd.DataFrame],
    urlpath: Union[str, os.PathLike, TextIO],
    smiles_column: Optional[str] = "smiles",
    mol_column: str = None,
):
    """Write molecules to an SDF file.

    Args:
        mols: a dataframe, a molecule or a list of molecule. None entries
            are skipped.
        urlpath: Path to a file or a file-like object. Path can be remote or local.
        smiles_column: Column name to extract the molecule. Only used when
            `mols` is a dataframe.
        mol_column: Column name to extract the molecule. It takes
            precedence over `smiles_column`. Only used when `mols` is a dataframe.
    """

    # Normalize the input to an iterable of molecules.
    if isinstance(mols, pd.DataFrame):
        mols = dm.from_df(mols, smiles_column=smiles_column, mol_column=mol_column)
    elif isinstance(mols, Chem.rdchem.Mol):
        mols = [mols]

    # Filter out None values
    valid_mols = [entry for entry in mols if entry is not None]

    def _write_all(target):
        sd_writer = Chem.SDWriter(target)
        for entry in valid_mols:
            sd_writer.write(entry)
        sd_writer.close()

    # File-like object
    if isinstance(urlpath, io.IOBase):
        _write_all(urlpath)
        return

    # Regular local or remote paths
    with fsspec.open(urlpath, mode="w") as handle:
        _write_all(handle)

datamol.io.to_smi(mols, urlpath, error_if_empty=False)

Save a list of molecules in an .smi file.

Parameters:

Name Type Description Default
mols Sequence[rdkit.Chem.rdchem.Mol]

a list of molecules.

required
urlpath Union[str, os.PathLike, TextIO]

Path to a file or a file-like object. Path can be remote or local.

required
error_if_empty bool

whether to raise an error if the input list is empty.

False
Source code in datamol/io.py
def to_smi(
    mols: Sequence[Chem.rdchem.Mol],
    urlpath: Union[str, os.PathLike, TextIO],
    error_if_empty: bool = False,
):
    """Save a list of molecules in an `.smi` file.

    Args:
        mols: a list of molecules. None entries are skipped.
        urlpath: Path to a file or a file-like object. Path can be remote or local.
        error_if_empty: whether to raise an error if the input list is empty.

    Raises:
        ValueError: if `mols` is empty and `error_if_empty` is True. The check
            is made before the None entries are filtered out.
    """

    if len(mols) == 0 and error_if_empty:
        raise ValueError("The list of mols/smiles provided is empty.")

    # Filter out None values
    valid_mols = [entry for entry in mols if entry is not None]

    def _write_all(target):
        smi_writer = Chem.SmilesWriter(target, includeHeader=False, nameHeader="")
        for entry in valid_mols:
            smi_writer.write(entry)
        smi_writer.close()

    # File-like object
    if isinstance(urlpath, io.IOBase):
        _write_all(urlpath)
        return

    # Regular local or remote paths
    with fsspec.open(urlpath, "w") as handle:
        _write_all(handle)

datamol.jobs

datamol.utils.jobs.JobRunner

is_sequential property readonly

Check whether the job is sequential or parallel

__call__(self, *args, **kwargs) special

Run job using the n_jobs attribute to determine regime

Source code in datamol/utils/jobs.py
def __call__(self, *args, **kwargs):
    """
    Run the job, forwarding all arguments to `sequential` when
    `is_sequential` is true and to `parallel` otherwise.
    """
    run = self.sequential if self.is_sequential else self.parallel
    return run(*args, **kwargs)

__init__(self, n_jobs=-1, prefer=None, progress=False, **job_kwargs) special

JobRunner with sequential/parallel regimes. The multiprocessing backend use joblib which allows taking advantage of its features, while the progress bar use tqdm

Parameters:

Name Type Description Default
n_jobs Optional[int]

Number of process. Use 0 or None to force sequential. Use -1 to use all the available processors. For details see https://joblib.readthedocs.io/en/latest/parallel.html#parallel-reference-documentation

-1
prefer str

Choose from ['processes', 'threads'] or None. Default to None. Soft hint to choose the default backend if no specific backend was selected with the parallel_backend context manager. The default process-based backend is 'loky' and the default thread-based backend is 'threading'. Ignored if the backend parameter is specified.

None
progress bool

whether to display progress bar

False
job_kwargs

Any additional keyword argument supported by joblib.Parallel.

{}

Examples:

import datamol as dm
runner = dm.JobRunner(n_jobs=4, progress=True, prefer="threads")
results = runner(lambda x: x**2, [1, 2, 3, 4])
Source code in datamol/utils/jobs.py
def __init__(
    self,
    n_jobs: Optional[int] = -1,
    prefer: str = None,
    progress: bool = False,
    **job_kwargs,
):
    """
    JobRunner with sequential/parallel regimes. The multiprocessing backend use joblib which
    allows taking advantage of its features, while the progress bar use tqdm

    Args:
        n_jobs: Number of process. Use 0 or None to force sequential.
            Use -1 to use all the available processors. For details see
            https://joblib.readthedocs.io/en/latest/parallel.html#parallel-reference-documentation
        prefer: Choose from ['processes', 'threads'] or None. Default to None.
            Soft hint to choose the default backend if no specific backend
            was selected with the parallel_backend context manager. The
            default process-based backend is 'loky' and the default
            thread-based backend is 'threading'. Ignored if the ``backend``
            parameter is specified.
        progress: whether to display progress bar
        job_kwargs: Any additional keyword argument supported by joblib.Parallel.

    Example:

    ```python
    import datamol as dm
    runner = dm.JobRunner(n_jobs=4, progress=True, prefer="threads")
    results = runner(lambda x: x**2, [1, 2, 3, 4])
    ```
    """
    self.n_jobs, self.prefer = n_jobs, prefer

    # The progress flag is stored inverted.
    self.no_progress = not progress

    # `n_jobs` and `prefer` are stored alongside the extra keyword arguments.
    job_kwargs.update(n_jobs=n_jobs, prefer=prefer)
    self.job_kwargs = job_kwargs

get_iterator_length(data) staticmethod

Attempt to get the length of an iterator

Source code in datamol/utils/jobs.py
@staticmethod
def get_iterator_length(data):
    """Return `len(data)`, or None when `data` has no length."""
    try:
        return len(data)
    except TypeError:
        # most likely a generator: its length is unknown
        return None

parallel(self, callable_fn, data, arg_type=None, **fn_kwargs)

Run job in parallel

Parameters:

Name Type Description Default
callable_fn Callable

function to call

required
data Iterable[Any]

input data

required
arg_type Optional[str]

function argument type ('arg'/None or 'args' or 'kwargs')

None
fn_kwargs dict

optional keyword argument to pass to the callable function

{}
Source code in datamol/utils/jobs.py
def parallel(
    self,
    callable_fn: Callable,
    data: Iterable[Any],
    arg_type: Optional[str] = None,
    **fn_kwargs,
):
    r"""
    Run job in parallel

    Args:
        callable_fn: function to call
        data: input data
        arg_type: function argument type ('arg'/None or 'args' or 'kwargs')
        fn_kwargs: optional keyword argument to pass to the callable function

    Returns:
        Whatever the parallel executor returns for the submitted tasks.
    """
    helper = JobRunner._parallel_helper(**self.job_kwargs)
    n_items = JobRunner.get_iterator_length(data)
    executor = helper(total=n_items, disable=self.no_progress)

    # One delayed task per input item, each wrapped according to `arg_type`.
    tasks = (
        delayed(JobRunner.wrap_fn(callable_fn, arg_type, **fn_kwargs))(item) for item in data
    )
    return executor(tasks)

sequential(self, callable_fn, data, arg_type=None, **fn_kwargs)

Run job in sequential version

Parameters:

Name Type Description Default
callable_fn Callable

function to call

required
data Iterable[Any]

input data

required
arg_type Optional[str]

function argument type ('arg'/None or 'args' or 'kwargs')

None
fn_kwargs dict

optional keyword argument to pass to the callable function

{}
Source code in datamol/utils/jobs.py
def sequential(
    self,
    callable_fn: Callable,
    data: Iterable[Any],
    arg_type: Optional[str] = None,
    **fn_kwargs,
):
    r"""
    Run job in sequential version

    Args:
        callable_fn: function to call
        data: input data
        arg_type: function argument type ('arg'/None or 'args' or 'kwargs')
        fn_kwargs: optional keyword argument to pass to the callable function

    Returns:
        A list with one result per item of `data`, in iteration order.
    """
    n_items = JobRunner.get_iterator_length(data)
    progress_iter = tqdm(data, total=n_items, disable=self.no_progress)

    outputs = []
    for item in progress_iter:
        # The wrapper is rebuilt for every item, unpacking `fn_kwargs` anew each time.
        wrapped = JobRunner.wrap_fn(callable_fn, arg_type, **fn_kwargs)
        outputs.append(wrapped(item))
    return outputs

wrap_fn(fn, arg_type=None, **fn_kwargs) staticmethod

Small wrapper around a callable to properly format it's argument

Source code in datamol/utils/jobs.py
@staticmethod
def wrap_fn(fn: Callable, arg_type: Optional[str] = None, **fn_kwargs):
    """Small wrapper around a callable to properly format its argument.

    Args:
        fn: the callable to wrap.
        arg_type: how the single input of the wrapper is handed to `fn`:
            "kwargs" calls `fn(**input)`, "args" calls `fn(*input)` and
            anything else calls `fn(input)`.
        fn_kwargs: keyword arguments added to every call of `fn`. In "kwargs"
            mode the keys of the input take precedence over them.

    Returns:
        A function taking one input and returning the result of `fn`.
    """
    # EN probably use something like (moms.utils.commons.is_callable) ?
    def _run(args: Any):
        if arg_type == "kwargs":
            # Merge into a fresh dict: updating `fn_kwargs` in place would make
            # the keys of one input leak into every later call of the wrapper.
            return fn(**{**fn_kwargs, **args})
        if arg_type == "args":
            return fn(*args, **fn_kwargs)
        return fn(args, **fn_kwargs)

    return _run

datamol.utils.jobs.parallelized(fn, inputs_list, scheduler='processes', n_jobs=-1, progress=False, arg_type='arg')

Run a function in parallel.

Parameters:

Name Type Description Default
fn Callable

The function to run in parallel.

required
inputs_list Iterable[Any]

List of inputs to pass to fn.

required
scheduler str

Choose between ["processes", "threads"], or None to use the default joblib "loky" scheduler. Defaults to "processes".

'processes'
n_jobs Optional[int]

Number of process. Use 0 or None to force sequential. Use -1 to use all the available processors. For details see https://joblib.readthedocs.io/en/latest/parallel.html#parallel-reference-documentation

-1
progress bool

Display a progress bar. Defaults to False.

False
arg_type str

One of ["arg", "args", "kwargs"]: - "arg": the input is passed as an argument: fn(arg) (default). - "args": the input is passed as a list: fn(*args). - "kwargs": the input is passed as a map: fn(**kwargs).

'arg'

Returns:

Type Description
Optional[List[Any]]

The results of the execution as a list.

Source code in datamol/utils/jobs.py
def parallelized(
    fn: Callable,
    inputs_list: Iterable[Any],
    scheduler: str = "processes",
    n_jobs: Optional[int] = -1,
    progress: bool = False,
    arg_type: str = "arg",
) -> Optional[List[Any]]:
    """Run a function in parallel.

    Args:
        fn: The function to run in parallel.
        inputs_list: List of inputs to pass to `fn`.
        scheduler: Choose between ["processes", "threads"]. Forwarded to
            `JobRunner` as its `prefer` argument. Defaults to "processes".
        n_jobs: Number of process. Use 0 or None to force sequential.
                Use -1 to use all the available processors. For details see
                https://joblib.readthedocs.io/en/latest/parallel.html#parallel-reference-documentation
        progress: Display a progress bar. Defaults to False.
        arg_type: One of ["arg", "args", "kwargs"]:
            - "arg": the input is passed as an argument: `fn(arg)` (default).
            - "args": the input is passed as a list: `fn(*args)`.
            - "kwargs": the input is passed as a map: `fn(**kwargs)`.

    Returns:
        The results of the execution as a list.
    """

    job_runner = JobRunner(n_jobs=n_jobs, progress=progress, prefer=scheduler)
    results = job_runner(fn, inputs_list, arg_type=arg_type)
    return results

datamol.log

datamol.log.disable_rdkit_log()

Disable all rdkit logs.

Source code in datamol/log.py
def disable_rdkit_log():
    """Disable all rdkit logs.

    `rdBase.DisableLog` is called for every entry of `RDLogger._levels`.
    """
    for level_name in RDLogger._levels:
        rdBase.DisableLog(level_name)

datamol.log.enable_rdkit_log()

Enable all rdkit logs.

Source code in datamol/log.py
def enable_rdkit_log():
    """Enable all rdkit logs.

    `rdBase.EnableLog` is called for every entry of `RDLogger._levels`.
    """
    for level_name in RDLogger._levels:
        rdBase.EnableLog(level_name)

datamol.log.without_rdkit_log

Context manager to disable RDKit logs. By default all logs are disabled.

datamol.mol

datamol.mol.adjust_singleton(mol)

Remove all atoms that are essentially disconnected singleton nodes in the molecular graph. For example, the chlorine atom and methane fragment will be removed in "Cl.[N:1]1=CC(O)=CC2CCCCC12.CC.C", but not the ethane fragment.

Parameters:

Name Type Description Default
mol Mol

a molecule.

required
Source code in datamol/mol.py
def adjust_singleton(mol: Chem.rdchem.Mol) -> Optional[Chem.rdchem.Mol]:
    """Remove all atoms that are essentially disconnected singleton nodes in the molecular graph.

    An atom is removed when its explicit valence is 0. For example, the chlorine
    atom and the methane fragment will be removed in
    "Cl.[N:1]1=CC(O)=CC2CCCCC12.CC.C", but not the ethane fragment.

    Args:
        mol: a molecule.

    Returns:
        A new molecule without the removed atoms.
    """
    editable = Chem.RWMol(mol)

    # Highest indices first, presumably so that a removal does not shift the
    # indices of the atoms that are still to be removed.
    singleton_indices = sorted(
        (atom.GetIdx() for atom in mol.GetAtoms() if atom.GetExplicitValence() == 0),
        reverse=True,
    )
    for atom_idx in singleton_indices:
        editable.RemoveAtom(atom_idx)

    return editable.GetMol()

datamol.mol.atom_indices_to_mol(mol, copy=False)

Add the molAtomMapNumber property to each atoms.

Parameters:

Name Type Description Default
mol Mol

a molecule

required
copy bool

Whether to copy the molecule.

False
Source code in datamol/mol.py
def atom_indices_to_mol(mol: Chem.rdchem.Mol, copy: bool = False):
    """Add the `molAtomMapNumber` property to each atoms.

    The property is set to the atom index, as a string.

    Args:
        mol: a molecule
        copy: Whether to copy the molecule. The copy is made only when the
            value is exactly `True`.

    Returns:
        The annotated molecule: the copy when one was made, `mol` itself otherwise.
    """

    target = copy_mol(mol) if copy is True else mol

    for atom in target.GetAtoms():
        atom.SetProp("molAtomMapNumber", str(atom.GetIdx()))

    return target

datamol.mol.copy_mol(mol)

Copy a molecule and return a new one.

Parameters:

Name Type Description Default
mol Mol

a molecule to copy.

required
Source code in datamol/mol.py
def copy_mol(mol: Chem.rdchem.Mol) -> Chem.rdchem.Mol:
    """Return a deep copy of a molecule.

    Args:
        mol: a molecule to copy.

    Returns:
        A new object produced by `copy.deepcopy`.
    """
    duplicate = copy.deepcopy(mol)
    return duplicate

datamol.mol.copy_mol_props(source, destination)

Copy properties from one source molecule to another destination molecule.

Parameters:

Name Type Description Default
source Mol

a molecule to copy from.

required
destination Mol

a molecule to copy to.

required
Source code in datamol/mol.py
def copy_mol_props(source: Chem.rdchem.Mol, destination: Chem.rdchem.Mol):
    """Copy the properties of a source molecule onto a destination molecule.

    The properties are read with `source.GetPropsAsDict()` and applied to
    `destination` with `dm.set_mol_props()`.

    Args:
        source: a molecule to copy from.
        destination: a molecule to copy to.
    """

    dm.set_mol_props(destination, source.GetPropsAsDict())

datamol.mol.decrease_bond(bond)

Remove one single bond from the input bond. Note that you should first kekulize your molecules and remove non-standard bond.

Parameters:

Name Type Description Default
bond Bond

a bond.

required
Source code in datamol/mol.py
def decrease_bond(bond: Chem.rdchem.Bond) -> Optional[Union[list, Chem.rdchem.Bond]]:
    """Remove one single bond from the input bond. Note that you should
    first kekulize your molecules and remove non-standard bond.

    Args:
        bond: a bond.

    Returns:
        `DOUBLE_BOND` for a triple bond, `SINGLE_BOND` for a double bond and
        None for a single bond. For any other bond type the input `bond`
        object itself is returned.
    """
    bond_type = bond.GetBondType()

    if bond_type == TRIPLE_BOND:
        lowered = DOUBLE_BOND
    elif bond_type == DOUBLE_BOND:
        lowered = SINGLE_BOND
    elif bond_type == SINGLE_BOND:
        lowered = None
    else:
        # Not one of the three standard orders: hand the bond back untouched.
        lowered = bond

    return lowered

datamol.mol.enumerate_stereoisomers(mol, n_variants=20, undefined_only=False, rationalise=True)

Enumerate the stereocenters and bonds of the current molecule.

Original source: the openff-toolkit lib.

Warning: this function can be computationally intensive.

Parameters:

Name Type Description Default
mol

The molecule whose state we should enumerate.

required
n_variants int

The maximum amount of molecules that should be returned.

20
undefined_only bool

If we should enumerate all stereocenters and bonds or only those with undefined stereochemistry.

False
rationalise bool

If we should try to build and rationalise the molecule to ensure it can exist.

True
Source code in datamol/mol.py
def enumerate_stereoisomers(
    mol,
    n_variants: int = 20,
    undefined_only: bool = False,
    rationalise: bool = True,
):
    """Enumerate the stereocenters and bonds of the current molecule.

    Original source: the `openff-toolkit` lib.

    Warning: this function can be computationally intensive.

    Args:
        mol: The molecule whose state we should enumerate.
        n_variants: The maximum amount of molecules that should be returned.
        undefined_only: If we should enumerate all stereocenters and bonds or only those
            with undefined stereochemistry.
        rationalise: If we should try to build and rationalise the molecule to ensure it
            can exist.

    Returns:
        A list of stereoisomers. The list is empty when the enumeration fails.
    """
    from rdkit.Chem.EnumerateStereoisomers import EnumerateStereoisomers
    from rdkit.Chem.EnumerateStereoisomers import StereoEnumerationOptions

    # safety first
    mol = copy_mol(mol)

    # in case any bonds/centers are missing stereo chem flag it here
    Chem.AssignStereochemistry(mol, force=False, flagPossibleStereoCenters=True, cleanIt=True)  # type: ignore
    Chem.FindPotentialStereoBonds(mol, cleanIt=True)  # type: ignore

    # set up the options
    stereo_opts = StereoEnumerationOptions(
        tryEmbedding=rationalise,
        onlyUnassigned=undefined_only,
        maxIsomers=n_variants,
    )

    try:
        isomers = tuple(EnumerateStereoisomers(mol, options=stereo_opts))
    except Exception:
        # NOTE(hadim): often got "Stereo atoms should be specified before specifying CIS/TRANS bond stereochemistry"
        # for the ligand of reference (coming from the PDB). Not sure how to handle that.
        # Only `Exception` is caught so that KeyboardInterrupt/SystemExit still propagate.
        isomers = []

    variants = []
    for isomer in isomers:
        # isomer has CIS/TRANS tags so convert back to E/Z
        Chem.SetDoubleBondNeighborDirections(isomer)  # type: ignore
        Chem.AssignStereochemistry(isomer, force=True, cleanIt=True)  # type: ignore
        variants.append(isomer)

    return variants

datamol.mol.enumerate_tautomers(mol, n_variants=20)

Enumerate the possible tautomers of the current molecule.

Original source: the openff-toolkit lib.

Parameters:

Name Type Description Default
mol Mol

The molecule whose state we should enumerate.

required
n_variants int

The maximum amount of molecules that should be returned.

20
Source code in datamol/mol.py
def enumerate_tautomers(mol: Chem.rdchem.Mol, n_variants: int = 20):
    """List tautomeric forms of a molecule.

    Adapted from the `openff-toolkit` lib.

    Args:
        mol: The molecule whose tautomers should be enumerated. The enumeration
            runs on a copy of it.
        n_variants: Maximum number of tautomers the enumerator is configured
            to produce.

    Returns:
        A list with the enumerated tautomers.
    """
    # Work on a private copy so the enumerator never sees the caller's object.
    working_copy = copy_mol(mol)

    tautomer_enumerator = rdMolStandardize.TautomerEnumerator()
    tautomer_enumerator.SetMaxTautomers(n_variants)

    return list(tautomer_enumerator.Enumerate(working_copy))

datamol.mol.fix_mol(mol, n_iter=1, remove_singleton=False, largest_only=False, inplace=False)

Fix error in molecule using a greedy approach.

Parameters:

Name Type Description Default
mol Mol

input molecule to fix

required
n_iter int

Number of valence fix iteration to apply

1
remove_singleton bool

Whether adjust_singleton should be applied

False
largest_only bool

Whether only the largest fragment should be kept

False
inplace bool

Whether to return a copy of the mol or perform in place operation

False

Returns:

Type Description
Optional[rdkit.Chem.rdchem.Mol]

Fixed molecule.

Source code in datamol/mol.py
def fix_mol(
    mol: Chem.rdchem.Mol,
    n_iter: int = 1,
    remove_singleton: bool = False,
    largest_only: bool = False,
    inplace: bool = False,
) -> Optional[Chem.rdchem.Mol]:
    """Fix error in molecule using a greedy approach.

    The molecule is sanitized (falling back to the unsanitized input when that
    fails), stripped of dummy atoms, passed `n_iter` times through
    `fix_valence`, and optionally post-processed.

    Args:
        mol: input molecule to fix
        n_iter: Number of valence fix iteration to apply
        remove_singleton: Whether `adjust_singleton` should be applied
        largest_only: Whether only the largest fragment should be kept
        inplace: Whether to return a copy of the mol or perform in place operation

    Returns:
        Fixed molecule, or None when the input is None or when `fix_valence`
        fails on one of the iterations.
    """

    if not inplace:
        mol = copy.copy(mol)

    # Fall back to `mol` itself when the sanitizer cannot produce a molecule.
    m = sanitize_mol(mol) or mol
    if m is None:
        return None

    m = remove_dummies(m)
    for _ in range(n_iter):
        m = fix_valence(m)
        if m is None:
            # `fix_valence` reports failure with None. Feeding that None into
            # another iteration or into the post-processing steps below would
            # raise, so stop here and report the failure to the caller.
            return None

    if remove_singleton:
        m = adjust_singleton(m)

    if largest_only:
        # Keep the largest fragment only, skipping the standardizer's own cleanup.
        m = rdMolStandardize.FragmentParent(m, skipStandardize=True)

    return m

datamol.mol.fix_valence(mol, inplace=False, allow_ring_break=False)

Identify and try to fix valence issues by removing any supplemental bond that should not be in the graph.

Parameters:

Name Type Description Default
mol

input molecule with incorrect valence for some atoms

required
inplace bool

Whether to modify in place or make a copy

False
allow_ring_break bool

Whether bond removal involving ring is allowed.

False

Returns:

Type Description
Optional[rdkit.Chem.rdchem.Mol]

The molecule with potential valence issues fixed, the original molecule when nothing is broken, or None if the fix failed.

Source code in datamol/mol.py
def fix_valence(
    mol: Chem.rdchem.Mol, inplace: bool = False, allow_ring_break: bool = False
) -> Optional[Chem.rdchem.Mol]:
    """Identify and try to fix valence issues by removing any supplemental bond
    that should not be in the graph.

    Args:
        mol: input molecule with incorrect valence for some atoms
        inplace: Whether to modify in place or make a copy
        allow_ring_break: Whether bond removal involving ring is allowed.

    Returns:
        The input molecule (or its copy when `inplace` is False) when the
        validation reports nothing, the repaired molecule otherwise, or None
        if an exception was raised while fixing.
    """
    if not inplace:
        mol = copy.copy(mol)

    vm = rdMolStandardize.RDKitValidation()
    if len(vm.validate(mol)) == 0:  # don't fix something that is not broken
        return mol

    # NOTE(review): the repair below works on the molecule returned by
    # `Chem.RemoveHs`, so `inplace` appears to matter only for the early
    # return above - confirm.
    try:
        # Strip hydrogens without sanitizing, then compute the property cache
        # in non-strict mode so valence queries work on the broken molecule.
        m = Chem.RemoveHs(
            mol,
            implicitOnly=False,
            updateExplicitCount=True,
            sanitize=False,
        )
        m.UpdatePropertyCache(False)

        # first pass using explicit false count
        # While an atom is still over-valent and has hydrogens left, drop one
        # hydrogen and lower its formal charge by one (never below zero).
        for atom in m.GetAtoms():
            while incorrect_valence(atom) and atom.GetTotalNumHs() > 0:
                cur_hydrogen = atom.GetTotalNumHs()
                atom.SetNumExplicitHs(max(0, cur_hydrogen - 1))
                atom.SetFormalCharge(max(0, atom.GetFormalCharge() - 1))
                # atom.SetNumRadicalElectrons(0)
            atom.UpdatePropertyCache(False)

        # Second pass: work on an editable copy and collect every bond that
        # touches at least one atom still flagged by `incorrect_valence`.
        em = Chem.RWMol(m)
        bonds = em.GetBonds()
        bonds = [
            bond
            for bond in bonds
            if any(
                [
                    incorrect_valence(bond.GetBeginAtom()),
                    incorrect_valence(bond.GetEndAtom()),
                ]
            )
        ]
        for bond in bonds:
            a1 = bond.GetBeginAtom()
            a2 = bond.GetEndAtom()
            # Re-checked here, presumably because an earlier iteration may
            # already have repaired one of the two atoms.
            if incorrect_valence(a1) or incorrect_valence(a2):
                # `decrease_bond` is defined elsewhere; its result is used as the
                # replacement bond type, and None means no bond is added back.
                mbond = decrease_bond(bond)
                # Edit the bond when ring breaking is allowed, when a replacement
                # bond type exists (the atoms stay connected), or when the bond
                # is not part of a ring.
                if allow_ring_break or (mbond or not bond.IsInRing()):
                    em.RemoveBond(a1.GetIdx(), a2.GetIdx())
                    if mbond is not None:
                        em.AddBond(a1.GetIdx(), a2.GetIdx(), mbond)
            a1.UpdatePropertyCache(False)
            a2.UpdatePropertyCache(False)
        m = em.GetMol()

    except Exception:
        # Any failure during the repair is reported to the caller as None.
        return None

    return m

datamol.mol.fix_valence_charge(mol, inplace=False)

Fix valence issues that are due to incorrect charges.

Parameters:

Name Type Description Default
mol Mol

Input molecule with incorrect valence for some atoms

required
inplace bool

Whether to modify in place or make a copy.

False

Returns:

Type Description
Optional[rdkit.Chem.rdchem.Mol]

Fixed molecule via charge correction or original molecule if failed.

Source code in datamol/mol.py
def fix_valence_charge(mol: Chem.rdchem.Mol, inplace: bool = False) -> Optional[Chem.rdchem.Mol]:
    """Repair valence problems caused by wrong formal charges.

    Args:
        mol: Input molecule with incorrect valence for some atoms
        inplace: Whether to modify in place or make a copy.

    Returns:
        The untouched input when the validation reports nothing; otherwise the
        molecule (or its copy) with each atom's formal charge set to
        `implicit valence + explicit valence - default valence`.
    """
    validator = rdMolStandardize.RDKitValidation()
    if len(validator.validate(mol)) == 0:
        # Nothing reported: leave the molecule exactly as it is.
        return mol

    target = mol if inplace else copy.copy(mol)

    # Non-strict cache update so valences can be read on the broken molecule.
    target.UpdatePropertyCache(False)
    for atom in target.GetAtoms():
        total_valence = atom.GetImplicitValence() + atom.GetExplicitValence()
        default_valence = dm.PERIODIC_TABLE.GetDefaultValence(atom.GetSymbol())
        atom.SetFormalCharge(total_valence - default_valence)

    return target

datamol.mol.incorrect_valence(a, update=False)

Check whether an atom has an invalid valence, or whether a molecule fails the RDKit validation.

Parameters:

Name Type Description Default
a Union[rdkit.Chem.rdchem.Mol, rdkit.Chem.rdchem.Atom]

atom or molecule to check for valence issue.

required
update bool

Update owning molecule property cache first.

False

Returns:

Type Description
bool

True when the input atom (or molecule) has an incorrect valence, False otherwise.

Source code in datamol/mol.py
def incorrect_valence(a: Union[Chem.rdchem.Mol, Chem.rdchem.Atom], update: bool = False) -> bool:
    """Tell whether an atom, or a whole molecule, has a valence problem.

    Args:
        a: atom or molecule to check for valence issue.
        update: For an atom input, refresh the property cache of the owning
            molecule before checking.

    Returns:
        True when a problem is detected. For a molecule this means the RDKit
        validation reports at least one issue. For an atom it means there is no
        implicit valence left and the explicit valence exceeds the largest
        valence listed for the element.
    """
    if isinstance(a, Chem.rdchem.Mol):
        a.UpdatePropertyCache(False)
        validator = rdMolStandardize.RDKitValidation()
        return len(validator.validate(a)) > 0

    if update:
        a.GetOwningMol().UpdatePropertyCache(False)

    # An atom that still has implicit valence available is never flagged.
    if a.GetImplicitValence() != 0:
        return False

    explicit_valence = a.GetExplicitValence()
    return explicit_valence > max(PERIODIC_TABLE.GetValenceList(a.GetSymbol()))

datamol.mol.is_transition_metal(at)

Check if atom is a transition metal.

Parameters:

Name Type Description Default
at Atom

an atom.

required
Source code in datamol/mol.py
def is_transition_metal(at: Chem.rdchem.Atom) -> bool:
    """Tell whether an atom counts as a transition metal.

    The check is purely on the atomic number, which must fall in one of the
    inclusive ranges 22-29, 40-47 or 72-79.

    Args:
        at: an atom.

    Returns:
        True when the atomic number lies in one of the ranges above.
    """
    atomic_num = at.GetAtomicNum()
    return 22 <= atomic_num <= 29 or 40 <= atomic_num <= 47 or 72 <= atomic_num <= 79

datamol.mol.keep_largest_fragment(mol)

Only keep largest fragment of each molecule.

Source code in datamol/mol.py
def keep_largest_fragment(mol: Chem.rdchem.Mol) -> Optional[Chem.rdchem.Mol]:
    """Return the fragment of `mol` that has the most atoms.

    Args:
        mol: a molecule.

    Returns:
        The fragment with the highest atom count, or `mol` itself when no
        fragment is found.
    """
    fragments = rdmolops.GetMolFrags(mol, asMols=True)
    return max(fragments, key=lambda fragment: fragment.GetNumAtoms(), default=mol)

datamol.mol.randomize_atoms(mol)

Randomize the position of the atoms in a mol.

Parameters:

Name Type Description Default
mol Mol

a molecule.

required

Returns:

Type Description
Optional[rdkit.Chem.rdchem.Mol]

mol: a molecule.

Source code in datamol/mol.py
def randomize_atoms(mol: Chem.rdchem.Mol) -> Optional[Chem.rdchem.Mol]:
    """Shuffle the atom ordering of a molecule.

    Args:
        mol: a molecule.

    Returns:
        The molecule renumbered with a random atom order, or the input itself
        when it has no atoms.
    """
    n_atoms = mol.GetNumAtoms()
    if n_atoms == 0:
        return mol

    # Random permutation of the atom indices, drawn from the `random` module.
    shuffled_order = list(range(n_atoms))
    random.shuffle(shuffled_order)
    return Chem.RenumberAtoms(mol, shuffled_order)

datamol.mol.remove_dummies(mol, dummy='*')

Remove dummy atoms from molecules.

Source code in datamol/mol.py
def remove_dummies(mol: Chem.rdchem.Mol, dummy: str = "*") -> Optional[Chem.rdchem.Mol]:
    """Strip dummy atoms from a molecule.

    Dummies are first swapped for hydrogens, which are then removed. If that
    raises, the dummy substructure is deleted directly instead.

    Args:
        mol: a molecule.
        dummy: SMILES of the dummy atom to remove.

    Returns:
        The molecule without the dummy atoms.
    """
    dummy_query = dm.to_mol(dummy)
    try:
        capped = Chem.ReplaceSubstructs(mol, dummy_query, dm.to_mol("[H]"), True)[0]
        return Chem.RemoveHs(capped)
    except Exception:
        # Fallback: delete the dummies outright.
        return Chem.DeleteSubstructs(mol, dummy_query)

datamol.mol.reorder_atoms(mol, break_ties=True, include_chirality=True, include_isotopes=True)

Reorder the atoms in a mol. It ensures a single atom order for the same molecule, regardless of its original representation.

Parameters:

Name Type Description Default
mol Mol

a molecule.

required
break_ties bool

Force breaking of ranked ties.

True
include_chirality bool

Use chiral information when computing rank.

True
include_isotopes bool

Use isotope information when computing rank.

True

Returns:

Type Description
Optional[rdkit.Chem.rdchem.Mol]

mol: a molecule.

Source code in datamol/mol.py
def reorder_atoms(
    mol: Chem.rdchem.Mol,
    break_ties: bool = True,
    include_chirality: bool = True,
    include_isotopes: bool = True,
) -> Optional[Chem.rdchem.Mol]:
    """Renumber the atoms of a molecule following their canonical rank, so that the
    same molecule gets a single atom order regardless of its original representation.

    Args:
        mol: a molecule.
        break_ties: Force breaking of ranked ties.
        include_chirality: Use chiral information when computing rank.
        include_isotopes: Use isotope information when computing rank.

    Returns:
        mol: the renumbered molecule, or the input itself when it has no atoms.
    """
    if mol.GetNumAtoms() == 0:
        return mol

    ranks = list(
        Chem.CanonicalRankAtoms(
            mol,
            breakTies=break_ties,
            includeChirality=include_chirality,
            includeIsotopes=include_isotopes,
        )
    )

    # Atom indices ordered by increasing rank. The sort is stable, so atoms
    # sharing a rank stay in their original index order.
    atom_order = sorted(range(len(ranks)), key=ranks.__getitem__)
    return Chem.RenumberAtoms(mol, atom_order)

datamol.mol.replace_dummies_atoms(mol, atom='C', dummy='*', replace_all=True)

Replace dummy atoms in a molecule with another atom.

Parameters:

Name Type Description Default
mol Mol

molecule with dummies

required
atom str

replacement atom, default is carbon

'C'
dummy str

dummy atom representation

'*'
replace_all bool

Whether to replace all dummies

True

Returns:

Type Description
Optional[rdkit.Chem.rdchem.Mol]

mol: Molecule with dummy replaced

Source code in datamol/mol.py
def replace_dummies_atoms(
    mol: Chem.rdchem.Mol,
    atom: str = "C",
    dummy: str = "*",
    replace_all: bool = True,
) -> Optional[Chem.rdchem.Mol]:
    """Replace dummy atoms in a molecule with another atom.

    Args:
        mol: molecule with dummies
        atom: SMILES of the replacement atom, default is carbon
        dummy: SMILES of the dummy atom
        replace_all: Whether to replace all dummies

    Returns:
        mol: The first product of the substructure replacement.
    """
    dummy_query = Chem.MolFromSmiles(dummy)
    substitute = Chem.MolFromSmiles(atom)
    products = Chem.ReplaceSubstructs(mol, dummy_query, substitute, replaceAll=replace_all)
    return products[0]

datamol.mol.sanitize_first(mols, charge_neutral=False, sanifix=True)

Sanitize a list of molecules and return the first valid molecule seen in the list.

Parameters:

Name Type Description Default
mols List[rdkit.Chem.rdchem.Mol]

a list of molecules.

required
charge_neutral bool

whether charge neutralization should be applied.

False
sanifix bool

whether to run the sanifix from James Davidson (sanifix4.py) that try to adjust aromatic nitrogens.

True

Returns:

Type Description
mol

a molecule.

Source code in datamol/mol.py
def sanitize_first(mols: List[Chem.rdchem.Mol], charge_neutral: bool = False, sanifix: bool = True):
    """Sanitize molecules one by one and return the first that sanitizes successfully.

    Args:
        mols: a list of molecules.
        charge_neutral: whether charge neutralization should be applied.
        sanifix: whether to run the sanifix from James Davidson
            (sanifix4.py) that tries to adjust aromatic nitrogens.

    Returns:
        mol: the first sanitized molecule, or None when none of the inputs
        could be sanitized.
    """
    # Lazy pipeline: molecules after the first success are never sanitized.
    sanitized = (
        sanitize_mol(candidate, charge_neutral=charge_neutral, sanifix=sanifix)
        for candidate in mols
    )
    return next((result for result in sanitized if result), None)

datamol.mol.sanitize_mol(mol, charge_neutral=False, sanifix=True, verbose=True, add_hs=False)

An augmented version of RDKit sanitize=True. It uses a mol-SMILES-mol conversion to catch potential aromaticity errors and try to fix aromatic nitrogen (using the popular sanifix4 script). Optionally, it can neutralize the charge of the molecule.

Note #1: Only the first conformer (if present) will be preserved and a warning will be displayed if more than one conformer is detected.

Note #2: The molecule's properties will be preserved but the atom's properties will be lost.

Parameters:

Name Type Description Default
mol Mol

a molecule.

required
charge_neutral bool

whether charge neutralization should be applied.

False
sanifix bool

whether to run the sanifix from James Davidson (sanifix4.py) that try to adjust aromatic nitrogens.

True
verbose bool

Whether displaying a warning about multiple conformers.

True
add_hs bool

Add hydrogens to the returned molecule. Useful when the input molecule already contains hydrogens.

False

Returns:

Type Description
Optional[rdkit.Chem.rdchem.Mol]

mol: a molecule.

Source code in datamol/mol.py
def sanitize_mol(
    mol: Chem.rdchem.Mol,
    charge_neutral: bool = False,
    sanifix: bool = True,
    verbose: bool = True,
    add_hs: bool = False,
) -> Optional[Chem.rdchem.Mol]:
    """Sanitize a molecule more thoroughly than RDKit's `sanitize=True`.

    The molecule is round-tripped through SMILES to catch potential aromaticity
    errors, after optionally neutralizing its charges and running the sanifix4
    script on it to try to fix aromatic nitrogens.

    Note #1: Only the first conformer (if present) will be preserved and
    a warning will be displayed if more than one conformer is detected.

    Note #2: The molecule's properties will be preserved but the atom's
    properties will be lost.

    Args:
        mol: a molecule.
        charge_neutral: whether charge neutralization should be applied.
        sanifix: whether to run the sanifix from James Davidson
            (sanifix4.py) that tries to adjust aromatic nitrogens.
        verbose: Whether displaying a warning about multiple conformers.
        add_hs: Add hydrogens to the returned molecule. Useful when the input
            molecule already contains hydrogens.

    Returns:
        mol: the sanitized molecule, or None when the input is None or the
        sanitization fails.
    """
    if mol is None:
        return mol

    # Read the molecule-level properties from a copy so they can be restored
    # on the molecule rebuilt from SMILES.
    saved_props = copy_mol(mol).GetPropsAsDict()

    if charge_neutral:
        mol = to_neutral(mol)

    if sanifix:
        mol = _sanifix4.sanifix(mol)

    if mol is None:
        return None

    if verbose and mol.GetNumConformers() > 1:
        logger.warning(
            "The molecule contains multiple conformers. Only the first one will be preserved."
        )

    # The round trip may raise on occasional aromaticity errors.
    try:
        # `cxsmiles` is used here to preserve the first conformer.
        rebuilt = to_mol(dm.to_smiles(mol, cxsmiles=True), sanitize=True, add_hs=add_hs)  # type: ignore
    except Exception:
        return None

    if rebuilt is None:
        return None

    return dm.set_mol_props(rebuilt, saved_props)

datamol.mol.sanitize_smiles(smiles, isomeric=True)

Takes SMILES string and returns its sanitized version.

Parameters:

Name Type Description Default
smiles str

smiles to be sanitized.

required
isomeric bool

Whether to include information about stereochemistry in the SMILES.

True

Returns:

Type Description
Optional[str]

sanitized smiles.

Source code in datamol/mol.py
def sanitize_smiles(smiles: str, isomeric: bool = True) -> Optional[str]:
    """Takes SMILES string and returns its sanitized version.

    Args:
        smiles: smiles to be sanitized.
        isomeric: Whether to include information about stereochemistry in the SMILES.

    Returns:
        The sanitized smiles, or None when the input cannot be parsed,
        sanitized or written back to SMILES.
    """
    try:
        # Parse without RDKit sanitization; `sanitize_mol` does the actual work
        # (charge neutralization disabled).
        mol = dm.to_mol(smiles, sanitize=False)
        mol = dm.sanitize_mol(mol, False)
    except Exception:
        return None

    if mol is None:
        return None

    try:
        smiles = dm.to_smiles(mol, isomeric=isomeric)  # type: ignore
    except Exception:
        # `Exception` rather than a bare `except`, so KeyboardInterrupt and
        # SystemExit still propagate to the caller.
        return None
    return smiles

datamol.mol.set_dative_bonds(mol, from_atoms=(7, 8))

Replaces some single bonds between metals and atoms with atomic numbers in fromAtoms with dative bonds. The replacement is only done if the atom has "too many" bonds.

Parameters:

Name Type Description Default
mol Mol

molecule with bond to modify

required
from_atoms Tuple[int, int]

List of atoms (symbol or atomic number) to consider for bond replacement. By default, only Nitrogen (7) and Oxygen (8) are considered.

(7, 8)

Returns:

Type Description
Optional[rdkit.Chem.rdchem.Mol]

The modified molecule.

Source code in datamol/mol.py
def set_dative_bonds(
    mol: Chem.rdchem.Mol, from_atoms: Tuple[int, int] = (7, 8)
) -> Optional[Chem.rdchem.Mol]:
    """Turn single bonds between a transition metal and selected neighbor atoms into
    dative bonds. A bond is only converted when the neighbor has "too many" bonds,
    i.e. its explicit valence is above the default valence of its element.

    Arguments:
        mol: molecule with bond to modify
        from_atoms: List of atoms  (symbol or atomic number) to consider for bond replacement.
            By default, only Nitrogen (7) and Oxygen (8) are considered.

    Returns:
        The modified molecule, as an editable copy of the input.
    """
    rwmol = Chem.RWMol(mol)
    rwmol.UpdatePropertyCache(strict=False)

    metals = [atom for atom in rwmol.GetAtoms() if is_transition_metal(atom)]
    for metal in metals:
        metal_idx = metal.GetIdx()
        for nbr in metal.GetNeighbors():
            # Only neighbors listed in `from_atoms` (by number or symbol) qualify.
            if not (nbr.GetAtomicNum() in from_atoms or nbr.GetSymbol() in from_atoms):
                continue

            # Skip neighbors that do not exceed their default valence.
            default_valence = PERIODIC_TABLE.GetDefaultValence(nbr.GetAtomicNum())
            if not nbr.GetExplicitValence() > default_valence:
                continue

            nbr_idx = nbr.GetIdx()
            if rwmol.GetBondBetweenAtoms(nbr_idx, metal_idx).GetBondType() != SINGLE_BOND:
                continue

            # Swap the single bond for a dative bond from the neighbor to the metal.
            rwmol.RemoveBond(nbr_idx, metal_idx)
            rwmol.AddBond(nbr_idx, metal_idx, DATIVE_BOND)
    return rwmol

datamol.mol.set_mol_props(mol, props, copy=False)

Set properties to a mol from a dict.

Parameters:

Name Type Description Default
mol Mol

the mol where to copy the props.

required
props Dict[str, Any]

the props to copy.

required
copy bool

whether to copy the provided mol

False
Source code in datamol/mol.py
def set_mol_props(
    mol: Chem.rdchem.Mol,
    props: Dict[str, Any],
    copy: bool = False,
) -> Chem.rdchem.Mol:
    """Store the entries of a dict as properties on a molecule.

    Booleans, integers and floats are stored with the matching typed setter.
    Any other value is stored as its `str()` representation.

    Args:
        mol: the mol where to copy the props.
        props: the props to copy.
        copy: whether to copy the provided mol

    Returns:
        The molecule carrying the properties (a copy when `copy` is True).
    """

    if copy is True:
        mol = dm.copy_mol(mol)

    # Order matters: `bool` is a subclass of `int`, so it has to be tested first.
    typed_setters = (
        (bool, "SetBoolProp"),
        (int, "SetIntProp"),
        (float, "SetDoubleProp"),
    )

    for key, value in props.items():
        for value_type, setter_name in typed_setters:
            if isinstance(value, value_type):
                getattr(mol, setter_name)(key, value)
                break
        else:
            mol.SetProp(key, str(value))

    return mol

datamol.mol.standardize_mol(mol, disconnect_metals=False, normalize=True, reionize=True, uncharge=False, stereo=True)

This function returns a standardized version of the given molecule, with or without disconnecting the metals. The steps are applied in the order of the arguments.

Parameters:

Name Type Description Default
mol Mol

The molecule to standardize.

required
disconnect_metals bool

Whether to disconnect the metallic atoms from non-metals

False
normalize bool

Whether to apply normalization (correct functional groups and recombine charges).

True
reionize bool

Whether to apply molecule reionization

True
uncharge bool

Whether to remove all charge from molecule

False
stereo bool

Whether to attempt to assign stereochemistry

True

Returns:

Type Description
mol

The standardized molecule.

Source code in datamol/mol.py
def standardize_mol(
    mol: Chem.rdchem.Mol,
    disconnect_metals: bool = False,
    normalize: bool = True,
    reionize: bool = True,
    uncharge: bool = False,
    stereo: bool = True,
):
    """Return a standardized version of the given molecule.

    The enabled steps run on a copy of the input, in the order of the arguments.

    Arguments:
        mol: The molecule to standardize.
        disconnect_metals: Whether to disconnect the metallic atoms from non-metals
        normalize: Whether to apply normalization (correct functional groups and recombine charges).
        reionize: Whether to apply molecule reionization
        uncharge: Whether to remove all charge from molecule
        stereo: Whether to attempt to assign stereochemistry

    Returns:
        mol: The standardized molecule.
    """
    standardized = copy_mol(mol)

    if disconnect_metals:
        standardized = rdMolStandardize.MetalDisconnector().Disconnect(standardized)

    if normalize:
        standardized = rdMolStandardize.Normalize(standardized)

    if reionize:
        standardized = rdMolStandardize.Reionizer().reionize(standardized)

    if uncharge:
        standardized = rdMolStandardize.Uncharger().uncharge(standardized)

    if stereo:
        # Applied directly on `standardized`; the call's return value is not used.
        Chem.AssignStereochemistry(standardized, force=False, cleanIt=True)

    return standardized

datamol.mol.standardize_smiles(smiles, tautomer=False)

Apply the SMILES standardization procedure. This is a convenience function wrapped around the RDKit SMILES standardizer and tautomeric canonicalization.

Parameters:

Name Type Description Default
smiles str

Smiles to standardize

required
tautomer bool

Whether to canonicalize tautomers

False

Returns:

Type Description
standard_smiles

the standardized smiles

Source code in datamol/mol.py
def standardize_smiles(smiles: str, tautomer: bool = False):
    """Standardize a SMILES string.

    Convenience wrapper around the RDKit SMILES standardizer, optionally
    followed by tautomer canonicalization.

    Args:
        smiles: Smiles to standardize
        tautomer: Whether to canonicalize tautomers

    Returns:
        standard_smiles: the standardized smiles
    """
    standardized = rdMolStandardize.StandardizeSmiles(smiles)
    return canonicalize_tautomer_smiles(standardized) if tautomer else standardized

datamol.mol.to_mol(mol, add_hs=False, explicit_only=False, ordered=False, kekulize=False, sanitize=True)

Convert an input molecule (smiles representation) into a Chem.rdchem.Mol.

Parameters:

Name Type Description Default
mol str

SMILES of a molecule or a molecule.

required
add_hs bool

Whether hydrogens should be added the molecule.

False
explicit_only bool

Whether to only add explicit hydrogen or both (implicit and explicit). when add_hs is set to True.

False
ordered bool

Whether the atom should be ordered. This option is important if you want to ensure that the features returned will always maintain a single atom order for the same molecule, regardless of its original SMILES representation.

False
kekulize bool

Whether to perform kekulization of the input molecules.

False
sanitize bool

Whether to apply rdkit sanitization when input is a SMILES.

True

Returns:

Type Description
Optional[rdkit.Chem.rdchem.Mol]

mol: the molecule if some conversion have been made. If the conversion fails None is returned so make sure that you handle this case on your own.

Source code in datamol/mol.py
def to_mol(
    mol: Union[str, Chem.rdchem.Mol],
    add_hs: bool = False,
    explicit_only: bool = False,
    ordered: bool = False,
    kekulize: bool = False,
    sanitize: bool = True,
) -> Optional[Chem.rdchem.Mol]:
    """Convert an input molecule (smiles representation) into a `Chem.rdchem.Mol`.

    Args:
        mol: SMILES of a molecule or a molecule.
        add_hs: Whether hydrogens should be added the molecule.
        explicit_only: Whether to only add explicit hydrogen or both
            (implicit and explicit). when `add_hs` is set to True.
        ordered: Whether the atom should be ordered. This option is
            important if you want to ensure that the features returned will always maintain
            a single atom order for the same molecule, regardless of its original SMILES representation.
        kekulize: Whether to perform kekulization of the input molecules. The
            kekulization is applied on the molecule object itself, so a `Mol`
            passed as input is modified unless `add_hs` or `ordered` already
            produced a new object.
        sanitize: Whether to apply rdkit sanitization when input is a SMILES.

    Returns:
        mol: the molecule if some conversion have been made. If the conversion fails
        None is returned so make sure that you handle this case on your own.

    Raises:
        ValueError: if `mol` is neither a string nor a `Chem.rdchem.Mol`.
    """

    if not isinstance(mol, (str, Chem.rdchem.Mol)):
        raise ValueError(f"Input should be a Chem.rdchem.Mol or a string instead of '{type(mol)}'")

    if isinstance(mol, str):
        _mol = Chem.MolFromSmiles(mol, sanitize=sanitize)

        if not sanitize and _mol is not None:
            # Parsing without sanitization skips the property cache; compute it
            # in non-strict mode so downstream valence queries work.
            _mol.UpdatePropertyCache(False)
    else:
        _mol = mol

    # A failed SMILES parse yields None: nothing more to do.
    if _mol is None:
        return None

    # Add hydrogens
    if add_hs:
        _mol = Chem.AddHs(_mol, explicitOnly=explicit_only, addCoords=True)

    # Reorder atoms
    if _mol is not None and ordered:
        _mol = reorder_atoms(_mol)

    if _mol is not None and kekulize:
        Chem.Kekulize(_mol, clearAromaticFlags=False)
    return _mol

datamol.mol.to_neutral(mol)

Neutralize the charge of a molecule.

Parameters:

Name Type Description Default
mol Mol

a molecule.

required

Returns:

Type Description
Optional[rdkit.Chem.rdchem.Mol]

mol: a molecule.

Source code in datamol/mol.py
def to_neutral(mol: Chem.rdchem.Mol) -> Optional[Chem.rdchem.Mol]:
    """Neutralize the charge of a molecule.

    Formal charges are reset to zero on every negatively charged atom, and on
    every positively charged atom whose explicit valence is at least the
    default valence of its element. The atoms are edited on the molecule that
    was passed in.

    Args:
        mol: a molecule.

    Returns:
        mol: the same molecule object, or None when the input is None.
    """
    if mol is None:
        return mol

    for atom in mol.GetAtoms():
        charge = atom.GetFormalCharge()
        if charge < 0:
            should_reset = True
        else:
            valence_reached = atom.GetExplicitValence() >= PERIODIC_TABLE.GetDefaultValence(
                atom.GetSymbol()
            )
            should_reset = valence_reached and charge > 0

        if should_reset:
            atom.SetFormalCharge(0)
            atom.UpdatePropertyCache(False)
    return mol

datamol.similarity

datamol.similarity.cdist(mols1, mols2, n_jobs=1, **fp_args)

Compute the pairwise tanimoto distance between the fingerprints of each pair of molecules of the two collections of inputs.

Parameters:

Name Type Description Default
mols1 List[rdkit.Chem.rdchem.Mol]

list of molecules.

required
mols2 List[rdkit.Chem.rdchem.Mol]

list of molecules.

required
n_jobs Optional[int]

Number of jobs for parallelization. Leave at 1 for no parallelization. Set to None to use all available cores.

1
**fp_args

list of args to pass to to_fp().

{}

Returns:

Type Description
ndarray

distmat

Source code in datamol/similarity.py
def cdist(
    mols1: List[Chem.rdchem.Mol],
    mols2: List[Chem.rdchem.Mol],
    n_jobs: Optional[int] = 1,
    **fp_args,
) -> np.ndarray:
    """Compute the tanimoto distance between the fingerprints of every molecule
    of the first collection and every molecule of the second one.

    Args:
        mols1: list of molecules.
        mols2: list of molecules.
        n_jobs: Number of jobs for parallelization. Leave at 1 for no
            parallelization. Set to None to use all available cores.
        **fp_args: list of args to pass to `to_fp()`.

    Returns:
        distmat: the distance matrix, with one row per molecule of `mols1` and
        one column per molecule of `mols2`.
    """
    # Same fingerprinting function for both collections.
    featurize = functools.partial(dm.to_fp, as_array=True, **fp_args)

    raw_fps1 = dm.parallelized(featurize, mols1, n_jobs=n_jobs)
    raw_fps2 = dm.parallelized(featurize, mols2, n_jobs=n_jobs)

    # The jaccard metric is used to obtain the tanimoto distance.
    return distance.cdist(np.array(raw_fps1), np.array(raw_fps2), metric="jaccard")

datamol.similarity.pdist(mols, n_jobs=1, **fp_args)

Compute the pairwise tanimoto distance between the fingerprints of all the molecules in the input set.

Parameters:

Name Type Description Default
mols List[rdkit.Chem.rdchem.Mol]

list of molecules

required
n_jobs Optional[int]

Number of jobs for parallelization. Leave at 1 for no parallelization. Set to None to use all available cores.

1
**fp_args

list of args to pass to to_fp().

{}

Returns:

Type Description
Tuple[numpy.ndarray, numpy.ndarray]

distmat, valid_idx: Distance matrix, and valid index that have passed the conversion to fingerprint.

Source code in datamol/similarity.py
def pdist(
    mols: List[Chem.rdchem.Mol], n_jobs: Optional[int] = 1, **fp_args
) -> Tuple[np.ndarray, np.ndarray]:
    """Compute the pairwise tanimoto distance between the fingerprints of all the
    molecules in the input set.

    Args:
        mols: list of molecules
        n_jobs: Number of jobs for parallelization. Leave at 1 for no
            parallelization. Set to None to use all available cores.
        **fp_args: list of args to pass to `to_fp()`.

    Returns:
        distmat, valid_idx: Symmetric distance matrix over the molecules whose
            fingerprint could be computed, and the indices of those molecules
            in `mols`. When no fingerprint is valid, an empty `(0, 0)` matrix
            and an empty index array are returned.
    """

    fps = dm.parallelized(
        functools.partial(dm.to_fp, as_array=False, **fp_args),
        mols,
        n_jobs=n_jobs,
    )

    # Drop the molecules for which no fingerprint could be computed.
    valid = [(i, fp) for i, fp in enumerate(fps) if fp is not None]
    if not valid:
        # `zip(*[])` cannot be unpacked into two names; return empty results.
        return np.zeros((0, 0)), np.array([], dtype=int)

    valid_idx, fps = zip(*valid)
    fps = list(fps)

    # `GetTanimotoDistMat` returns the lower triangle of the distance matrix,
    # row by row: (1,0), (2,0), (2,1), (3,0), ... That is the order produced by
    # `np.tril_indices_from(..., -1)`. Filling through the upper-triangle
    # indices would misplace values as soon as there are four or more molecules.
    dist = GetTanimotoDistMat(fps)
    dist_mat = np.zeros((len(fps), len(fps)))
    dist_mat[np.tril_indices_from(dist_mat, -1)] = dist
    dist_mat += dist_mat.T

    return dist_mat, np.array(valid_idx)