# Source code for xgi.algorithms.clustering

"""Algorithms for computing nodal clustering coefficients."""

from itertools import combinations

import numpy as np

from ..exception import XGIError
from ..linalg import adjacency_matrix

# Public API of this module: the names exported by a star-import.
# The helper ``_uv_cc`` is deliberately left out.
__all__ = [
    "clustering_coefficient",
    "local_clustering_coefficient",
    "two_node_clustering_coefficient",
]


def clustering_coefficient(H):
    r"""Return the clustering coefficients for
    each node in a Hypergraph.

    This clustering coefficient is defined as the
    clustering coefficient of the unweighted pairwise
    projection of the hypergraph, i.e.,
    :math:`c_i = A^3_{i,i} / \left(2\binom{k_i}{2}\right),`
    where :math:`A` is the adjacency matrix of the network
    and :math:`k_i` is the pairwise degree of :math:`i`.
    (:math:`A^3_{i,i}` counts every triangle through :math:`i`
    twice, once per direction, hence the factor of 2.)

    Parameters
    ----------
    H : Hypergraph
        Hypergraph

    Returns
    -------
    dict
        nodes are keys, clustering coefficients are values.

    Notes
    -----
    The clustering coefficient is undefined when the number of
    neighbors is 0 or 1, but we set the clustering coefficient
    to 0 in these cases. For more discussion, see
    https://arxiv.org/abs/0802.2512

    See Also
    --------
    local_clustering_coefficient
    two_node_clustering_coefficient

    References
    ----------
    "Clustering Coefficients in Protein Interaction Hypernetworks"
    by Suzanne Gallagher and Debra Goldberg.
    DOI: 10.1145/2506583.2506635

    Example
    -------
    >>> import xgi
    >>> H = xgi.random_hypergraph(3, [1, 1])
    >>> cc = xgi.clustering_coefficient(H)
    >>> cc
    {0: 1.0, 1: 1.0, 2: 1.0}
    """
    adj, index = adjacency_matrix(H, index=True)

    # With zero rows there is nothing to project (and a 0-length 1-D array
    # would make ``sum(axis=1)`` fail), so every node gets the "undefined"
    # value of 0. For a hypergraph without nodes this is simply {}.
    if adj.shape[0] == 0:
        return {n: 0 for n in H.nodes}

    # ``index`` maps matrix row -> node ID; invert it to find a node's row.
    row_of = {n: i for i, n in index.items()}

    # Flatten the row sums to a 1-D ndarray. For scipy ``spmatrix`` inputs
    # ``sum(axis=1)`` returns an (n, 1) ``np.matrix``, on which ``*`` is a
    # matrix product rather than the element-wise product needed below.
    k = np.asarray(adj.sum(axis=1)).ravel()
    denom = k * (k - 1) / 2  # binom(k, 2): number of neighbor pairs
    closed_walks = adj.dot(adj).dot(adj).diagonal()  # A^3_ii = 2 * triangles

    # k in {0, 1} gives 0 / 0 -> nan, which nan_to_num maps to 0 (see Notes).
    with np.errstate(divide="ignore", invalid="ignore"):
        result = np.nan_to_num(0.5 * closed_walks / denom)

    # Nodes without a row in the adjacency matrix get 0. Values are returned
    # as built-in floats rather than NumPy scalars.
    return {n: float(result[row_of[n]]) if n in row_of else 0 for n in H.nodes}


def local_clustering_coefficient(H):
    """Compute the local clustering coefficient.

    This clustering coefficient is based on the
    overlap of the edges connected to a given node,
    normalized by the size of the node's neighborhood.

    For every pair of edges containing the node, the "extra overlap" is
    the fraction of nodes belonging to exactly one of the two edges that
    are adjacent to a node belonging only to the other edge. The
    coefficient of the node is the mean extra overlap over all such pairs.

    Parameters
    ----------
    H : Hypergraph
        Hypergraph

    Returns
    -------
    dict
        keys are node IDs and values are the
        clustering coefficients.

    Notes
    -----
    The clustering coefficient is undefined when a node belongs to
    0 or 1 edges (there is no pair of edges to compare), but we set
    the clustering coefficient to 0 in these cases. For more
    discussion, see https://arxiv.org/abs/0802.2512

    See Also
    --------
    clustering_coefficient
    two_node_clustering_coefficient

    References
    ----------
    "Properties of metabolic graphs: biological organization or representation
    artifacts?"  by Wanding Zhou and Luay Nakhleh.
    https://doi.org/10.1186/1471-2105-12-132

    "Hypergraphs for predicting essential genes using multiprotein complex data"
    by Florian Klimm, Charlotte M. Deane, and Gesine Reinert.
    https://doi.org/10.1093/comnet/cnaa028

    Example
    -------
    >>> import xgi
    >>> H = xgi.random_hypergraph(3, [1, 1])
    >>> cc = xgi.local_clustering_coefficient(H)
    >>> {n: round(c, 3) for n, c in cc.items()}
    {0: 0.333, 1: 0.333, 2: 0.333}

    """
    result = {}
    memberships = H.nodes.memberships()

    for n in H.nodes:
        # Member sets of the edges that contain n, looked up by edge ID.
        # (Indexing the list of all edges by position 0..dv-1 would pick
        # unrelated edges.)
        incident = [set(H.edges.members(e)) for e in memberships[n]]
        dv = len(incident)
        if dv <= 1:
            # no pair of edges to compare; see Notes
            result[n] = 0
            continue

        total_eo = 0
        for edge1, edge2 in combinations(incident, 2):
            # nodes exclusive to each of the two edges
            D1 = edge1 - edge2
            D2 = edge2 - edge1
            # D1 and D2 are disjoint, so this is the size of their union
            n_exclusive = len(D1) + len(D2)
            if n_exclusive == 0:
                # identical edges: the extra overlap is zero by definition
                continue
            # everything adjacent to the exclusive part of each edge
            neighD1 = {i for d in D1 for i in H.nodes.neighbors(d)}
            neighD2 = {i for d in D2 for i in H.nodes.neighbors(d)}
            total_eo += (
                len(neighD1.intersection(D2)) + len(neighD2.intersection(D1))
            ) / n_exclusive

        # average over the dv * (dv - 1) / 2 pairs of edges
        result[n] = 2 * total_eo / (dv * (dv - 1))
    return result


def two_node_clustering_coefficient(H, kind="union"):
    """Return the clustering coefficients for
    each node in a Hypergraph.

    This definition averages over all of the
    two-node clustering coefficients involving the node,
    i.e., over the pairs formed by the node and each of
    its neighbors.

    Parameters
    ----------
    H : Hypergraph
        Hypergraph
    kind : string, optional
        The type of two node clustering coefficient. Options
        are "union", "max", and "min". By default, "union".

    Returns
    -------
    dict
        nodes are keys, clustering coefficients are values.

    Raises
    ------
    XGIError
        If an invalid clustering coefficient kind is specified
        (raised while evaluating the first node that has a neighbor).

    Notes
    -----
    The average is undefined for a node without neighbors;
    the clustering coefficient is set to 0 in that case.
    For more discussion, see
    https://arxiv.org/abs/0802.2512

    See Also
    --------
    clustering_coefficient
    local_clustering_coefficient

    References
    ----------
    "Clustering Coefficients in Protein Interaction Hypernetworks"
    by Suzanne Gallagher and Debra Goldberg.
    DOI: 10.1145/2506583.2506635

    Example
    -------
    >>> import xgi
    >>> H = xgi.random_hypergraph(3, [1, 1])
    >>> cc = xgi.two_node_clustering_coefficient(H, kind="union")
    >>> cc
    {0: 0.5, 1: 0.5, 2: 0.5}
    """
    memberships = H.nodes.memberships()
    result = {}
    for n in H.nodes:
        neighbors = H.nodes.neighbors(n)
        n_neighbors = len(neighbors)  # loop-invariant, so computed once
        # stays at 0.0 when n has no neighbors (the loop body never runs)
        result[n] = 0.0
        for v in neighbors:
            result[n] += _uv_cc(n, v, memberships, kind=kind) / n_neighbors
    return result


def _uv_cc(u, v, memberships, kind="union"):
    """Two-node clustering coefficient of the pair (u, v).

    The coefficient is the number of edges that contain both nodes,
    divided by a normalization selected with `kind`.

    Parameters
    ----------
    u : hashable
        First node
    v : hashable
        Second node
    memberships : dict
        Maps each node ID to the set of edge IDs the node belongs to.
    kind : str, optional
        Normalization to use, by default "union".
        Options:

        - "union": number of edges containing u or v
        - "max": the larger of the two nodes' edge counts
        - "min": the smaller of the two nodes' edge counts

    Returns
    -------
    float
        The clustering coefficient, or NaN when the
        normalization is zero.

    Raises
    ------
    XGIError
        If an invalid clustering coefficient kind
        is specified.

    References
    ----------
    "Clustering Coefficients in Protein Interaction Hypernetworks"
    by Suzanne Gallagher and Debra Goldberg.
    DOI: 10.1145/2506583.2506635
    """
    edges_u = memberships[u]
    edges_v = memberships[v]

    # edges shared by both nodes
    shared = len(edges_u.intersection(edges_v))

    if kind == "union":
        normalizer = len(edges_u.union(edges_v))
    elif kind in ("min", "max"):
        pick = min if kind == "min" else max
        normalizer = pick(len(edges_u), len(edges_v))
    else:
        raise XGIError("Invalid kind of clustering.")

    # A zero normalizer leaves the ratio undefined.
    return shared / normalizer if normalizer != 0 else np.nan