# Source code for xgi.stats

"""Statistics of networks, their nodes, and edges.

Any mapping that assigns some quantity to each node of a network is considered a node
statistic.  For example, the degree is a node-integer mapping, while a node attribute
that assigns a string label to each node is a node-string mapping.  The `stats` package
provides a common interface to all such mappings.

Each such mapping is accessible via the `H.nodes` view.  For example, the degree of all
nodes supports type conversion using the `as*` methods.

>>> import xgi
>>> H = xgi.Hypergraph([[1, 2, 3], [2, 3, 4, 5], [3, 4, 5]])
>>> H.nodes.degree.asdict()
{1: 1, 2: 2, 3: 3, 4: 2, 5: 2}
>>> H.nodes.degree.aslist()
[1, 2, 3, 2, 2]

Another feature is the ability to filter the nodes of a network by degree.

>>> H.nodes.filterby('degree', 2)
NodeView((2, 4, 5))

The power of the stats package is that any other node statistic that can be conceived of
as a node-quantity mapping is given the same interface.  For example, node attributes
get the same treatment:

>>> H.add_nodes_from([
...     (1, {"color": "red", "name": "horse"}),
...     (2, {"color": "blue", "name": "pony"}),
...     (3, {"color": "yellow", "name": "zebra"}),
...     (4, {"color": "red", "name": "orangutan", "age": 20}),
...     (5, {"color": "blue", "name": "fish", "age": 2}),
... ])
>>> H.nodes.attrs('color').asdict()
{1: 'red', 2: 'blue', 3: 'yellow', 4: 'red', 5: 'blue'}
>>> H.nodes.attrs('color').aslist()
['red', 'blue', 'yellow', 'red', 'blue']
>>> H.nodes.filterby_attr('color', 'red')
NodeView((1, 4))

Many other features are available, including edge-statistics, and user-defined
statistics.  For more details, see the `tutorial
<https://xgi.readthedocs.io/en/stable/api/tutorials/Tutorial%206%20-%20Statistics.html>`_.

"""

import numpy as np
import pandas as pd
from scipy.stats import moment as spmoment

from ..exception import IDNotFound
from ..utils import hist

from . import edgestats, diedgestats, dinodestats, nodestats

# Names exported by a star-import of this module: the four decorators that register
# user-defined stats and the two dispatch helpers defined further down.
__all__ = [
    "nodestat_func",
    "edgestat_func",
    "dinodestat_func",
    "diedgestat_func",
    "dispatch_stat",
    "dispatch_many_stats",
]


class IDStat:
    """Mapping between nodes or edges and a quantity or property.

    Parameters
    ----------
    network : object
        The network on which the stat is computed.  It is passed unchanged as the
        first argument of `func`.
    view : object
        The view of IDs this stat is restricted to.  It must support iteration,
        `len`, membership tests, and expose the IDs through an `ids` attribute.
    func : callable
        Called as `func(network, ids, *args, **kwargs)`; it must return a mapping
        from each ID to its value.
    args : tuple, optional
        Extra positional arguments forwarded to `func`.
    kwargs : dict, optional
        Extra keyword arguments forwarded to `func`.

    """

    def __init__(self, network, view, func, args=None, kwargs=None):
        self.view = view
        self.net = network
        self.args = () if args is None else args
        # Each default is decided by its own argument.  Testing `args` here would
        # drop `kwargs` whenever only `kwargs` is passed.
        self.kwargs = {} if kwargs is None else kwargs
        self.func = func

    def __call__(self, *args, **kwargs):
        """Return a new stat of the same class bound to the given arguments."""
        return self.__class__(self.net, self.view, self.func, args=args, kwargs=kwargs)

    def __getitem__(self, idx):
        """Return the value of the stat for a single ID.

        Raises
        ------
        IDNotFound
            If `idx` is not in the view.

        """
        if idx not in self.view:
            raise IDNotFound(f'ID "{idx}" not in this view')
        # Compute the stat for this single ID only.
        return self.func(self.net, [idx], *self.args, **self.kwargs)[idx]

    def __repr__(self):
        cls = self.__class__.__name__
        fnc = self.func.__name__
        out = f"{cls}('{fnc}'"
        if self.args:
            out += f", args={self.args}"
        if self.kwargs:
            out += f", kwargs={self.kwargs}"
        out += ")"
        return out

    def __len__(self):
        return len(self.view)

    @property
    def name(self):
        """Name of this stat.

        The name of a stat is used to populate the keys of dictionaries in `MultiStat`
        objects, as well as the names of columns of pandas dataframes.

        Examples
        --------
        >>> import xgi
        >>> H = xgi.Hypergraph([[1, 2, 3], [2, 3, 4, 5], [3, 4, 5]])
        >>> da, d3 = H.nodes.degree, H.nodes.degree(order=3)
        >>> da.name, d3.name
        ('degree', 'degree(order=3)')
        >>> H.nodes.multi([da, d3]).asdict(transpose=True).keys()
        dict_keys(['degree', 'degree(order=3)'])
        >>> H.nodes.multi([da, d3]).aspandas().columns
        Index(['degree', 'degree(order=3)'], dtype='object')

        """
        name = f"{self.func.__name__}"
        if self.args or self.kwargs:
            args = [f"{s}" for s in self.args]
            kwargs = [f"{k}={v}" for k, v in self.kwargs.items()]
            name += "(" + ", ".join(args + kwargs) + ")"
        return name

    def __iter__(self):
        """Iterate over `(ID, value)` pairs."""
        return iter(self._val.items())

    def items(self):
        """Return the `(ID, value)` pairs of this stat."""
        return self._val.items()

    @property
    def _val(self):
        # Recomputed on every access: the stat is not cached.
        return self.func(self.net, self.view.ids, *self.args, **self.kwargs)

    def asdict(self):
        """Output the stat as a dict.

        Notes
        -----
        The underlying function already returns a mapping, so no type conversion is
        needed; the result is only re-keyed to follow the order of the view.

        """
        val = self._val
        return {n: val[n] for n in self.view}

    def aslist(self):
        """Output the stat as a list, in the order of the view."""
        val = self._val
        return [val[n] for n in self.view]

    def asnumpy(self):
        """Output the stat as a numpy array."""
        return np.array(self.aslist())

    def aspandas(self):
        """Output the stat as a pandas series.

        Notes
        -----
        The `name` attribute of the returned series is set using the `name` property.

        """
        return pd.Series(self._val, name=self.name)

    def ashist(self, bins=10, bin_edges=False, density=False, log_binning=False):
        """Return the distribution of the values of this stat.

        Parameters
        ----------
        bins : int, list, or Numpy array
            The number of bins or the bin edges.
        bin_edges : bool
            Whether to also output the min and max of each bin,
            by default, False.
        density : bool
            Whether to normalize the resulting distribution.
        log_binning : bool
            Whether to bin the values with log-sized bins.
            By default, False.


        Returns
        -------
        Pandas DataFrame
            A two-column table with "bin_center" and "value" columns,
            where "value" is a count or a probability. If `bin_edges`
            is True, outputs two additional columns, `bin_lo` and `bin_hi`,
            which outputs the left and right bin edges respectively.

        Notes
        -----
        Originally from https://github.com/jkbren/networks-and-dataviz
        """

        # if there is one unique value and more than one bin is specified,
        # sets the number of bins to 1.
        if isinstance(bins, int) and len(set(self.aslist())) == 1:
            bins = 1

        return hist(self.asnumpy(), bins, bin_edges, density, log_binning)

    def max(self):
        """The maximum value of this stat."""
        return self.asnumpy().max(axis=0)

    def min(self):
        """The minimum value of this stat."""
        return self.asnumpy().min(axis=0)

    def sum(self):
        """The sum of this stat."""
        return self.asnumpy().sum(axis=0)

    def mean(self):
        """The arithmetic mean of this stat."""
        return self.asnumpy().mean(axis=0)

    def median(self):
        """The median of this stat."""
        return np.median(self.asnumpy(), axis=0)

    def std(self):
        """The standard deviation of this stat."""
        return self.asnumpy().std(axis=0)

    def var(self):
        """The variance of this stat."""
        return self.asnumpy().var(axis=0)

    def moment(self, order=2, center=False):
        """The statistical moments of this stat.

        Parameters
        ----------
        order : int (default 2)
            The order of the moment.
        center : bool (default False)
            Whether to compute the centered (True) or uncentered/raw (False) moment.

        """
        arr = self.asnumpy()
        return spmoment(arr, moment=order) if center else np.mean(arr**order)

    def argmin(self):
        """The ID corresponding to the minimum of the stat

        When the minimum value is not unique, returns first
        ID corresponding to the minimum value.

        Returns
        -------
        hashable
            The ID to which the minimum value corresponds.
        """
        d = self.asdict()
        return min(d, key=d.get)

    def argmax(self):
        """The ID corresponding to the maximum of the stat

        When the maximal value is not unique, returns first
        ID corresponding to the maximal value.

        Returns
        -------
        hashable
            The ID to which the maximum value corresponds.
        """
        d = self.asdict()
        return max(d, key=d.get)

    def argsort(self, reverse=False):
        """Get the list of IDs sorted by stat value.

        When values are not unique, the order of the IDs
        is preserved.

        Parameters
        ----------
        reverse : bool
            If False (default), sort in ascending order; if True, in descending
            order.

        Returns
        -------
        list
            The IDs sorted in ascending or descending order.
        """
        d = self.asdict()
        return sorted(d, key=d.get, reverse=reverse)


class NodeStat(IDStat):
    """An arbitrary node-quantity mapping.

    `NodeStat` objects represent a mapping that assigns a value to each node in a
    network.  For more details, see the `tutorial
    <https://xgi.readthedocs.io/en/stable/api/tutorials/Tutorial%206%20-%20Statistics.html>`_.

    """


class DiNodeStat(IDStat):
    """An arbitrary node-quantity mapping for directed networks.

    `DiNodeStat` objects represent a mapping that assigns a value to each node in a
    directed network.  For more details, see the `tutorial
    <https://xgi.readthedocs.io/en/stable/api/tutorials/Tutorial%206%20-%20Statistics.html>`_.

    """


class EdgeStat(IDStat):
    """An arbitrary edge-quantity mapping.

    `EdgeStat` objects represent a mapping that assigns a value to each edge in a
    network.  For more details, see the `tutorial
    <https://xgi.readthedocs.io/en/stable/api/tutorials/Tutorial%206%20-%20Statistics.html>`_.

    """


class DiEdgeStat(IDStat):
    """An arbitrary edge-quantity mapping for directed networks.

    `DiEdgeStat` objects represent a mapping that assigns a value to each edge in a
    directed network.  For more details, see the `tutorial
    <https://xgi.readthedocs.io/en/stable/api/tutorials/Tutorial%206%20-%20Statistics.html>`_.

    """


class MultiIDStat(IDStat):
    """Multiple mappings.

    Bundles several stats defined over the same view so that they can be output
    together.  Concrete subclasses must set `statsclass` and `statsmodule`.

    Parameters
    ----------
    network : object
        The network on which the stats are computed.
    view : object
        The view of IDs the stats are restricted to.
    stats : iterable of str or `statsclass` instances
        The stats to bundle.  A string is looked up by name in `statsmodule`.

    Raises
    ------
    TypeError
        If `stats` is a single stat object or a string instead of an iterable of
        stats, or if one of its elements is neither a string nor a `statsclass`
        instance.

    """

    statsclass = None
    """IDStat subclass to use."""

    statsmodule = None
    """Module in which to search for mappings."""

    def __init__(self, network, view, stats):
        # There is no single underlying function, hence func=None.
        super().__init__(network, view, None)
        if isinstance(stats, self.statsclass):
            name = self.statsclass.__name__
            raise TypeError(f"must pass an iterable of {name}, not a single {name}")
        elif isinstance(stats, str):
            # A str is iterable; reject it so it is not read as a list of
            # one-character stat names.
            raise TypeError(
                f"must pass an iterable of {self.statsclass.__name__}, not str"
            )
        self.stats = [self._get_stat(f) for f in stats]

    def _get_stat(self, s):
        """Turn a stat name or stat object into a `statsclass` instance."""
        if isinstance(s, str):
            return self.statsclass(self.net, self.view, getattr(self.statsmodule, s))
        elif isinstance(s, self.statsclass):
            return s
        else:
            # Report the type of the offending object.  Arbitrary objects have no
            # `__name__`, so reading it here would raise AttributeError instead of
            # the intended TypeError.
            raise TypeError(
                f"stat must be str or {self.statsclass.__name__}, "
                f"not {type(s).__name__}"
            )

    def __repr__(self):
        return (
            f"{self.__class__.__name__}"
            + "("
            + ", ".join(s.name for s in self.stats)
            + ")"
        )

    @property
    def name(self):
        """Name of this multi-stat: the bracketed list of its stats' names."""
        return "[" + ", ".join(s.name for s in self.stats) + "]"

    @property
    def _val(self):
        # Compute every stat once, then regroup the values by ID.
        result = {s.name: s.asdict() for s in self.stats}
        return {n: {s.name: result[s.name][n] for s in self.stats} for n in self.view}

    def asdict(self, inner=dict, transpose=False):
        """Output the stats as a dict of collections.

        Parameters
        ----------
        inner : dict (default) or list
            The type of the inner collections.  If dict (default), output a dict of
            dicts.  If list, output a dict of lists.
        transpose : bool (default False)
            By default, output a dict of dicts whose outer keys are the nodes and inner
            keys are the specified stats.  If True, the outer and inner keys are
            reversed.  Only used when `inner` is `dict`.

        Raises
        ------
        ValueError
            If `inner` is neither `dict` nor `list`.

        Examples
        --------
        >>> import xgi
        >>> H = xgi.Hypergraph([[1, 2, 3], [2, 3, 4, 5], [3, 4, 5]])
        >>> m = H.nodes.multi(['degree', 'clustering_coefficient'])
        >>> m.asdict() # doctest: +NORMALIZE_WHITESPACE
        {1: {'degree': 1, 'clustering_coefficient': 1.0},
         2: {'degree': 2, 'clustering_coefficient': 0.6666666666666666},
         3: {'degree': 3, 'clustering_coefficient': 0.6666666666666666},
         4: {'degree': 2, 'clustering_coefficient': 1.0},
         5: {'degree': 2, 'clustering_coefficient': 1.0}}
        >>> m.asdict(transpose=True) # doctest: +NORMALIZE_WHITESPACE
        {'degree': {1: 1, 2: 2, 3: 3, 4: 2, 5: 2},
        'clustering_coefficient': {1: 1.0,
        2: 0.6666666666666666,
        3: 0.6666666666666666,
        4: 1.0,
        5: 1.0}}
        """
        if inner is dict:
            if transpose:
                # The per-ID table is not needed here, so it is not computed.
                return {s.name: s.asdict() for s in self.stats}
            val = self._val
            return {n: val[n] for n in self.view}
        if inner is list:
            val = self._val
            return {n: list(val[n].values()) for n in self.view}
        raise ValueError(f"inner must be dict or list, not {inner!r}")

    def aslist(self, inner=list, transpose=False):
        """Output the stats as a list of collections.

        Parameters
        ----------
        inner : list (default) or dict
            The type of the inner collections.  If list (default), output a list of
            lists.  If dict, output a list of dicts.
        transpose : bool (default False)
            By default, output a list of lists where each inner list contains the stats
            of a single node.  If True, each inner list contains the values of a single
            stat of all nodes.  Only used when `inner` is `list`.

        Raises
        ------
        ValueError
            If `inner` is neither `list` nor `dict`.

        Examples
        --------
        >>> import xgi
        >>> H = xgi.Hypergraph([[1, 2, 3], [2, 3, 4, 5], [3, 4, 5]])
        >>> m = H.nodes.multi(['degree', 'clustering_coefficient'])
        >>> m.aslist()
        [[1, 1.0], [2, 0.6666666666666666], [3, 0.6666666666666666], [2, 1.0], [2, 1.0]]
        >>> m.aslist(transpose=True)
        [[1, 2, 3, 2, 2], [1.0, 0.6666666666666666, 0.6666666666666666, 1.0, 1.0]]
        """
        if inner is list:
            if transpose:
                # The per-ID table is not needed here, so it is not computed.
                return [s.aslist() for s in self.stats]
            val = self._val
            return [list(val[n].values()) for n in self.view]
        if inner is dict:
            val = self._val
            return [val[n] for n in self.view]
        raise ValueError(f"inner must be list or dict, not {inner!r}")

    def asnumpy(self):
        """Output the stats as a numpy array.

        Notes
        -----
        Equivalent to `np.array(self.aslist(inner=list))`.

        Examples
        --------
        >>> import xgi
        >>> H = xgi.Hypergraph([[1, 2, 3], [2, 3, 4, 5], [3, 4, 5]])
        >>> H.nodes.multi(['degree', 'clustering_coefficient']).asnumpy()
        ... # doctest: +NORMALIZE_WHITESPACE
        array([[1.        , 1.        ],
               [2.        , 0.66666667],
               [3.        , 0.66666667],
               [2.        , 1.        ],
               [2.        , 1.        ]])

        """
        return np.array(self.aslist(inner=list))

    def aspandas(self):
        """Output the stats as a pandas dataframe.

        Examples
        --------
        >>> import xgi
        >>> H = xgi.Hypergraph([[1, 2, 3], [2, 3, 4, 5], [3, 4, 5]])
        >>> H.nodes.multi(['degree', 'clustering_coefficient']).aspandas()
        ... # doctest: +NORMALIZE_WHITESPACE
           degree  clustering_coefficient
        1       1    1.000000
        2       2    0.666667
        3       3    0.666667
        4       2    1.000000
        5       2    1.000000

        """
        # One series per stat, named after the stat, joined as columns.
        result = {s.name: s._val for s in self.stats}
        series = [pd.Series(v, name=k) for k, v in result.items()]
        return pd.concat(series, axis=1)

    def ashist(self, bins=10, bin_edges=False, density=False, log_binning=False):
        """Return the distribution of the values of each stat.

        Parameters
        ----------
        bins : int, list, or Numpy array
            The number of bins or the bin edges.
        bin_edges : bool
            Whether to also output the min and max of each bin,
            by default, False.
        density : bool
            Whether to normalize the resulting distribution.
        log_binning : bool
            Whether to bin the values with log-sized bins.
            By default, False.


        Returns
        -------
        list of Pandas DataFrames
            Each entry of the list is a two-column table with "bin_center"
            and "value" columns, where "value" is a count or a probability.
            If `bin_edges` is True, outputs two additional columns,
            `bin_lo` and `bin_hi`, which outputs the left and right
            bin edges respectively.

        Notes
        -----
        Originally from https://github.com/jkbren/networks-and-dataviz

        """
        # Rows of the transposed array hold the values of one stat each.
        return [
            hist(data, bins, bin_edges, density, log_binning)
            for data in self.asnumpy().T
        ]


class MultiNodeStat(MultiIDStat):
    """Multiple node-quantity mappings.

    For more details, see the `tutorial
    <https://xgi.readthedocs.io/en/stable/api/tutorials/Tutorial%206%20-%20Statistics.html>`_.

    """

    # Individual stats are NodeStat objects; names are looked up in `nodestats`.
    statsclass = NodeStat
    statsmodule = nodestats


class MultiDiNodeStat(MultiIDStat):
    """Multiple node-quantity mappings for directed networks.

    For more details, see the `tutorial
    <https://xgi.readthedocs.io/en/stable/api/tutorials/Tutorial%206%20-%20Statistics.html>`_.

    """

    # Individual stats are DiNodeStat objects; names are looked up in `dinodestats`.
    statsclass = DiNodeStat
    statsmodule = dinodestats


class MultiEdgeStat(MultiIDStat):
    """Multiple edge-quantity mappings.

    For more details, see the `tutorial
    <https://xgi.readthedocs.io/en/stable/api/tutorials/Tutorial%206%20-%20Statistics.html>`_.

    """

    # Individual stats are EdgeStat objects; names are looked up in `edgestats`.
    statsclass = EdgeStat
    statsmodule = edgestats


class MultiDiEdgeStat(MultiIDStat):
    """Multiple edge-quantity mappings for directed networks.

    For more details, see the `tutorial
    <https://xgi.readthedocs.io/en/stable/api/tutorials/Tutorial%206%20-%20Statistics.html>`_.

    """

    # Individual stats are DiEdgeStat objects; names are looked up in `diedgestats`.
    statsclass = DiEdgeStat
    statsmodule = diedgestats


# Lookup table used by the dispatch helpers: for each kind of ID, the module that
# holds its stat functions and the classes wrapping one stat and several stats.
_dispatch_data = {
    kind: {"module": module, "statclass": single, "multistatclass": multi}
    for kind, module, single, multi in (
        ("node", nodestats, NodeStat, MultiNodeStat),
        ("dinode", dinodestats, DiNodeStat, MultiDiNodeStat),
        ("edge", edgestats, EdgeStat, MultiEdgeStat),
        ("diedge", diedgestats, DiEdgeStat, MultiDiEdgeStat),
    )
}


def dispatch_stat(kind, net, view, name):
    """Build the stat object called `name` for the given kind of ID.

    Parameters
    ----------
    kind : str
        A key of `_dispatch_data`: "node", "dinode", "edge" or "diedge".
    net : object
        The network, forwarded to the stat class.
    view : object
        The view of IDs, forwarded to the stat class.
    name : str
        Name of the stat function to look up in the module registered for `kind`.

    Returns
    -------
    IDStat
        An instance of the stat class registered for `kind`.

    Raises
    ------
    AttributeError
        If the module registered for `kind` has no attribute `name`.

    """
    try:
        entry = _dispatch_data[kind]
        stat_func = getattr(entry["module"], name)
    except AttributeError as e:
        raise AttributeError(f"Stat '{name}' not defined") from e
    return _dispatch_data[kind]["statclass"](net, view, stat_func)


def dispatch_many_stats(kind, net, view, stats):
    """Build the multi-stat object bundling `stats` for the given kind of ID.

    `kind` must be a key of `_dispatch_data`; the remaining arguments are forwarded
    unchanged to the multi-stat class registered for that kind.

    """
    multistat_cls = _dispatch_data[kind]["multistatclass"]
    return multistat_cls(net, view, stats)


def nodestat_func(func):
    """Decorate arbitrary functions to behave like :class:`NodeStat` objects.

    Parameters
    ----------
    func : callable
        Function or callable with signature `func(net, bunch)`, where `net` is the
        network and `bunch` is an iterable of nodes in `net`.  The call `func(net,
        bunch)` must return a dict with pairs of the form `(node: value)` where `node`
        is in `bunch` and `value` is the value of the statistic at `node`.

    Returns
    -------
    callable
        The decorated callable unmodified, after registering it in the `stats`
        framework.

    See Also
    --------
    :func:`edgestat_func`

    Notes
    -----
    The user must make sure that `func` is such that, if `res` is defined as `res =
    func(net, bunch)`, then `res` has keys in the same order as they are found in
    `bunch`.  Since python dicts preserve order, it is enough for `func` to create the
    returned dict by iterating over `bunch`.


    Examples
    --------
    >>> import xgi
    >>> H = xgi.Hypergraph([[1, 2], [3, 4], [4, 5, 6]])

    The following function defines a node-integer mapping.

    >>> def my_degree(net, bunch):
    ...     return {n: 10 * net.degree(n) for n in bunch}

    Node statistics can be called from the network or from the NodeView.

    >>> H.degree()
    {1: 1, 2: 1, 3: 1, 4: 2, 5: 1, 6: 1}
    >>> H.nodes.degree
    NodeStat('degree')

    However, `my_degree` is not recognized as a node statistic.

    >>> H.my_degree() # doctest: +ELLIPSIS
    Traceback (most recent call last):
    AttributeError:...

    >>> H.nodes.my_degree # doctest: +ELLIPSIS
    Traceback (most recent call last):
    AttributeError:...

    Use the `nodestat_func` decorator to turn `my_degree` into a valid stat.

    >>> original_my_degree = my_degree
    >>> my_degree = xgi.nodestat_func(my_degree)
    >>> H.my_degree()
    {1: 10, 2: 10, 3: 10, 4: 20, 5: 10, 6: 10}
    >>> H.nodes.my_degree
    NodeStat('my_degree')

    Now the entirety of the interface of stat objects is available.

    >>> H.nodes.filterby('my_degree', 20)
    NodeView((4,))
    >>> H.nodes.multi(['degree', 'my_degree']).aspandas()
       degree  my_degree
    1       1         10
    2       1         10
    3       1         10
    4       2         20
    5       1         10
    6       1         10

    Note the passed function is left unmodified.

    >>> my_degree is original_my_degree
    True

    The previous usage of `nodestat_func` is made for explanatory purposes.  A more
    typical use of `nodestat_func` is the following.

    >>> @xgi.nodestat_func
    ... def my_degree(net, bunch):
    ...     return {n: 10 * net.degree(n) for n in bunch}

    """
    # Registering means exposing the function as an attribute of the `nodestats`
    # module, under the function's own name.
    setattr(nodestats, func.__name__, func)
    return func


def dinodestat_func(func):
    """Decorate arbitrary functions to behave like :class:`DiNodeStat` objects.

    Works identically to :func:`nodestat_func`.  For extended documentation, see
    :func:`nodestat_func`.

    Parameters
    ----------
    func : callable
        Function or callable with signature `func(net, bunch)`, where `net` is the
        network and `bunch` is an iterable of nodes in `net`.  The call `func(net,
        bunch)` must return a dict with pairs of the form `(node: value)` where `node`
        is in `bunch` and `value` is the value of the statistic at `node`.

    Returns
    -------
    callable
        The decorated callable unmodified, after registering it in the `stats`
        framework.

    See Also
    --------
    :func:`nodestat_func`
    :func:`edgestat_func`
    :func:`diedgestat_func`

    """
    # Registering means exposing the function as an attribute of the `dinodestats`
    # module, under the function's own name.
    setattr(dinodestats, func.__name__, func)
    return func


def edgestat_func(func):
    """Decorate arbitrary functions to behave like :class:`EdgeStat` objects.

    Works identically to :func:`nodestat_func`.  For extended documentation, see
    :func:`nodestat_func`.

    Parameters
    ----------
    func : callable
        Function or callable with signature `func(net, bunch)`, where `net` is the
        network and `bunch` is an iterable of edges in `net`.  The call `func(net,
        bunch)` must return a dict with pairs of the form `(edge: value)` where `edge`
        is in `bunch` and `value` is the value of the statistic at `edge`.

    Returns
    -------
    callable
        The decorated callable unmodified, after registering it in the `stats`
        framework.

    See Also
    --------
    :func:`nodestat_func`
    :func:`dinodestat_func`
    :func:`diedgestat_func`

    """
    # This function used to be defined twice.  The first copy registered into
    # `dinodestats` and was shadowed by the second, so only this single definition,
    # which registers into `edgestats`, is kept.
    setattr(edgestats, func.__name__, func)
    return func


def diedgestat_func(func):
    """Decorate arbitrary functions to behave like :class:`DiEdgeStat` objects.

    Works identically to :func:`nodestat_func`.  For extended documentation, see
    :func:`nodestat_func`.

    Parameters
    ----------
    func : callable
        Function or callable with signature `func(net, bunch)`, where `net` is the
        network and `bunch` is an iterable of edges in `net`.  The call `func(net,
        bunch)` must return a dict with pairs of the form `(edge: value)` where `edge`
        is in `bunch` and `value` is the value of the statistic at `edge`.

    Returns
    -------
    callable
        The decorated callable unmodified, after registering it in the `stats`
        framework.

    See Also
    --------
    :func:`nodestat_func`
    :func:`dinodestat_func`
    :func:`edgestat_func`

    """
    # Registering means exposing the function as an attribute of the `diedgestats`
    # module, under the function's own name.
    setattr(diedgestats, func.__name__, func)
    return func