Source code for xgi.stats

"""Statistics of networks, their nodes, and edges.

Any mapping that assigns some quantity to each node of a network is considered a node
statistic.  For example, the degree is a node-integer mapping, while a node attribute
that assigns a string label to each node is a node-string mapping.  The `stats` package
provides a common interface to all such mappings.

Each such mapping is accessible via the `H.nodes` view.  For example, the degree of all
nodes is available as `H.nodes.degree`, which supports type conversion using the `as*`
methods.

>>> import xgi
>>> H = xgi.Hypergraph([[1, 2, 3], [2, 3, 4, 5], [3, 4, 5]])
>>> H.nodes.degree.asdict()
{1: 1, 2: 2, 3: 3, 4: 2, 5: 2}
>>> H.nodes.degree.aslist()
[1, 2, 3, 2, 2]

Another feature is the ability to filter the nodes of a network by degree.

>>> H.nodes.filterby('degree', 2)
NodeView((2, 4, 5))

The power of the stats package is that any other node statistic that can be conceived of
as a node-quantity mapping is given the same interface.  For example, node attributes
get the same treatment:

>>> H.add_nodes_from([
...     (1, {"color": "red", "name": "horse"}),
...     (2, {"color": "blue", "name": "pony"}),
...     (3, {"color": "yellow", "name": "zebra"}),
...     (4, {"color": "red", "name": "orangutan", "age": 20}),
...     (5, {"color": "blue", "name": "fish", "age": 2}),
... ])
>>> H.nodes.attrs('color').asdict()
{1: 'red', 2: 'blue', 3: 'yellow', 4: 'red', 5: 'blue'}
>>> H.nodes.attrs('color').aslist()
['red', 'blue', 'yellow', 'red', 'blue']
>>> H.nodes.filterby_attr('color', 'red')
NodeView((1, 4))

Many other features are available, including edge statistics and user-defined
statistics.  For more details, see the `tutorial
<https://xgi.readthedocs.io/en/stable/api/tutorials/Tutorial%206%20-%20Statistics.html>`_.

"""

import numpy as np
import pandas as pd
from scipy.stats import moment as spmoment

from ..exception import IDNotFound
from ..utils import hist

from . import edgestats, diedgestats, dinodestats, nodestats

# Names exported by `from <this package> import *`: the four registration
# decorators and the two dispatch helpers defined in this module.
__all__ = [
    "nodestat_func",
    "edgestat_func",
    "dinodestat_func",
    "diedgestat_func",
    "dispatch_stat",
    "dispatch_many_stats",
]


class IDStat:
    """Mapping between nodes or edges and a quantity or property.

    Parameters
    ----------
    network : object
        The network on which the stat is evaluated.  It is passed as the first
        argument to `func`.
    view : iterable
        The view containing the IDs covered by this stat.  It must support
        iteration, `len`, membership tests, and expose its IDs through an `ids`
        attribute.
    func : callable
        Callable invoked as `func(network, ids, *args, **kwargs)`.  It must return a
        dict mapping each requested ID to its value.
    args : tuple, optional
        Extra positional arguments forwarded to `func`.  By default, none.
    kwargs : dict, optional
        Extra keyword arguments forwarded to `func`.  By default, none.

    """

    def __init__(self, network, view, func, args=None, kwargs=None):
        self.view = view
        self.net = network
        self.args = () if args is None else args
        # Each default is resolved on its own argument: `kwargs` must not be
        # discarded (or left as None) depending on whether `args` was given.
        self.kwargs = {} if kwargs is None else kwargs
        self.func = func

    def __call__(self, *args, **kwargs):
        """Return a new stat of the same class that forwards the given arguments."""
        return self.__class__(self.net, self.view, self.func, args=args, kwargs=kwargs)

    def __getitem__(self, idx):
        """Return the value of the stat at a single ID.

        Raises
        ------
        IDNotFound
            If `idx` is not in the view of this stat.
        """
        if idx not in self.view:
            raise IDNotFound(f'ID "{idx}" not in this view')
        # Evaluate the stat only for the requested ID.
        return self.func(self.net, [idx], *self.args, **self.kwargs)[idx]

    def __repr__(self):
        cls = self.__class__.__name__
        fnc = self.func.__name__
        out = f"{cls}('{fnc}'"
        if self.args:
            out += f", args={self.args}"
        if self.kwargs:
            out += f", kwargs={self.kwargs}"
        out += ")"
        return out

    def __len__(self):
        """Return the number of IDs in the view of this stat."""
        return len(self.view)

    @property
    def name(self):
        """Name of this stat.

        The name of a stat is used to populate the keys of dictionaries in `MultiStat`
        objects, as well as the names of columns of pandas dataframes.

        Examples
        --------
        >>> import xgi
        >>> H = xgi.Hypergraph([[1, 2, 3], [2, 3, 4, 5], [3, 4, 5]])
        >>> da, d3 = H.nodes.degree, H.nodes.degree(order=3)
        >>> da.name, d3.name
        ('degree', 'degree(order=3)')
        >>> H.nodes.multi([da, d3]).asdict(transpose=True).keys()
        dict_keys(['degree', 'degree(order=3)'])
        >>> H.nodes.multi([da, d3]).aspandas().columns
        Index(['degree', 'degree(order=3)'], dtype='object')

        """
        name = f"{self.func.__name__}"
        if self.args or self.kwargs:
            args = [f"{s}" for s in self.args]
            kwargs = [f"{k}={v}" for k, v in self.kwargs.items()]
            name += "(" + ", ".join(args + kwargs) + ")"
        return name

    def __iter__(self):
        """Iterate over `(ID, value)` pairs, as returned by the stat function."""
        return iter(self._val.items())

    def items(self):
        """Return the `(ID, value)` pairs of this stat."""
        return self._val.items()

    @property
    def _val(self):
        # Recomputed on every access: the stat function is called with all the
        # IDs of the view and the stored extra arguments.
        return self.func(self.net, self.view.ids, *self.args, **self.kwargs)

    def asdict(self):
        """Output the stat as a dict.

        Notes
        -----
        The stat function already returns a dict, so no type conversion is needed;
        the result is only re-keyed in the iteration order of the view.

        """
        val = self._val
        return {n: val[n] for n in self.view}

    def aslist(self):
        """Output the stat as a list, in the iteration order of the view."""
        val = self._val
        return [val[n] for n in self.view]

    def asnumpy(self):
        """Output the stat as a numpy array."""
        return np.array(self.aslist())

    def aspandas(self):
        """Output the stat as a pandas series.

        Notes
        -----
        The `name` attribute of the returned series is set using the `name` property.

        """
        return pd.Series(self._val, name=self.name)

    def ashist(self, bins=10, bin_edges=False, density=False, log_binning=False):
        """Return the distribution of the values of this stat.

        Parameters
        ----------
        bins : int, list, or Numpy array
            The number of bins or the bin edges.
        bin_edges : bool
            Whether to also output the min and max of each bin,
            by default, False.
        density : bool
            Whether to normalize the resulting distribution.
        log_binning : bool
            Whether to bin the values with log-sized bins.
            By default, False.


        Returns
        -------
        Pandas DataFrame
            A two-column table with "bin_center" and "value" columns,
            where "value" is a count or a probability. If `bin_edges`
            is True, outputs two additional columns, `bin_lo` and `bin_hi`,
            which outputs the left and right bin edges respectively.

        Notes
        -----
        Originally from https://github.com/jkbren/networks-and-dataviz
        """

        # if there is one unique value and more than one bin is specified,
        # sets the number of bins to 1.
        if isinstance(bins, int) and len(set(self.aslist())) == 1:
            bins = 1

        return hist(self.asnumpy(), bins, bin_edges, density, log_binning)

    def max(self):
        """The maximum value of this stat."""
        return self.asnumpy().max(axis=0)

    def min(self):
        """The minimum value of this stat."""
        return self.asnumpy().min(axis=0)

    def sum(self):
        """The sum of this stat."""
        return self.asnumpy().sum(axis=0)

    def mean(self):
        """The arithmetic mean of this stat."""
        return self.asnumpy().mean(axis=0)

    def median(self):
        """The median of this stat."""
        return np.median(self.asnumpy(), axis=0)

    def std(self):
        """The standard deviation of this stat."""
        return self.asnumpy().std(axis=0)

    def var(self):
        """The variance of this stat."""
        return self.asnumpy().var(axis=0)

    def moment(self, order=2, center=False):
        """The statistical moments of this stat.

        Parameters
        ----------
        order : int (default 2)
            The order of the moment.
        center : bool (default False)
            Whether to compute the centered (True) or uncentered/raw (False) moment.

        """
        arr = self.asnumpy()
        return spmoment(arr, moment=order) if center else np.mean(arr**order)

    def argmin(self):
        """The ID corresponding to the minimum of the stat

        When the minimum value is not unique, returns first
        ID corresponding to the minimum value.

        Returns
        -------
        hashable
            The ID to which the minimum value corresponds.
        """
        d = self.asdict()
        return min(d, key=d.get)

    def argmax(self):
        """The ID corresponding to the maximum of the stat

        When the maximal value is not unique, returns first
        ID corresponding to the maximal value.

        Returns
        -------
        hashable
            The ID to which the maximum value corresponds.
        """
        d = self.asdict()
        return max(d, key=d.get)

    def argsort(self, reverse=False):
        """Get the list of IDs sorted by stat value.

        When values are not unique, the order of the IDs
        is preserved.

        Parameters
        ----------
        reverse : bool
            If False (default), sort in ascending order; if True, sort in
            descending order.

        Returns
        -------
        list
            The IDs sorted in ascending or descending order.
        """
        d = self.asdict()
        return sorted(d, key=d.get, reverse=reverse)


class NodeStat(IDStat):
    """An arbitrary node-quantity mapping.

    `NodeStat` objects represent a mapping that assigns a value to each node in a
    network.  For more details, see the `tutorial
    <https://xgi.readthedocs.io/en/stable/api/tutorials/Tutorial%206%20-%20Statistics.html>`_.

    """
class DiNodeStat(IDStat):
    """An arbitrary node-quantity mapping.

    `DiNodeStat` objects represent a mapping that assigns a value to each node in a
    network.  For more details, see the `tutorial
    <https://xgi.readthedocs.io/en/stable/api/tutorials/Tutorial%206%20-%20Statistics.html>`_.

    """
class EdgeStat(IDStat):
    """An arbitrary edge-quantity mapping.

    `EdgeStat` objects represent a mapping that assigns a value to each edge in a
    network.  For more details, see the `tutorial
    <https://xgi.readthedocs.io/en/stable/api/tutorials/Tutorial%206%20-%20Statistics.html>`_.

    """
class DiEdgeStat(IDStat):
    """An arbitrary edge-quantity mapping.

    `DiEdgeStat` objects represent a mapping that assigns a value to each edge in a
    network.  For more details, see the `tutorial
    <https://xgi.readthedocs.io/en/stable/api/tutorials/Tutorial%206%20-%20Statistics.html>`_.

    """
class MultiIDStat(IDStat):
    """Multiple mappings.

    Parameters
    ----------
    network : object
        The network on which the stats are evaluated.
    view : iterable
        The view containing the IDs covered by the stats.
    stats : iterable of str or `statsclass` instances
        The stats to bundle.  Each string is looked up by name in `statsmodule`.

    Raises
    ------
    TypeError
        If `stats` is a single `statsclass` instance or a single string rather
        than an iterable of them, or if an element is neither a string nor a
        `statsclass` instance.

    """

    statsclass = None
    """IDStat subclass to use."""
    statsmodule = None
    """Module in which to search for mappings."""

    def __init__(self, network, view, stats):
        super().__init__(network, view, None)
        if isinstance(stats, self.statsclass):
            name = self.statsclass.__name__
            raise TypeError(f"must pass an iterable of {name}, not a single {name}")
        elif isinstance(stats, str):
            # A str is itself iterable; reject it explicitly so it is not
            # interpreted as a sequence of one-character stat names.
            raise TypeError(
                f"must pass an iterable of {self.statsclass.__name__}, not str"
            )
        self.stats = [self._get_stat(f) for f in stats]

    def _get_stat(self, s):
        """Return `s` as a `statsclass` instance, resolving names in `statsmodule`."""
        if isinstance(s, str):
            return self.statsclass(self.net, self.view, getattr(self.statsmodule, s))
        elif isinstance(s, self.statsclass):
            return s
        else:
            # Most invalid inputs (ints, lists, ...) have no `__name__`; fall back
            # to their repr so that the intended TypeError is the one raised.
            label = getattr(s, "__name__", repr(s))
            raise TypeError(f"{label} must be str or {self.statsclass.__name__}")

    def __repr__(self):
        return (
            f"{self.__class__.__name__}"
            + "("
            + ", ".join(s.name for s in self.stats)
            + ")"
        )

    @property
    def name(self):
        """Names of the bundled stats, comma-separated and enclosed in brackets."""
        return "[" + ", ".join(s.name for s in self.stats) + "]"

    @property
    def _val(self):
        # Evaluate every stat once, then regroup the values by ID:
        # {ID: {stat name: value}}.
        result = {s.name: s.asdict() for s in self.stats}
        return {n: {s.name: result[s.name][n] for s in self.stats} for n in self.view}

    def asdict(self, inner=dict, transpose=False):
        """Output the stats as a dict of collections.

        Parameters
        ----------
        inner : dict (default) or list
            The type of the inner collections.  If dict (default), output a dict of
            dicts.  If list, output a dict of lists.
        transpose : bool (default False)
            By default, output a dict of dicts whose outer keys are the nodes and
            inner keys are the specified stats.  If True, the outer and inner keys
            are reversed.  Only used when `inner` is `dict`.

        Raises
        ------
        ValueError
            If `inner` is neither `dict` nor `list`.

        Examples
        --------
        >>> import xgi
        >>> H = xgi.Hypergraph([[1, 2, 3], [2, 3, 4, 5], [3, 4, 5]])
        >>> m = H.nodes.multi(['degree', 'clustering_coefficient'])
        >>> m.asdict() # doctest: +NORMALIZE_WHITESPACE
        {1: {'degree': 1, 'clustering_coefficient': 1.0},
         2: {'degree': 2, 'clustering_coefficient': 0.6666666666666666},
         3: {'degree': 3, 'clustering_coefficient': 0.6666666666666666},
         4: {'degree': 2, 'clustering_coefficient': 1.0},
         5: {'degree': 2, 'clustering_coefficient': 1.0}}
        >>> m.asdict(transpose=True) # doctest: +NORMALIZE_WHITESPACE
        {'degree': {1: 1, 2: 2, 3: 3, 4: 2, 5: 2},
         'clustering_coefficient': {1: 1.0,
          2: 0.6666666666666666,
          3: 0.6666666666666666,
          4: 1.0,
          5: 1.0}}

        """
        if inner is dict:
            if transpose:
                # The per-ID regrouping of `_val` is not needed here.
                return {s.name: s.asdict() for s in self.stats}
            val = self._val
            return {n: val[n] for n in self.view}
        if inner is list:
            val = self._val
            return {n: list(val[n].values()) for n in self.view}
        raise ValueError(f"inner must be dict or list, not {inner!r}")

    def aslist(self, inner=list, transpose=False):
        """Output the stats as a list of collections.

        Parameters
        ----------
        inner : list (default) or dict
            The type of the inner collections.  If list (default), output a list of
            lists.  If dict, output a list of dicts.
        transpose : bool (default False)
            By default, output a list of lists where each inner list contains the
            stats of a single node.  If True, each inner list contains the values of
            a single stat of all nodes.  Only used when `inner` is `list`.

        Raises
        ------
        ValueError
            If `inner` is neither `list` nor `dict`.

        Examples
        --------
        >>> import xgi
        >>> H = xgi.Hypergraph([[1, 2, 3], [2, 3, 4, 5], [3, 4, 5]])
        >>> m = H.nodes.multi(['degree', 'clustering_coefficient'])
        >>> m.aslist() # doctest: +NORMALIZE_WHITESPACE
        [[1, 1.0],
         [2, 0.6666666666666666],
         [3, 0.6666666666666666],
         [2, 1.0],
         [2, 1.0]]
        >>> m.aslist(transpose=True) # doctest: +NORMALIZE_WHITESPACE
        [[1, 2, 3, 2, 2],
         [1.0, 0.6666666666666666, 0.6666666666666666, 1.0, 1.0]]

        """
        if inner is list:
            if transpose:
                # The per-ID regrouping of `_val` is not needed here.
                return [s.aslist() for s in self.stats]
            val = self._val
            return [list(val[n].values()) for n in self.view]
        if inner is dict:
            val = self._val
            return [val[n] for n in self.view]
        raise ValueError(f"inner must be list or dict, not {inner!r}")

    def asnumpy(self):
        """Output the stats as a numpy array.

        Notes
        -----
        Equivalent to `np.array(self.aslist(inner=list))`.

        Examples
        --------
        >>> import xgi
        >>> H = xgi.Hypergraph([[1, 2, 3], [2, 3, 4, 5], [3, 4, 5]])
        >>> H.nodes.multi(['degree', 'clustering_coefficient']).asnumpy()
        ... # doctest: +NORMALIZE_WHITESPACE
        array([[1.        , 1.        ],
               [2.        , 0.66666667],
               [3.        , 0.66666667],
               [2.        , 1.        ],
               [2.        , 1.        ]])

        """
        return np.array(self.aslist(inner=list))

    def aspandas(self):
        """Output the stats as a pandas dataframe.

        Examples
        --------
        >>> import xgi
        >>> H = xgi.Hypergraph([[1, 2, 3], [2, 3, 4, 5], [3, 4, 5]])
        >>> H.nodes.multi(['degree', 'clustering_coefficient']).aspandas()
        ... # doctest: +NORMALIZE_WHITESPACE
           degree  clustering_coefficient
        1       1                1.000000
        2       2                0.666667
        3       3                0.666667
        4       2                1.000000
        5       2                1.000000

        """
        # One named series per stat, placed side by side as columns.
        result = {s.name: s._val for s in self.stats}
        series = [pd.Series(v, name=k) for k, v in result.items()]
        return pd.concat(series, axis=1)

    def ashist(self, bins=10, bin_edges=False, density=False, log_binning=False):
        """Return the distribution of the values of each stat.

        Parameters
        ----------
        bins : int, list, or Numpy array
            The number of bins or the bin edges.
        bin_edges : bool
            Whether to also output the min and max of each bin,
            by default, False.
        density : bool
            Whether to normalize the resulting distribution.
        log_binning : bool
            Whether to bin the values with log-sized bins.
            By default, False.

        Returns
        -------
        list of Pandas DataFrames
            Each entry of the list is a two-column table with "bin_center" and
            "value" columns, where "value" is a count or a probability.  If
            `bin_edges` is True, outputs two additional columns, `bin_lo` and
            `bin_hi`, which outputs the left and right bin edges respectively.

        Notes
        -----
        Originally from https://github.com/jkbren/networks-and-dataviz
        """
        # Transposing turns each row into the values of one stat over all IDs.
        return [
            hist(data, bins, bin_edges, density, log_binning)
            for data in self.asnumpy().T
        ]
class MultiNodeStat(MultiIDStat):
    """Multiple node-quantity mappings.

    For more details, see the `tutorial
    <https://xgi.readthedocs.io/en/stable/api/tutorials/Tutorial%206%20-%20Statistics.html>`_.

    """

    # Stats are `NodeStat` objects; names are resolved in the `nodestats` module.
    statsclass = NodeStat
    statsmodule = nodestats
class MultiDiNodeStat(MultiIDStat):
    """Multiple node-quantity mappings.

    For more details, see the `tutorial
    <https://xgi.readthedocs.io/en/stable/api/tutorials/Tutorial%206%20-%20Statistics.html>`_.

    """

    # Stats are `DiNodeStat` objects; names are resolved in the `dinodestats` module.
    statsclass = DiNodeStat
    statsmodule = dinodestats
class MultiEdgeStat(MultiIDStat):
    """Multiple edge-quantity mappings.

    For more details, see the `tutorial
    <https://xgi.readthedocs.io/en/stable/api/tutorials/Tutorial%206%20-%20Statistics.html>`_.

    """

    # Stats are `EdgeStat` objects; names are resolved in the `edgestats` module.
    statsclass = EdgeStat
    statsmodule = edgestats
class MultiDiEdgeStat(MultiIDStat):
    """Multiple edge-quantity mappings.

    For more details, see the `tutorial
    <https://xgi.readthedocs.io/en/stable/api/tutorials/Tutorial%206%20-%20Statistics.html>`_.

    """

    # Stats are `DiEdgeStat` objects; names are resolved in the `diedgestats` module.
    statsclass = DiEdgeStat
    statsmodule = diedgestats
# For each kind of ID, the module in which stat functions are looked up and the
# classes used to wrap a single stat and a collection of stats.
_dispatch_data = {
    "node": {
        "module": nodestats,
        "statclass": NodeStat,
        "multistatclass": MultiNodeStat,
    },
    "dinode": {
        "module": dinodestats,
        "statclass": DiNodeStat,
        "multistatclass": MultiDiNodeStat,
    },
    "edge": {
        "module": edgestats,
        "statclass": EdgeStat,
        "multistatclass": MultiEdgeStat,
    },
    "diedge": {
        "module": diedgestats,
        "statclass": DiEdgeStat,
        "multistatclass": MultiDiEdgeStat,
    },
}


def dispatch_stat(kind, net, view, name):
    """Build the stat object called `name` for the given kind of ID.

    Parameters
    ----------
    kind : str
        One of the keys of `_dispatch_data`: "node", "dinode", "edge" or "diedge".
    net : object
        The network, passed on to the stat class.
    view : object
        The view of IDs, passed on to the stat class.
    name : str
        Name of the stat function, looked up in the module registered for `kind`.

    Returns
    -------
    IDStat
        An instance of the stat class registered for `kind`, wrapping the function
        found under `name`.

    Raises
    ------
    AttributeError
        If the module registered for `kind` has no attribute called `name`.
    KeyError
        If `kind` is not a key of `_dispatch_data`.
    """
    try:
        func = getattr(_dispatch_data[kind]["module"], name)
    except AttributeError as e:
        raise AttributeError(f"Stat '{name}' not defined") from e
    return _dispatch_data[kind]["statclass"](net, view, func)


def dispatch_many_stats(kind, net, view, stats):
    """Build the multi-stat object bundling `stats` for the given kind of ID.

    Parameters
    ----------
    kind : str
        One of the keys of `_dispatch_data`: "node", "dinode", "edge" or "diedge".
    net : object
        The network, passed on to the multi-stat class.
    view : object
        The view of IDs, passed on to the multi-stat class.
    stats : iterable
        The stats to bundle, passed on to the multi-stat class.

    Returns
    -------
    MultiIDStat
        An instance of the multi-stat class registered for `kind`.

    Raises
    ------
    KeyError
        If `kind` is not a key of `_dispatch_data`.
    """
    return _dispatch_data[kind]["multistatclass"](net, view, stats)
def nodestat_func(func):
    """Decorate arbitrary functions to behave like :class:`NodeStat` objects.

    Parameters
    ----------
    func : callable
        Function or callable with signature `func(net, bunch)`, where `net` is the
        network and `bunch` is an iterable of nodes in `net`.  The call `func(net,
        bunch)` must return a dict with pairs of the form `(node: value)` where `node`
        is in `bunch` and `value` is the value of the statistic at `node`.

    Returns
    -------
    callable
        The decorated callable unmodified, after registering it in the `stats`
        framework.

    See Also
    --------
    :func:`edgestat_func`

    Notes
    -----
    The user must make sure that `func` is such that, if `res` is defined as `res =
    func(net, bunch)`, then `res` has keys in the same order as they are found in
    `bunch`.  Since python dicts preserve order, it is enough for `func` to create
    the returned dict by iterating over `bunch`.

    Examples
    --------
    >>> import xgi
    >>> H = xgi.Hypergraph([[1, 2], [3, 4], [4, 5, 6]])

    The following function defines a node-integer mapping.

    >>> def my_degree(net, bunch):
    ...     return {n: 10 * net.degree(n) for n in bunch}

    Node statistics can be called from the network or from the NodeView.

    >>> H.degree()
    {1: 1, 2: 1, 3: 1, 4: 2, 5: 1, 6: 1}
    >>> H.nodes.degree
    NodeStat('degree')

    However, `my_degree` is not recognized as a node statistic.

    >>> H.my_degree() # doctest: +ELLIPSIS
    Traceback (most recent call last):
    AttributeError:...
    >>> H.nodes.my_degree # doctest: +ELLIPSIS
    Traceback (most recent call last):
    AttributeError:...

    Use the `nodestat_func` decorator to turn `my_degree` into a valid stat.

    >>> original_my_degree = my_degree
    >>> my_degree = xgi.nodestat_func(my_degree)
    >>> H.my_degree()
    {1: 10, 2: 10, 3: 10, 4: 20, 5: 10, 6: 10}
    >>> H.nodes.my_degree
    NodeStat('my_degree')

    Now the entirety of the interface of stat objects is available.

    >>> H.nodes.filterby('my_degree', 20)
    NodeView((4,))
    >>> H.nodes.multi(['degree', 'my_degree']).aspandas()
    ... # doctest: +NORMALIZE_WHITESPACE
       degree  my_degree
    1       1         10
    2       1         10
    3       1         10
    4       2         20
    5       1         10
    6       1         10

    Note the passed function is left unmodified.

    >>> my_degree is original_my_degree
    True

    The previous usage of `nodestat_func` is made for explanatory purposes.  A more
    typical use of `nodestat_func` is the following.

    >>> @xgi.nodestat_func
    ... def my_degree(net, bunch):
    ...     return {n: 10 * net.degree(n) for n in bunch}

    """
    # Registration is simply making the function an attribute of the
    # `nodestats` module, under the function's own name.
    setattr(nodestats, func.__name__, func)
    return func
def dinodestat_func(func):
    """Decorator that allows arbitrary functions to behave like :class:`DiNodeStat` objects.

    Works identically to :func:`nodestat_func`.  For extended documentation, see
    :func:`nodestat_func`.

    Parameters
    ----------
    func : callable
        Function or callable with signature `func(net, bunch)`, where `net` is the
        network and `bunch` is an iterable of nodes in `net`.  The call `func(net,
        bunch)` must return a dict with pairs of the form `(node: value)` where `node`
        is in `bunch` and `value` is the value of the statistic at `node`.

    Returns
    -------
    callable
        The decorated callable unmodified, after registering it in the `stats`
        framework.

    See Also
    --------
    :func:`nodestat_func`
    :func:`edgestat_func`
    :func:`diedgestat_func`

    """
    # Registration is simply making the function an attribute of the
    # `dinodestats` module, under the function's own name.
    setattr(dinodestats, func.__name__, func)
    return func
def edgestat_func(func):
    """Decorator that allows arbitrary functions to behave like :class:`EdgeStat` objects.

    Works identically to :func:`nodestat_func`.  For extended documentation, see
    :func:`nodestat_func`.

    Parameters
    ----------
    func : callable
        Function or callable with signature `func(net, bunch)`, where `net` is the
        network and `bunch` is an iterable of edges in `net`.  The call `func(net,
        bunch)` must return a dict with pairs of the form `(edge: value)` where `edge`
        is in `bunch` and `value` is the value of the statistic at `edge`.

    Returns
    -------
    callable
        The decorated callable unmodified, after registering it in the `stats`
        framework.

    See Also
    --------
    :func:`nodestat_func`
    :func:`dinodestat_func`
    :func:`diedgestat_func`

    """
    # NOTE(review): this module defines `edgestat_func` a second time further
    # down, and that later definition is the one bound at import time.  This
    # copy is kept consistent with it: edge stats are registered in `edgestats`
    # (it previously wrote to `dinodestats`).
    setattr(edgestats, func.__name__, func)
    return func
def edgestat_func(func):
    """Decorate arbitrary functions to behave like :class:`EdgeStat` objects.

    Works identically to :func:`nodestat_func`.  For extended documentation, see
    :func:`nodestat_func`.

    Parameters
    ----------
    func : callable
        Function or callable with signature `func(net, bunch)`, where `net` is the
        network and `bunch` is an iterable of edges in `net`.  The call `func(net,
        bunch)` must return a dict with pairs of the form `(edge: value)` where `edge`
        is in `bunch` and `value` is the value of the statistic at `edge`.

    Returns
    -------
    callable
        The decorated callable unmodified, after registering it in the `stats`
        framework.

    See Also
    --------
    :func:`nodestat_func`

    """
    # Registration is simply making the function an attribute of the
    # `edgestats` module, under the function's own name.
    setattr(edgestats, func.__name__, func)
    return func
def diedgestat_func(func):
    """Decorator that allows arbitrary functions to behave like :class:`DiEdgeStat` objects.

    Works identically to :func:`nodestat_func`.  For extended documentation, see
    :func:`nodestat_func`.

    Parameters
    ----------
    func : callable
        Function or callable with signature `func(net, bunch)`, where `net` is the
        network and `bunch` is an iterable of edges in `net`.  The call `func(net,
        bunch)` must return a dict with pairs of the form `(edge: value)` where `edge`
        is in `bunch` and `value` is the value of the statistic at `edge`.

    Returns
    -------
    callable
        The decorated callable unmodified, after registering it in the `stats`
        framework.

    See Also
    --------
    :func:`nodestat_func`
    :func:`dinodestat_func`
    :func:`edgestat_func`

    """
    # Registration is simply making the function an attribute of the
    # `diedgestats` module, under the function's own name.
    setattr(diedgestats, func.__name__, func)
    return func