############################################################################
# Copyright ESIEE Paris (2018) #
# #
# Contributor(s) : Benjamin Perret #
# #
# Distributed under the terms of the CECILL-B License. #
# #
# The full license is in the file LICENSE, distributed with this software. #
############################################################################
import higra as hg
import numpy as np
def dendrogram_purity(tree, leaf_labels):
    r"""
    Weighted average of the purity of each node of the tree with respect to a ground truth
    labelization of the tree leaves.

    Let :math:`T` be a tree with leaves :math:`V=\{1, \ldots, n\}`.
    Let :math:`C=\{C_1, \ldots, C_K\}` be a partition of :math:`V` into :math:`k` (label) sets.

    The purity of a subset :math:`X` of :math:`V` with respect to class :math:`C_\ell\in C` is the fraction of
    elements of :math:`X` that belongs to class :math:`C_\ell`:

    .. math::

        pur(X, C_\ell) = \frac{| X \cap C_\ell |}{| X |}.

    The purity of :math:`T` is the defined as:

    .. math::

        pur(T) = \frac{1}{Z}\sum_{k=1}^{K}\sum_{x,y\in C_k, x\neq y} pur(lca_T(x,y), C_k)

    with :math:`Z=| \{\{x,y\} \subseteq V \mid x\neq y, \exists k, \{x,y\}\subseteq C_k\} |`.

    :See:

        Heller, Katherine A., and Zoubin Ghahramani. "`Bayesian hierarchical clustering <https://www2.stat.duke.edu/~kheller/bhcnew.pdf>`_ ."
        Proc. ICML. ACM, 2005.

    :Complexity:

        The dendrogram purity is computed in :math:`\mathcal{O}(N\times K \times C^2)` with :math:`N` the number of nodes
        in the tree, :math:`K` the number of classes, and :math:`C` the maximal number of children of a node in the tree.

    :param tree: input tree
    :param leaf_labels: a 1d integral array of length `tree.num_leaves()`
    :return: a score between 0 and 1 (higher is better)
    """
    if leaf_labels.ndim != 1 or leaf_labels.size != tree.num_leaves() or leaf_labels.dtype.kind != 'i':
        raise ValueError("leaf_labels must be a 1d integral array of length `tree.num_leaves()`")

    num_l = tree.num_leaves()
    # area[n] = number of leaves in the subtree rooted at node n
    area = hg.attribute_area(tree)

    # One-hot encode the leaf labels, then accumulate (sum) from leaves to root:
    # label_histo[n, k] = number of leaves of class k contained in node n.
    max_label = np.max(leaf_labels)
    num_labels = max_label + 1
    label_histo_leaves = np.zeros((num_l, num_labels), dtype=np.float64)
    label_histo_leaves[np.arange(num_l), leaf_labels] = 1
    label_histo = hg.accumulate_sequential(tree, label_histo_leaves, hg.Accumulators.sum)

    # class_purity[n, k] = pur(X_n, C_k): fraction of node n's leaves that belong to class k
    class_purity = label_histo / area[:, np.newaxis]

    # weights[n, k] = number of leaf pairs {x, y} of class k whose lowest common ancestor is n
    weights = hg.attribute_children_pair_sum_product(tree, label_histo)

    # Weighted average of the purity over the internal nodes (rows [num_l:] exclude leaves).
    total = np.sum(class_purity[num_l:, :] * weights[num_l:, :])
    return total / np.sum(weights[num_l:])
@hg.argument_helper(hg.CptHierarchy)
def dasgupta_cost(tree, edge_weights, leaf_graph, mode="dissimilarity"):
    r"""
    Dasgupta's cost is an unsupervised measure of the quality of a hierarchical clustering of an edge weighted graph.

    Let :math:`T` be a tree representing a hierarchical clustering of the graph :math:`G=(V, E)`.

    If :math:`w` is a dissimilarity function on the edges :math:`E` of the graph
    (:attr:`mode` is equal to ``"dissimilarity"``), then the Dasgupta's cost is defined as:

    .. math::

        dasgupta(T, V, E, w) = \sum_{\{x,y\}\in E} \frac{area(lca_T(x,y))}{w(\{x,y\})}

    where :math:`area` is the area of a node in the tree and :math:`lca_T` is the lowest common ancestor of two nodes.

    If :math:`w` is a similarity function on the edges :math:`E` of the graph
    (:attr:`mode` is equal to ``"similarity"``), then the Dasgupta's cost is defined as:

    .. math::

        dasgupta(T, V, E, w) = \sum_{\{x,y\}\in E} area(lca_T(x,y)) \times w(\{x,y\})

    :See:

        S. Dasgupta. "`A cost function for similarity-based hierarchical clustering <https://arxiv.org/pdf/1510.05043.pdf>`_ ."
        In Proc. STOC, pages 118–127, Cambridge, MA, USA, 2016

    :Complexity:

        The runtime complexity is :math:`\mathcal{O}(n\log(n) + m)` with :math:`n` the number of nodes in :math:`T` and
        :math:`m` the number of edges in :math:`E`.

    :param tree: Input tree
    :param edge_weights: Edge weights on the leaf graph of the input tree
    :param leaf_graph: Leaf graph of the input tree (deduced from :class:`~higra.CptHierarchy`)
    :param mode: "dissimilarity" or "similarity" (default: "dissimilarity")
    :return: a real number
    """
    # area[n] = number of leaves in the subtree rooted at node n
    area = hg.attribute_area(tree, leaf_graph=leaf_graph)

    # Preprocess once, then compute the lca of the two extremities of every
    # edge of the leaf graph in a single batched query.
    lcaf = tree.lowest_common_ancestor_preprocess()
    lca = lcaf.lca(leaf_graph)

    if mode == "dissimilarity":
        # NOTE(review): a zero dissimilarity edge weight yields a division by
        # zero here — presumably weights are assumed strictly positive.
        return np.sum(area[lca] / edge_weights)
    elif mode == "similarity":
        return np.sum(area[lca] * edge_weights)
    else:
        raise ValueError("Unknown mode: " + mode)
@hg.argument_helper(hg.CptHierarchy)
def tree_sampling_divergence(tree, edge_weights, leaf_graph):
    r"""
    Tree sampling divergence is an unsupervised measure of the quality of a hierarchical clustering of an
    edge weighted graph.
    It measures how well the given edge weighted graph can be reconstructed from the tree alone.
    It is equal to 0 if and only if the given graph can be fully recovered from the tree.

    It is defined as the Kullback-Leibler divergence between the edge sampling model :math:`p` and the independent
    (null) sampling model :math:`q` of the nodes of a tree (see :func:`~higra.attribute_tree_sampling_probability`).

    The tree sampling divergence on a tree :math:`T` is then

    .. math::

        TSD(T) = \sum_{x \in T} p(x) \log\frac{p(x)}{q(x)}

    The definition of the tree sampling divergence was proposed in:

        Charpentier, B. & Bonald, T. (2019). `"Tree Sampling Divergence: An Information-Theoretic Metric for \
        Hierarchical Graph Clustering." <https://hal.telecom-paristech.fr/hal-02144394/document>`_ Proceedings of IJCAI.

    :Complexity:

        The tree sampling divergence is computed in :math:`\mathcal{O}(N (\log(N) + C^2) + M)` with :math:`N` the number of
        nodes in the tree, :math:`M` the number of edges in the leaf graph, and :math:`C` the maximal number of children of
        a node in the tree.

    :param tree: Input tree
    :param edge_weights: Edge weights on the leaf graph (similarities)
    :param leaf_graph: Leaf graph of the input tree (deduced from :class:`~higra.CptHierarchy`)
    :return: a real number
    """
    num_l = tree.num_leaves()

    # Sampling probabilities of the internal nodes (rows [num_l:] drop the leaves)
    # under the edge model p and the independent (null) model q.
    p = hg.attribute_tree_sampling_probability(tree, leaf_graph, edge_weights, 'edge')[num_l:]
    q = hg.attribute_tree_sampling_probability(tree, leaf_graph, edge_weights, 'null')[num_l:]

    # Restrict the sum to nodes with p > 0: the p log(p/q) term of a
    # zero-probability node is 0 by convention, and skipping it avoids log(0).
    index, = np.where(p)
    return np.sum(p[index] * np.log(p[index] / q[index]))