############################################################################
# Copyright ESIEE Paris (2018) #
# #
# Contributor(s) : Benjamin Perret #
# #
# Distributed under the terms of the CECILL-B License. #
# #
# The full license is in the file LICENSE, distributed with this software. #
############################################################################
import higra as hg
import numpy as np
def dendrogram_purity(tree, leaf_labels):
    r"""
    Weighted average of the purity of each node of the tree with respect to a ground truth
    labelization of the tree leaves.

    Let :math:`T` be a tree with leaves :math:`V=\{1, \ldots, n\}`.
    Let :math:`C=\{C_1, \ldots, C_K\}` be a partition of :math:`V` into :math:`k` (label) sets.

    The purity of a subset :math:`X` of :math:`V` with respect to class :math:`C_\ell\in C` is the fraction of
    elements of :math:`X` that belongs to class :math:`C_\ell`:

    .. math::

        pur(X, C_\ell) = \frac{| X \cap C_\ell |}{| X |}.

    The purity of :math:`T` is the defined as:

    .. math::

        pur(T) = \frac{1}{Z}\sum_{k=1}^{K}\sum_{x,y\in C_k, x\neq y} pur(lca_T(x,y), C_k)

    with :math:`Z=| \{\{x,y\} \subseteq V \mid x\neq y, \exists k, \{x,y\}\subseteq C_k\} |`.

    :See:

        Heller, Katherine A., and Zoubin Ghahramani. "`Bayesian hierarchical clustering <https://www2.stat.duke.edu/~kheller/bhcnew.pdf>`_ ."
        Proc. ICML. ACM, 2005.

    :Complexity:

        The dendrogram purity is computed in :math:`\mathcal{O}(N\times K \times C^2)` with :math:`N` the number of nodes
        in the tree, :math:`K` the number of classes, and :math:`C` the maximal number of children of a node in the tree.

    :param tree: input tree
    :param leaf_labels: a 1d integral array of length `tree.num_leaves()`
    :return: a score between 0 and 1 (higher is better)
    """
    if leaf_labels.ndim != 1 or leaf_labels.size != tree.num_leaves() or leaf_labels.dtype.kind != 'i':
        raise ValueError("leaf_labels must be a 1d integral array of length `tree.num_leaves()`")

    num_l = tree.num_leaves()
    # area[n] = number of leaves in the subtree rooted at node n
    area = hg.attribute_area(tree)

    # One-hot encode the leaf labels, then accumulate (sum) from leaves to root:
    # label_histo[n, k] = number of leaves of class k contained in node n.
    max_label = np.max(leaf_labels)
    num_labels = max_label + 1
    label_histo_leaves = np.zeros((num_l, num_labels), dtype=np.float64)
    label_histo_leaves[np.arange(num_l), leaf_labels] = 1
    label_histo = hg.accumulate_sequential(tree, label_histo_leaves, hg.Accumulators.sum)

    # class_purity[n, k] = pur(X_n, C_k): fraction of node n's leaves that belong to class k
    class_purity = label_histo / area[:, np.newaxis]

    # weights[n, k] = number of leaf pairs {x, y} of class k whose lowest common ancestor is n
    weights = hg.attribute_children_pair_sum_product(tree, label_histo)

    # Weighted average of the purity over the internal nodes (rows [num_l:] exclude leaves).
    total = np.sum(class_purity[num_l:, :] * weights[num_l:, :])
    return total / np.sum(weights[num_l:])
@hg.argument_helper(hg.CptHierarchy)
def dasgupta_cost(tree, edge_weights, leaf_graph, mode="dissimilarity"):
    r"""
    Dasgupta's cost is an unsupervised measure of the quality of a hierarchical clustering of an edge weighted graph.

    Let :math:`T` be a tree representing a hierarchical clustering of the graph :math:`G=(V, E)`.

    If :math:`w` is a dissimilarity function on the edges :math:`E` of the graph
    (:attr:`mode` is equal to ``"dissimilarity"``), then the Dasgupta's cost is defined as:

    .. math::

        dasgupta(T, V, E, w) = \sum_{\{x,y\}\in E} \frac{area(lca_T(x,y))}{w(\{x,y\})}

    where :math:`area` is the area of a node in the tree and :math:`lca_T` is the lowest common ancestor of two nodes.

    If :math:`w` is a similarity function on the edges :math:`E` of the graph
    (:attr:`mode` is equal to ``"similarity"``), then the Dasgupta's cost is defined as:

    .. math::

        dasgupta(T, V, E, w) = \sum_{\{x,y\}\in E} area(lca_T(x,y)) \times w(\{x,y\})

    :See:

        S. Dasgupta. "`A cost function for similarity-based hierarchical clustering <https://arxiv.org/pdf/1510.05043.pdf>`_ ."
        In Proc. STOC, pages 118–127, Cambridge, MA, USA, 2016

    :Complexity:

        The runtime complexity is :math:`\mathcal{O}(n\log(n) + m)` with :math:`n` the number of nodes in :math:`T` and
        :math:`m` the number of edges in :math:`E`.

    :param tree: Input tree
    :param edge_weights: Edge weights on the leaf graph of the input tree
    :param leaf_graph: Leaf graph of the input tree (deduced from :class:`~higra.CptHierarchy`)
    :param mode: "dissimilarity" or "similarity" (default: "dissimilarity")
    :return: a real number
    """
    # area[n] = number of leaves in the subtree rooted at node n
    area = hg.attribute_area(tree, leaf_graph=leaf_graph)

    # Preprocess once, then compute the lca of the two extremities of every
    # edge of the leaf graph in a single batched query.
    lcaf = tree.lowest_common_ancestor_preprocess()
    lca = lcaf.lca(leaf_graph)

    if mode == "dissimilarity":
        # NOTE(review): a zero dissimilarity edge weight yields a division by
        # zero here — presumably weights are assumed strictly positive.
        return np.sum(area[lca] / edge_weights)
    elif mode == "similarity":
        return np.sum(area[lca] * edge_weights)
    else:
        raise ValueError("Unknown mode: " + mode)
@hg.argument_helper(hg.CptHierarchy)
def tree_sampling_divergence(tree, edge_weights, leaf_graph):
    r"""
    Tree sampling divergence is an unsupervised measure of the quality of a hierarchical clustering of an
    edge weighted graph.
    It measures how well the given edge weighted graph can be reconstructed from the tree alone.
    It is equal to 0 if and only if the given graph can be fully recovered from the tree.

    It is defined as the Kullback-Leibler divergence between the edge sampling model :math:`p` and the independent
    (null) sampling model :math:`q` of the nodes of a tree (see :func:`~higra.attribute_tree_sampling_probability`).

    The tree sampling divergence on a tree :math:`T` is then

    .. math::

        TSD(T) = \sum_{x \in T} p(x) \log\frac{p(x)}{q(x)}

    The definition of the tree sampling divergence was proposed in:

        Charpentier, B. & Bonald, T. (2019). `"Tree Sampling Divergence: An Information-Theoretic Metric for \
        Hierarchical Graph Clustering." <https://hal.telecom-paristech.fr/hal-02144394/document>`_ Proceedings of IJCAI.

    :Complexity:

        The tree sampling divergence is computed in :math:`\mathcal{O}(N (\log(N) + C^2) + M)` with :math:`N` the number of
        nodes in the tree, :math:`M` the number of edges in the leaf graph, and :math:`C` the maximal number of children of
        a node in the tree.

    :param tree: Input tree
    :param edge_weights: Edge weights on the leaf graph (similarities)
    :param leaf_graph: Leaf graph of the input tree (deduced from :class:`~higra.CptHierarchy`)
    :return: a real number
    """
    num_l = tree.num_leaves()

    # Sampling probabilities of the internal nodes (rows [num_l:] drop the leaves)
    # under the edge model p and the independent (null) model q.
    p = hg.attribute_tree_sampling_probability(tree, leaf_graph, edge_weights, 'edge')[num_l:]
    q = hg.attribute_tree_sampling_probability(tree, leaf_graph, edge_weights, 'null')[num_l:]

    # Restrict the sum to nodes with p > 0: the p log(p/q) term of a
    # zero-probability node is 0 by convention, and skipping it avoids log(0).
    index, = np.where(p)
    return np.sum(p[index] * np.log(p[index] / q[index]))