Source code for qumin.clustering.algorithms
# !usr/bin/python3
# -*- coding: utf-8 -*-
"""Algorithms for inflection classes clustering.
Author: Sacha Beniamine
"""
import numpy as np
from . import find_microclasses
import logging
log = logging.getLogger()
[docs]
def choose(iterable):
"""Choose a random element in an iterable of iterable.
The iterable can have more than 1 dimension (but the choice will be
done on the first dimension)."""
i = np.random.choice(len(iterable), 1)
return iterable[i.item()]
[docs]
def log_classes(classes, md, suffix):
filename = md.get_path(suffix + ".txt")
log.info("Found %s %s", len(classes), suffix)
log.info("Printing log to %s", filename)
with open(filename, "w", encoding="utf-8") as flow:
for m in sorted(classes, key=lambda x: len(classes[x])):
flow.write("\n\n{} ({}) \n\t".format(m,
len(classes[m]))
+ ", ".join(classes[m]))
md.register_file(suffix + ".txt", description="Log of the macroclass computation")
[docs]
def hierarchical_clustering(patterns, paradigms, Clusters, **kwargs):
"""Perform hierarchical clustering on patterns according to a clustering algorithm and a measure.
This function ::
Finds microclasses.
Performs the clustering,
Finds the macroclasses (and exports them),
Returns the inflection class tree.
The clustering algorithm is the following::
Begin with one cluster per microclasses.
While there is more than one cluster :
Find the best possible merge of two clusters, among all possible pairs.
Perform this merge
Scoring, finding the best merges, merging nodes depends on the Clusters class.
Arguments:
patterns (patterns.ParadigmPatterns): alternation patterns
paradigms (paradigms.Paradigms): paradigms of forms
Clusters : a cluster class to use in clustering.
clustering_algorithm (Callable): a clustering algorithm.
kwargs: any keywords arguments to pass to Clusters. Some keywords are mandatory :
"md" should be the Metadata register, "patterns" should be a function for pattern finding
"""
# Clustering
microclasses = find_microclasses(paradigms, patterns)
clusters = Clusters(microclasses, patterns, **kwargs)
while len(clusters.nodes) > 1:
log.info("number of classes = %s", len(clusters.nodes))
possible_merges = clusters.find_ordered_merges()
a, b, score = choose(possible_merges)
clusters.merge(a, b)
node = clusters.rootnode()
# Export macroclasses
macroclasses = node.macroclasses()
if macroclasses:
log_classes(macroclasses, kwargs['md'], "macroclasses")
else:
log.warning("No macroclasses could be found "
" this is not necessarily a bug, but it is surprising !")
return node