Source code for qumin.clustering
# -*- coding: utf-8 -*-
# !/usr/bin/env python3
from collections import deque

import numpy as np
import pandas as pd
[docs]
def find_microclasses(paradigms, patterns, freqs=None):
    """Group lexemes into microclasses and pick one exemplar per class.

    Two lexemes land in the same microclass when, for every table in
    ``patterns``, they have the same sorted collection of pattern strings.
    Working on one exemplar per microclass lets later computations skip
    lexemes that behave identically.

    Arguments:
        paradigms: object whose ``data`` attribute is a table with a
            ``lexeme`` column; its unique values are the lexemes to classify.
        patterns (dict): mapping whose values are pandas.DataFrame objects
            with a ``lexeme`` column and a ``pattern`` column.
        freqs (pandas.Series): optional frequencies indexed by lexeme.
            When given, the exemplar of a class is its most frequent member;
            otherwise it is the member with the shortest name
            (the first one on ties).

    Return:
        microclasses (dict).
            Keys are exemplars, values are the lists of lexemes that share
            the exemplar's patterns (the exemplar included).
            Each exemplar represents a microclass. ::

                {"a":["a","A","aa"], "b":["b","B","BBB"]}
    """
    def signature(lexeme_patterns):
        # Order-insensitive, hashable summary of one lexeme's patterns;
        # missing patterns (None) are left out.
        return tuple(sorted(str(p) for p in lexeme_patterns if p is not None))

    all_lexemes = pd.Series(index=paradigms.data.lexeme.unique())

    # One grouping key per pattern table: lexeme -> pattern signature.
    grouping_keys = []
    for table in patterns.values():
        by_lexeme = table.groupby('lexeme', observed=False).pattern
        grouping_keys.append(by_lexeme.apply(signature))

    microclasses = {}
    for _, same_behaviour in all_lexemes.groupby(grouping_keys):
        names = list(same_behaviour.index)
        if freqs is None:
            chosen = min(names, key=len)
        else:
            counts = freqs[same_behaviour.index]
            chosen = counts.index[counts.argmax()]
        microclasses[chosen] = names
    return microclasses
[docs]
def find_min_attribute(tree, attr):
    """Find the minimum value for an attribute in a tree.

    The tree is walked breadth-first from ``tree``. Values are compared
    through ``float()``, so attributes stored as numeric strings are
    supported, but the value returned is the one stored on the node,
    unconverted.

    Arguments:
        tree (node.Node): The tree in which to find the minimum attribute.
            Each node must expose ``children`` (an iterable of nodes,
            possibly empty or None) and ``attributes`` (a mapping).
        attr (str): the attribute's key.

    Return:
        The stored attribute value whose float conversion is smallest
        (the first one met on ties), or ``np.inf`` if no node has ``attr``.
    """
    agenda = deque([tree])  # deque: O(1) pops from the left
    mini = np.inf
    # Float counterpart of `mini`. Comparing against this rather than the
    # raw stored value avoids a float-vs-str TypeError as soon as a second
    # node carries a non-numeric (e.g. string) attribute.
    mini_as_float = np.inf
    while agenda:
        node = agenda.popleft()
        if node.children:
            agenda.extend(node.children)
        if attr in node.attributes:
            value = node.attributes[attr]
            value_as_float = float(value)
            if value_as_float < mini_as_float:
                mini = value
                mini_as_float = value_as_float
    return mini