Source code for qumin.representations.generalize

# -*- coding: utf-8 -*-
# !/usr/bin/env python
"""author: Sacha Beniamine.

This module is used to generalize the alternations and contexts of patterns.
"""

from collections import Counter
from .contexts import Context
from .patterns import Pattern
import logging
from itertools import zip_longest

log = logging.getLogger("Qumin")


def generalize_alt(patterns, inv):
    """Use the generalized alternation, using features when possible rather than segments.

    The generalized alternation of the first pattern is only used at the
    positions where the given patterns actually disagree; wherever all
    patterns share the same change, that specific change is kept.

    Arguments:
        patterns (Sequence[:class:`Pattern`]): the patterns to merge. The first
            one provides the cells, the generalized alternation (``_gen_alt``)
            and the fallback alternation.
        inv: Inventory instance, passed through to ``Pattern.to_alt``.

    Returns:
        dict: ``p0.alternation`` unchanged if all patterns have the same exact
        alternation or if the first pattern has no generalized alternation;
        otherwise a new ``{cell: parts}`` dict where ``parts`` is a tuple
        holding one list of positions per alternation part.
    """
    p0 = patterns[0]
    exact_alternations = {p.to_alt(inv, exhaustive_blanks=True) for p in patterns}
    if len(exact_alternations) < 2 or p0._gen_alt is None:
        return p0.alternation
    c1, c2 = p0.cells

    # At first, alternations are {cell: parts},
    # parts are tuple(positions)
    # positions are tuples of phoneme positions
    # The order of embedding is cell, then parts, then positions
    # Ex: {c1: (('a','b'),('c',)), c2: (('A','B'),('C',))}

    # This gets us to parts, then cells, then positions
    # [ (('a','b'), ('A','B')),
    #   (('c',), ('C',)) ]
    generalized = list(zip(p0._gen_alt[c1], p0._gen_alt[c2]))
    # Same arrangement for the specific alternations, with one more level:
    # parts, then patterns, then cells, then positions.
    specific = list(zip(*(zip(p.alternation[c1], p.alternation[c2]) for p in patterns)))
    minimal_generalization = []

    # Iterate first over parts
    for i, generalized_part in enumerate(generalized):
        # Iterates on alternation parts, between blanks
        # A part looks like: (('a','b'), ('A','B')),
        # We now want positions, then cells: (('a','A'), ('b','B')),
        generalized_positions = list(zip(*generalized_part))
        # Arrange by phoneme position, then patterns, then cells.
        # Shorter sides are padded with "" so that insertions/deletions align.
        specific_parts = list(zip(*(zip_longest(*p, fillvalue="") for p in specific[i])))
        minimal_gen_part = ([], [])
        for j, generalized_pos in enumerate(generalized_positions):
            # Iterates on each change in the part
            specific_pos = set(specific_parts[j])
            if len(specific_pos) > 1:
                # The patterns disagree here: fall back on the generalized change.
                minimal_gen_part[0].append(generalized_pos[0])
                minimal_gen_part[1].append(generalized_pos[1])
            else:
                # All patterns share this change: keep it, skipping "" padding.
                change = specific_pos.pop()
                if change[0]:
                    minimal_gen_part[0].append(change[0])
                if change[1]:
                    minimal_gen_part[1].append(change[1])
        minimal_generalization.append(minimal_gen_part)

    # Back to cells, then parts, then positions.
    minimal_generalization = zip(*minimal_generalization)
    return dict(zip([c1, c2], minimal_generalization))


def generalize_patterns(pats, inv):
    """Generalize these patterns' context.

    Arguments:
        pats (Sequence[:class:`Pattern`]): the patterns to generalize
            (must be non-empty, it is indexed and measured with ``len``).
        inv: an Inventory instance

    Return:
        :class:`Pattern`: a new pattern, whose lexemes are the union of the
        lexemes of all input patterns. If `pats` holds a single pattern,
        that pattern itself is returned.
    """
    p0 = pats[0]
    if len(pats) == 1:
        return p0

    # Only build the (potentially long) list of pattern strings when needed.
    if log.isEnabledFor(logging.DEBUG):
        log.debug("Merging of %d patterns: %s", len(pats), [str(x) for x in pats])

    # Generalize the alternation if possible
    alternation = generalize_alt(pats, inv)

    # Generalize the context if possible
    context = p0.context
    if not p0._is_max_gen(inv):
        context = Context.merge([p.context for p in pats], inv)

    new = Pattern(alternation, context, inv)
    new.lexemes = set().union(*(p.lexemes for p in pats))
    return new


def incremental_generalize_patterns(pats, inv):
    """Merge patterns incrementally as long as the pattern has the same coverage.

    Attempt to merge each patterns two by two, and refrain from doing so if the
    pattern doesn't match all the lexemes that lead to its inference.
    Also attempt to merge together patterns that have not been merged with others.

    Patterns are processed from the most to the least frequent exact
    alternation. A pattern is only tentatively merged with an already kept
    pattern when neither set of lexemes includes the other.

    Arguments:
        pats: the patterns
        inv: Inventory instance

    Returns:
        List[:class:`Pattern`]: a list of patterns, at best of length 1,
        at worst of the same length as the input. An input with zero or one
        pattern is returned unchanged.
    """
    # Nothing to merge; an empty input would otherwise fail on ``pats[0]`` below.
    if len(pats) <= 1:
        return pats

    def correct(p, a, b):
        """Return whether the pattern p is correct for the forms a and b and the specified cells."""
        return (p.apply(a, p.cells, inv, raiseOnFail=False) == b
                and p.apply(b, p.cells[::-1], inv, raiseOnFail=False) == a)

    # Compute each exact alternation once and reuse it as the sort key.
    exact_alternations = [x.to_alt(inv, exhaustive_blanks=True) for x in pats]
    counts = Counter(exact_alternations)
    ranked = sorted(zip(pats, exact_alternations),
                    key=lambda pair: counts[pair[1]],
                    reverse=True)
    pats = [pattern for pattern, _ in ranked]
    merged = [pats[0]]

    # Only build the (potentially long) list of pattern strings when needed.
    if log.isEnabledFor(logging.DEBUG):
        log.debug("Prudent incremental merging of %d patterns: %s",
                  len(pats), [str(x) for x in pats])

    for pat in pats[1:]:
        for i, candidate in enumerate(merged):
            lexemes = candidate.lexemes
            if lexemes.issubset(pat.lexemes) or lexemes.issuperset(pat.lexemes):
                continue
            new = generalize_patterns([candidate, pat], inv)
            # Keep the merge only if it still predicts every pair of forms
            # (in both directions) it was inferred from.
            if all(correct(new, a, b) for _, a, b in new.lexemes):
                merged[i] = new
                break
        else:
            # No existing pattern could absorb this one.
            merged.append(pat)
    return merged