# -*- coding: utf-8 -*-
# !/usr/bin/env python
"""author: Sacha Beniamine.
This module is used to generalize the contexts and alternations of patterns.
"""
from collections import Counter
from .contexts import Context
from .patterns import Pattern
import logging
from itertools import zip_longest
log = logging.getLogger("Qumin")
# [docs]
def generalize_alt(patterns, inv):
    """Build the least general alternation that still covers every pattern.

    Wherever all patterns exhibit the same concrete change, that change is
    kept as is; wherever they disagree, the corresponding position of the
    first pattern's generalized alternation (``_gen_alt``) is used instead.

    Arguments:
        patterns: the patterns to generalize. The first one supplies the
            cells and the generalized alternation.
        inv: handed over unchanged to each pattern's ``to_alt``.

    Returns:
        The first pattern's own ``alternation`` when the patterns share a
        single exact alternation or when the first pattern has no
        generalized alternation; otherwise a dict mapping each of the two
        cells to a tuple of parts, every part being a list.
    """
    first = patterns[0]
    exact_alts = {pat.to_alt(inv, exhaustive_blanks=True) for pat in patterns}
    if len(exact_alts) < 2 or first._gen_alt is None:
        return first.alternation

    cell_a, cell_b = first.cells

    # An alternation is stored as {cell: parts}; parts is a tuple of
    # positions, and positions is a tuple of phoneme positions, e.g.
    #     {c1: (('a','b'),('c',)), c2: (('A','B'),('C',))}
    # Zipping both cells re-nests this as parts -> cells -> positions:
    #     [ (('a','b'), ('A','B')),
    #       (('c',), ('C',)) ]
    general_parts = list(zip(first._gen_alt[cell_a], first._gen_alt[cell_b]))
    # Same re-nesting for the concrete alternations, with one more level:
    # parts -> patterns -> cells -> positions.
    observed_parts = list(zip(*(zip(pat.alternation[cell_a], pat.alternation[cell_b])
                                for pat in patterns)))

    result_parts = []
    # Outer loop: one iteration per alternation part (the stretch between blanks).
    for part_idx, (general_a, general_b) in enumerate(general_parts):
        # Pair up both cells position by position: (('a','A'), ('b','B')).
        general_positions = list(zip(general_a, general_b))
        # Positions -> patterns -> cells; the shorter cell is padded with "".
        observed_positions = list(zip(*(zip_longest(*cell_pair, fillvalue="")
                                        for cell_pair in observed_parts[part_idx])))
        side_a, side_b = [], []
        # Inner loop: one iteration per changed position inside the part.
        for pos_idx, (wide_a, wide_b) in enumerate(general_positions):
            seen = set(observed_positions[pos_idx])
            if len(seen) <= 1:
                # All patterns agree: keep the concrete change, dropping padding.
                narrow_a, narrow_b = seen.pop()
                if narrow_a:
                    side_a.append(narrow_a)
                if narrow_b:
                    side_b.append(narrow_b)
            else:
                # Patterns disagree here: fall back on the generalized position.
                side_a.append(wide_a)
                side_b.append(wide_b)
        result_parts.append((side_a, side_b))

    # Transpose back from parts -> cells to cells -> parts.
    return dict(zip([cell_a, cell_b], zip(*result_parts)))
# [docs]
def generalize_patterns(pats, inv):
    """Generalize these patterns' context.

    Arguments:
        pats (Iterable[:class:`Pattern`]): the patterns to generalize.
            Any iterable is accepted; it is materialised into a list first.
        inv: an Inventory instance

    Return:
        :class:`Pattern`: the single input pattern itself when only one is
        given, otherwise a new pattern whose lexemes are the union of the
        lexemes of all input patterns.

    Raises:
        IndexError: if `pats` is empty.
    """
    # The code below indexes, measures and iterates `pats` several times,
    # so a one-shot iterable (generator, set, ...) must be materialised once.
    pats = list(pats)
    p0 = pats[0]
    if len(pats) == 1:
        return p0

    # Building the message stringifies every pattern; skip that work
    # entirely unless debug output is actually going to be emitted.
    if log.isEnabledFor(logging.DEBUG):
        log.debug(f"Merging of {len(pats)} patterns: {[str(x) for x in pats]}")

    # Generalize the alternation if possible
    alternation = generalize_alt(pats, inv)

    # Generalize the context if possible: contexts are only merged when
    # `p0._is_max_gen(inv)` is falsy; otherwise p0's context is reused.
    context = p0.context
    if not p0._is_max_gen(inv):
        context = Context.merge([p.context for p in pats], inv)

    new = Pattern(alternation, context, inv)
    new.lexemes = set().union(*(p.lexemes for p in pats))
    return new
# [docs]
def incremental_generalize_patterns(pats, inv):
    """Merge patterns incrementally as long as the pattern has the same coverage.

    Attempt to merge the patterns two by two, and refrain from doing so if the
    merged pattern doesn't match all the lexemes that led to its inference.
    Also attempt to merge together patterns that have not been merged with others.

    Arguments:
        pats: the patterns (must support ``len()`` and iteration)
        inv: Inventory instance

    Returns:
        List[:class:`Pattern`]: a list of patterns, at best of length 1, at
        worst of the same length as the input. When `pats` holds zero or one
        pattern there is nothing to merge and `pats` itself is returned.
    """
    # Nothing to merge; this also keeps an empty input from failing on pats[0].
    if len(pats) <= 1:
        return pats

    def correct(p, a, b):
        """Return whether the pattern p is correct for the forms a and b and the specified cells."""
        # The pattern has to map a onto b and, with the cells reversed, b back onto a.
        return (p.apply(a, p.cells, inv, raiseOnFail=False) == b) \
            and (p.apply(b, p.cells[::-1], inv, raiseOnFail=False) == a)

    # Compute each exact alternation once; it serves both the frequency
    # count and the sort key.
    exact_alternations = [x.to_alt(inv, exhaustive_blanks=True) for x in pats]
    counts = Counter(exact_alternations)
    # Most frequent alternation first. sorted() is stable, also with
    # reverse=True, so ties keep their input order.
    ranked = sorted(zip(exact_alternations, pats),
                    key=lambda pair: counts[pair[0]], reverse=True)
    pats = [pat for _, pat in ranked]

    merged = [pats[0]]
    # Only build the (potentially long) message when debug output is enabled.
    if log.isEnabledFor(logging.DEBUG):
        log.debug(f"Prudent incremental merging of {len(pats)} patterns: {[str(x) for x in pats]}")

    for pat in pats[1:]:
        pat_is_merged = False
        for i, candidate in enumerate(merged):
            lexemes = candidate.lexemes
            # No merge is attempted when one lexeme set contains the other
            # (identical sets included).
            if lexemes.issubset(pat.lexemes) or lexemes.issuperset(pat.lexemes):
                continue
            new = generalize_patterns([candidate, pat], inv)
            # Each lexeme entry unpacks into three items; only the two forms
            # are needed to check the merged pattern.
            if all(correct(new, a, b) for _, a, b in new.lexemes):
                merged[i] = new
                pat_is_merged = True
                break
        if not pat_is_merged:
            merged.append(pat)
    return merged