Source code for qumin.representations.patterns

# -*- coding: utf-8 -*-
# !/usr/bin/env python3
"""author: Sacha Beniamine.

This module addresses the modeling of inflectional alternation patterns."""

import logging
import re
from collections import defaultdict
from copy import deepcopy
from itertools import groupby, zip_longest
# External tools

from . import alignment
from .contexts import Context
from .quantity import one, optional, some, kleenestar
from .segments import Form, _regex_or, Inventory
# Our modules

log = logging.getLogger("Qumin")



def _replace_alternation(matchgroups, replacements, inventory):
    """ Replace all matches in matching groups using replacements.

    Args:
        matches (iterable of str): an iterable of input sequences which match the rule (should cover the entire form)
        replacements (iterable of str|None|tuple): an iterable of replacements.
            Replacements can be:
                - A tuple symbolizing a bijective phonological function
                - None if no replacement is to be made (copy matched characters)
                - characters by which to replace the match
        inventory (segments.Inventory): sound inventory

    Returns:
        a space separated string

    Examples:
        In this example,
        - the first match, "t a " is copied as is,
        - the second match, "t " is transformed by consonant voicing
        - the third match, "a " is replaced by "i"
        >>> inv = Inventory.from_file("tests/data/frenchipa.csv")
        >>> matches = ("t a ",  "t ",                   "a ")
        >>> repl    = (None,    (set("ptk"),set("bdg")), "i")
        >>> _replace_alternation(matches, repl, inv)
        't a d i '
    """

    def iter_replacements():
        for chars, repl in zip(matchgroups, replacements):
            chars = chars.strip()
            t = type(repl)
            if repl is None:  # no change
                yield chars
            elif not repl:  # repl is the empty string, don't yield
                continue
            elif t is str:  # change by substitution
                yield repl
            elif t is tuple:  # change by phonological func
                yield inventory.get_from_transform(chars, repl)

    return " ".join(iter_replacements()) + " "


def are_all_identical(iterable):
    """Test whether all elements in the iterable are identical.

    Arguments:
        iterable: a sized collection of hashable elements (e.g. a tuple of
            aligned segments). An empty collection or None has no elements
            to compare and is reported as not identical.

    Returns:
        bool: True if the collection is non-empty and all of its elements
        are equal, False otherwise.

    Example:
        >>> are_all_identical(("a", "a", "a"))
        True
        >>> are_all_identical(("a", "b"))
        False
        >>> are_all_identical(())
        False
    """
    # bool() ensures a real boolean is returned for empty input,
    # rather than the (falsy) input object itself.
    return bool(iterable) and len(set(iterable)) == 1
def _iter_alternation(alt, inv): """ Group alternations into sequences of segments or phonological transfomations. An alternation part is a sequence of strings or frozenset. Each string represents either: - A segment - A frozenset representing a class of segment (which forms part of a phonological transformation) This iterates by grouping contiguous segments together, and classes of segments separately. Example: >>> inv = Inventory.from_file("tests/data/frenchipa.csv") >>> alt_members = _iter_alternation(['a', 'b', 'a', frozenset(('e', 'u'))], inv) >>> list(alt_members) == [(True, ['a', 'b', 'a']), (False, frozenset({'e', 'u'}))] True Args: alt (iterable of str or frozenset): An alternation part. Yields: Iterator of pairs of is_segment, then either a sequence of segments or a frozenset. """ for is_segment, group in groupby(alt, lambda x: inv.is_leaf(x)): if is_segment: yield is_segment, list(group) else: for x in group: yield is_segment, x
class NotApplicable(Exception):
    """Error signalling that a :class:`Pattern` can't be applied to a form."""
class Pattern(object):
    r"""Represent the alternation pattern between two forms.

    Applying the pattern to one of the original forms yields the second one.

    As an example, we will use the following alternation
    in a present verb of french:

    ========================== ========================== ==========================
    cells                      Forms                      Transcription
    ========================== ========================== ==========================
    prs.1.sg ⇌ prs.2.pl        j'amène ⇌ vous amenez      amEn ⇌ amənE
    ========================== ========================== ==========================

    Attributes:
        score: pattern score, 0 at initialization.
        lexemes (set): empty at initialization.
        alternation (dict): cells to alternating material (list of tuples).
        context: the Context in which the alternation applies.
        cells (tuple): the cell labels, in the order of the alternation's keys.

    Example:
        >>> inv = Inventory.from_file("tests/data/frenchipa.csv")
        >>> cells = ("prs.1.sg", "prs.2.pl")
        >>> forms = (Form("a m E n"), Form("a m Ø n E"))
        >>> p = Pattern.from_forms(cells, forms, inv)
        >>> type(p)
        <class 'qumin.representations.patterns.Pattern'>
        >>> p
        E_ ⇌ Ø_E / am_n_ <0>
        >>> p.apply(Form("a m E n"), cells, inv)
        Form(a m Ø n E)
    """

    def __lt__(self, other):
        """Sort on lexicographic order.

        There is no reason to sort patterns,
        but Pandas wants to do it from time to time,
        this is only implemented to avoid Pandas complaining.

        Example:
            >>> inv = Inventory.from_file("tests/data/frenchipa.csv")
            >>> cells = ("prs.1.sg", "prs.2.pl")
            >>> forms = (Form("a m E n"), Form("a m Ø n E"))
            >>> forms2 = (Form("b w a"), Form("b y v E"))
            >>> p1 = Pattern.from_forms(cells, forms, inv)
            >>> p2 = Pattern.from_forms(cells, forms2, inv)
            >>> p1 < p2
            True
        """
        return str(self) < str(other)

    def __init__(self, alternation, context, inv):
        """ Constructor for Patterns.

        Arguments:
            alternation (dict): Dictionary of cells to alternating material (list of tuples).
                The keys, in order, become the pattern's cells.
            context (Context): a Context instance
            inv: sounds Inventory
        """
        self.score = 0
        self.lexemes = set()
        self.alternation = alternation
        self.context = context
        self.cells = tuple(alternation)
        self._regex, self._repl = self._create_regex(inv)
        self._repr = self._make_str_(inv, features=False)
        self._feat_str = self._make_str_(inv, features=True)
        self._find_generalized_alt(inv)

    @classmethod
    def from_forms(cls, cells, forms, inv):
        """ Create a pattern from unaligned forms (aligns them left)

        Arguments:
            cells (Iterable): Cells labels (str), in the same order as the forms.
            forms (Iterable): the forms; their ``tokens`` are aligned.
            inv: sounds Inventory

        Example:
            >>> inv = Inventory.from_file("tests/data/frenchipa.csv")
            >>> cells = ("prs.1.sg", "prs.2.pl")
            >>> forms = (Form("a m E n"), Form("a m Ø n E"))
            >>> p = Pattern.from_forms(cells, forms, inv)
            >>> p
            E_ ⇌ Ø_E / am_n_ <0>
            >>> p.score # is zero at initialization
            0
            >>> p.lexemes # is empty at initialization
            set()
            >>> p.alternation
            {'prs.1.sg': [('E',), ('',)], 'prs.2.pl': [('Ø',), ('E',)]}
            >>> p.context # this is a Context
            ((?:a )(?:m )){}((?:n )){}
            >>> p.cells
            ('prs.1.sg', 'prs.2.pl')
            >>> p._repr
            'E_ ⇌ Ø_E / am_n_'
            >>> p._feat_str
            'E_ ⇌ Ø_E / am_n_'
            >>> p._gen_alt == {'prs.1.sg': ((frozenset({'ɑ̃', 'ɛ̃', 'i', 'j', 'E'}),), ('',)),
            ...               'prs.2.pl': ((frozenset({'ɥ', 'ɔ̃', 'y', 'Ø', 'œ̃'}),), ('E',))}
            True
        """
        aligned = list(alignment.align_left(*[f.tokens for f in forms], fillvalue=""))
        return cls.from_aligned(cells, aligned, inv)

    @classmethod
    def from_aligned(cls, cells, alignment, inv):
        """ Create a pattern from aligned forms.

        Positions where all forms agree become context; each maximal run of
        positions where they differ becomes one alternation, marked in the
        context by a "{}" placeholder.

        Arguments:
            cells (Iterable): Cells labels (str), in the same order.
            alignment (Iterable): Aligned forms to be segmented: one tuple
                per position, holding one element per form.
            inv: sounds Inventory
        """
        alternation = []
        context = []
        comparables = iter(alignment)
        elements = next(comparables, None)
        while elements is not None:
            # Consume a run of identical positions: this is context.
            while elements is not None and are_all_identical(elements):
                context.append(elements[0])
                elements = next(comparables, None)
            # Consume a run of differing positions: this is one alternation.
            if elements is not None and not are_all_identical(elements):
                altbuffer = [[x] for x in elements]
                context.append("{}")
                elements = next(comparables, None)
                while elements and not are_all_identical(elements):
                    for buffer, new in zip(altbuffer, elements):
                        if buffer[-1] == "":
                            # An empty filler is overwritten rather than kept.
                            buffer[-1] = new
                        elif new != "":
                            buffer.append(new)
                    elements = next(comparables, None)
                alternation.append(altbuffer)

        alternation = {cell: [tuple(x) for x in alt]
                       for cell, alt in zip_longest(cells, zip(*alternation), fillvalue=("",))}
        context = Context([(x, one) if x != "{}" else "{}" for x in context], inv)
        # Use cls (like the other factories) so that subclasses get instances of themselves.
        return cls(alternation, context, inv)

    def __deepcopy__(self, memo):
        """ Deep copy of this pattern.

        Can't use the constructor because we are avoiding passing the inventory.
        Every attribute set by the constructor is carried over, so that the copy
        can be applied, compared and printed like the original.
        """
        cls = self.__class__
        copy = cls.__new__(cls)
        copy.context = deepcopy(self.context, memo)
        copy.alternation = deepcopy(self.alternation, memo)
        copy.cells = self.cells
        copy.score = self.score
        copy.lexemes = deepcopy(self.lexemes, memo)
        copy._repr = self._repr
        copy._feat_str = self._feat_str
        copy._regex = deepcopy(self._regex, memo)
        copy._repl = deepcopy(self._repl, memo)
        copy._gen_alt = deepcopy(self._gen_alt, memo)
        # `_str` is not set by the constructor; only carry it over if some
        # other code attached it to this instance.
        if hasattr(self, "_str"):
            copy._str = self._str
        return copy

    @classmethod
    def new_identity(cls, cells, inv):
        """ Identity pattern factory.

        The alternation is empty, and the context is a sequence
        of any number of allowed segments.

        Args:
            cells: Pair of cell for this pattern.
            inv (Inventory): Sound Inventory.

        Returns:
            Pattern: a new identity pattern.

        Example:
            >>> inv = Inventory.from_file("tests/data/frenchipa.csv")
            >>> print(Pattern.new_identity(('A','B'), inv))
             ⇌  / X*
        """
        alternation = {c: [tuple()] for c in cells}
        context = Context([(inv._max, kleenestar)], inv)
        return cls(alternation, context, inv)

    @classmethod
    def from_str(cls, cells, string, inv):
        """ Parse an exported pattern.

        To be parsed back, patterns need to be exported by `repr()`, not `str()`.

        Note:
            Phonemes in context classes are now separated by ","

        Args:
            cells (tuple of str): Cells labels (str).
            string (str): pattern given as a string.
            inv (Inventory): Sound inventory.

        Returns:
            Pattern: a parsed Pattern object.

        Raises:
            ValueError: if the string does not have the shape
                "alternation / context <score>".

        Example:
            >>> inv = Inventory.from_file("tests/data/frenchipa.csv")
            >>> p = Pattern.from_str(('A', 'B'), "ɥ ⇌ yj / {E,O,a,b,d,f,g,i,j,k,l,m,n,p,s,t,u,v,w,y,z,Ø,ŋ,œ̃,ɑ̃,ɔ̃,ɛ̃,ɥ,ɲ,ʁ,ʃ,ʒ}*{b,d,f,g,k,l,m,n,p,s,t,v,z,ŋ,ɲ,ʁ,ʃ,ʒ}_E <58>", inv)
            >>> type(p) is Pattern
            True
            >>> str(p)
            'ɥ ⇌ yj / X*C_E'
            >>> p
            ɥ ⇌ yj / {E,O,a,b,d,f,g,i,j,k,l,m,n,p,s,t,u,v,w,y,z,Ø,ŋ,œ̃,ɑ̃,ɔ̃,ɛ̃,ɥ,ɲ,ʁ,ʃ,ʒ}*{b,d,f,g,k,l,m,n,p,s,t,v,z,ŋ,ɲ,ʁ,ʃ,ʒ}_E <58.0>
            >>> p = Pattern.from_str(('A','B'), "E_ ⇌ Ø_E / am_n_ <0>", inv)
            >>> type(p) is Pattern
            True
            >>> p
            E_ ⇌ Ø_E / am_n_ <0.0>
        """
        quantities = {"": one, "?": optional, "+": some, "*": kleenestar}

        # Longest segments first, so that the regex alternation prefers them.
        simple_segs = sorted((s for s in inv._classes if inv.is_leaf(s)),
                             key=len, reverse=True)
        seg = r"(?:{})".format("|".join(simple_segs))
        classes = r"(?:\{[^\}]+\})"

        def is_class(s):
            """Whether s is written as an exhaustive class, e.g. "{a,b}"."""
            return s is not None and ("," in s) and (s[0], s[-1]) == ("{", "}")

        def get_class(s):
            """Convert a class written "{a,b}" to a frozenset of its members."""
            return frozenset(s[1:-1].split(","))

        def parse_alternation(string, cells):
            """Parse "left ⇌ right" into a dict of cells to list of tuples."""
            regex = r"({classes}|{seg})".format(seg=seg, classes=classes)
            left, right = string.split(" ⇌ ")
            c1, c2 = cells
            alternation = {c1: [], c2: []}
            for segs_l, segs_r in zip_longest(left.split("_"), right.split("_")):
                segs_l = re.findall(regex, segs_l)
                segs_r = re.findall(regex, segs_r)
                alt_l = []
                alt_r = []

                # Re-align classes:
                i, j = 0, 0
                while i < len(segs_l) and j < len(segs_r):
                    l_class = is_class(segs_l[i]) if i < len(segs_l) else False
                    r_class = is_class(segs_r[j]) if j < len(segs_r) else False
                    if l_class and not r_class:
                        segs_l = [""] + segs_l
                    elif r_class and not l_class:
                        segs_r = [""] + segs_r
                    else:
                        i += 1
                        j += 1

                # prepare alternation
                for sl, sr in zip_longest(segs_l, segs_r):
                    if sr is None:
                        alt_l.append(sl)
                    elif sl is None:
                        alt_r.append(sr)
                    else:
                        l_class = is_class(sl)
                        r_class = is_class(sr)
                        if l_class:
                            alt_l.append(get_class(sl))
                        else:
                            alt_l.append(sl)
                        if r_class:
                            alt_r.append(get_class(sr))
                        else:
                            alt_r.append(sr)

                alternation[c1].append(tuple(alt_l))
                alternation[c2].append(tuple(alt_r))
            return alternation

        def parse_context(string):
            """Yield context members: "{}" for blanks, else (sounds, quantity)."""
            regex = r"({classes}|{seg}|_)([+*?]?)".format(seg=seg, classes=classes)
            for s, q in re.findall(regex, string):
                if (s, q) == ("_", ""):
                    yield "{}"
                elif is_class(s):
                    yield get_class(s), quantities[q]
                else:
                    yield s, quantities[q]

        try:
            # re.match returns None on failure, hence the AttributeError.
            alt_str, ctxt_str, score_str = re.match(r"(.*) / (.*) ?<([\d.e-]+)>",
                                                    string).groups()
        except AttributeError as e:
            message = ("I can't create a pattern from this: {}. "
                       "Maybe the pattern has been exported with str and not repr ?"
                       "".format(string))
            raise ValueError(message) from e

        context = Context(list(parse_context(ctxt_str)), inv)
        alternation = parse_alternation(alt_str, cells)
        new = cls(alternation, context, inv)
        new.score = float(score_str)
        new._gen_alt = None
        return new

    def __eq__(self, other):
        """ Pattern equality: we simply check that they are both Pattern
        and their full string representation is identical

        Example:
            >>> inv = Inventory.from_file("tests/data/frenchipa.csv")
            >>> p1 = Pattern.from_str(("A", "B"), "E_ ⇌ Ø_E / am_n_ <0>", inv)
            >>> f1 = Form.from_raw("a m E n", inv)
            >>> f2 = Form.from_raw("a m ə n E", inv)
            >>> p2 = Pattern.from_forms(('A','B'), (f1, f2), inv)
            >>> p1 == p2
            True
            >>> p1 == "E_ ⇌ Ø_E / am_n_ <0>"
            False

        Args:
            other (Pattern): another Pattern

        Returns:
            Whether the two patterns are identical
        """
        return type(self) is Pattern and type(other) is Pattern and str(self) == str(other)

    def __hash__(self):
        """Hash on the str representation, consistently with equality."""
        return hash(str(self))

    def __repr__(self):
        """Return a repr string, for ex: _ ⇌ E / abEs_ <0.5>.

        repr() provides an exportable string, which:
            - Lists all sound classes exhaustively
            - Comprises also the score

        This makes it possible to instantiate back a pattern.
        """
        return '{content} <{score}>'.format(content=self._repr, score=self.score)

    def __str__(self):
        """ Return a str representation, for ex: _ ⇌ E / X+_

        str() provides a human readable string which:
            - Represents sounds classes in shorthand
            - Does not include the score
        """
        return self._feat_str

    def is_identity(self):
        """ Checks whether this pattern is an identity pattern.

        Example:
            >>> inv = Inventory.from_file("tests/data/frenchipa.csv")
            >>> p = Pattern.new_identity(("A", "B"), inv)
            >>> p.is_identity()
            True
        """
        return all(self.alternation[x] == [()] for x in self.cells)

    def _make_str_(self, inv, features=True, reverse=False):
        """ Generic string builder used to construct representations.

        Arguments:
            inv: sounds Inventory
            features (bool): whether sound classes may be written in shorthand.
            reverse (bool): whether to swap the two sides of the alternation.

        Returns:
            str: "alternation / context".
        """
        alternation = self._format_alt(inv, features=features)
        if reverse:
            alternation = " ⇌ ".join("_".join(alt) for alt in alternation[::-1])
        else:
            alternation = " ⇌ ".join("_".join(alt) for alt in alternation)
        context = self.context.to_str(inv, mode=int(features) + 1)
        return alternation + " / " + context

    def to_alt(self, inv, exhaustive_blanks=True, use_gen=False, **kwargs):
        """ Build a string representing the alternation

        Example:
            >>> inv = Inventory.from_file("tests/data/frenchipa.csv")
            >>> cells = ("prs.1.sg", "prs.2.pl")
            >>> forms = (Form("a m E n"), Form("a m Ø n E"))
            >>> p = Pattern.from_forms(cells, forms, inv)
            >>> p.alternation
            {'prs.1.sg': [('E',), ('',)], 'prs.2.pl': [('Ø',), ('E',)]}
            >>> p.to_alt(inv)
            '_E_ ⇌ _Ø_E'
            >>> p.to_alt(inv, exhaustive_blanks=False)
            'E_ ⇌ Ø_E'
            >>> p.to_alt(inv, use_gen=True)
            '_[-arro]_ ⇌ _[+arro]_E'

        Arguments:
            inv: sounds Inventory
            exhaustive_blanks (bool): Whether initial and final contexts should be marked by a filler.
            use_gen (bool): Whether the alternation should use phonological generalizations
                (when available).

        Returns:
            A string representing the alternation,
            with contexts positions replaced by the filler "_".
        """
        filler = "_"

        def add_ellipsis(alt, initial, final):
            if alt == [""]:
                return filler
            else:
                flattened = ["".join(str(x) for x in affix) for affix in alt]
                return initial + filler.join(flattened) + final

        initial = "" if (not self.context[0].blank or not exhaustive_blanks) else filler
        final = "" if (self.context[-1].blank or not exhaustive_blanks) else filler

        swap = bool(use_gen and self._gen_alt)
        if swap:
            # Temporarily format the generalized alternation instead.
            tmp_alt = self.alternation
            self.alternation = self._gen_alt
        try:
            result = [add_ellipsis(alt, initial, final) for alt in self._format_alt(inv)]
        finally:
            # Always restore the real alternation, even if formatting failed.
            if swap:
                self.alternation = tmp_alt
        if swap:
            self._repr = self._make_str_(inv, features=False)
            self._feat_str = self._make_str_(inv, features=True)

        return " ⇌ ".join(result)

    def _iter_alt(self, **kwargs):
        """Generator of formatted alternating material for each cell."""
        for cell in self.cells:
            yield ["".join(segs) for segs in self.alternation[cell]]

    def _create_regex(self, inv):
        """Create regexes and replacement strings for this pattern.

        Returns:
            tuple: a dict of cells to compiled regexes, and a dict of cells to
            lists of replacements (one replacement per regex group).

        Example:
            >>> inv = Inventory.from_file("tests/data/frenchipa.csv")
            >>> cells = ("prs.1.sg", "prs.2.pl")
            >>> forms = (Form("a m E n"), Form("a m Ø n E"))
            >>> p = Pattern.from_forms(cells, forms, inv)
            >>> p
            E_ ⇌ Ø_E / am_n_ <0>
            >>> p._repl # Calls _create_regex if needed
            {'prs.1.sg': [None, 'E', None, ''], 'prs.2.pl': [None, 'Ø', None, 'E']}
            >>> p._regex # Calls _create_regex if needed
            {'prs.1.sg': re.compile('^((?:a )(?:m ))((?:E ))((?:n ))()$'), 'prs.2.pl': re.compile('^((?:a )(?:m ))((?:Ø ))((?:n ))((?:E ))$')}
        """
        c1, c2 = self.cells

        # Build alternation as list of zipped segments / transformations
        alternances = []
        for left, right in zip(self.alternation[c1], self.alternation[c2]):
            alternances.append(
                list(zip_longest(_iter_alternation(left, inv),
                                 _iter_alternation(right, inv),
                                 fillvalue=(False, ""))))

        regex = {c1: "", c2: ""}
        repl = {c1: [], c2: []}

        for i, group in enumerate(self.context):
            c = group.to_str(inv, mode=0).format("")
            regex[c1] += c
            regex[c2] += c
            # Context groups are copied as is.
            repl[c1].append(None)
            repl[c2].append(None)
            if group.blank:
                # alternation
                # We build one regex group for each continuous sequence of segments
                # and each transformation
                for (is_segments_1, chars_1), (is_segments_2, chars_2) in alternances[i]:
                    if is_segments_1 or is_segments_2:
                        # Substitution replacement: pass directly the target segments
                        # (this is a string, possibly empty)
                        repl[c1].append(" ".join(chars_1))
                        repl[c2].append(" ".join(chars_2))

                        # Regex matches these segments as one
                        regex[c1] += "({})".format(
                            "".join(inv.regex(x) if x else "" for x in chars_1))
                        regex[c2] += "({})".format(
                            "".join(inv.regex(x) if x else "" for x in chars_2))
                    else:
                        # Transformation replacement (this is a tuple)
                        repl[c1].append((chars_2, chars_1))
                        repl[c2].append((chars_1, chars_2))

                        # Regex matches these segments as one group
                        regex[c1] += "({})".format(_regex_or(chars_1))
                        regex[c2] += "({})".format(_regex_or(chars_2))

        return {c: re.compile("^" + regex[c] + "$") for c in regex}, repl

    def _find_generalized_alt(self, inv):
        """See if the alternation can be generalized using phonological operations.

        Sets ``self._gen_alt`` to the generalized alternation (a dict of cells to
        tuples of tuples), or to None when no pair of aligned segments could be
        generalized.
        """
        c1, c2 = self.cells
        this_alt = {c1: [], c2: []}
        gen_any = False
        for left, right in zip(self.alternation[c1], self.alternation[c2]):
            gen_left = []
            gen_right = []
            for a, b in zip_longest(left, right, fillvalue=""):
                if a != "" and b != "":
                    A, B = inv.transformation(a, b)
                else:
                    A, B = "", ""
                if len(A) > 1 or len(B) > 1:
                    gen_any = True
                    gen_left.append(A)
                    gen_right.append(B)
                else:
                    gen_left.append(a)
                    gen_right.append(b)
            this_alt[c1].append(tuple(gen_left))
            this_alt[c2].append(tuple(gen_right))

        if gen_any:
            self._gen_alt = dict(zip(self.cells, (tuple(this_alt[x]) for x in self.cells)))
            self._regex, self._repl = self._create_regex(inv)
        else:
            self._gen_alt = None

    def applicable(self, form, cell):
        """Test if this pattern matches a form, i.e. if the pattern is applicable to the form.

        Arguments:
            form (str): a form.
            cell (str): A cell contained in self.cells.

        Returns:
            `bool`: whether the pattern is applicable to the form from that cell.

        Raises:
            KeyError: if the cell is not one of this pattern's cells.
        """
        try:
            regex = self._regex[cell]
        except KeyError as err:
            raise KeyError("Unknown cell {}."
                           " This pattern's cells are {}."
                           "".format(err, " and ".join(self.cells))) from err
        return bool(regex.match(form))

    def apply(self, form, names, inv, raiseOnFail=True):
        """Apply the pattern to a form.

        Arguments:
            form : a form, assumed to belong to the cell `names[0]`.
            names : apply to a form of cell `names[0]`
                to produce a form of cell `names[1]` (default:`self.cells`).
                Patterns being non-oriented, it is better to use the names argument.
            inv (segments.Inventory): sound inventory
            raiseOnFail (bool):
                defaults to True. If true, raise an error when the pattern is not
                applicable to the form. If False, return None instead.

        Returns:
            form belonging the opposite cell.

        Raises:
            NotApplicable: if the pattern does not match the form
                and `raiseOnFail` is true.
        """
        from_cell, to_cell = names if names else self.cells
        reg = self._regex[from_cell]
        string, nb_subs = reg.subn(
            lambda x: _replace_alternation(x.groups(""), self._repl[to_cell], inv),
            form)
        if nb_subs == 0 and (not self.applicable(form, from_cell)):
            if raiseOnFail:
                raise NotApplicable("The context {} from the pattern {} and cells {} -> {}"
                                    " doesn't match the form \"{}\""
                                    "".format(self._regex[from_cell].pattern, self,
                                              from_cell, to_cell, form))
            else:
                return None
        return Form(string)

    def _is_max_gen(self, inv):
        """Whether the context only contains blanks and any number of the
        inventory's maximal class (``inv._max``)."""
        maxi_seg = inv._max
        return all([x in [(maxi_seg, kleenestar), "{}"] for x in self.context])

    def _format_alt(self, inv, features=True):
        """Get formatted alternating material for each cell.

        Arguments:
            inv: sounds Inventory
            features (bool): whether classes of segments may be written as
                features, when that is not longer than listing the segments.

        Returns:
            tuple: two lists of str, one per cell, with one str per alternation.
        """

        def format_as_chars(left, right):
            return ("{{{}}}".format(",".join(sorted(left))),
                    "{{{}}}".format(",".join(sorted(right))))

        def format_as_features(left, right):
            feats_left, feats_right = inv.get_transform_features(left, right)
            feats_left = "[{}]".format(" ".join(sorted(feats_left)))
            feats_right = "[{}]".format(" ".join(sorted(feats_right)))
            chars_left, chars_right = format_as_chars(left, right)
            # Prefer the feature notation only when it is not longer.
            if len(feats_left) + len(feats_right) <= len(chars_left) + len(chars_right):
                return feats_left, feats_right
            return chars_left, chars_right

        if features:
            format_regular_change = format_as_features
        else:
            format_regular_change = format_as_chars

        c1, c2 = self.cells
        alternation = zip(self.alternation[c1], self.alternation[c2])
        c1_alt = []
        c2_alt = []
        for left, right in alternation:
            formatted_left = ""
            formatted_right = ""
            for seg_left, seg_right in zip_longest(left, right, fillvalue=""):
                if inv.is_leaf(seg_left) and inv.is_leaf(seg_right):
                    formatted_left += seg_left
                    formatted_right += seg_right
                else:
                    l, r = format_regular_change(seg_left, seg_right)
                    formatted_left += l
                    formatted_right += r
            c1_alt.append(formatted_left)
            c2_alt.append(formatted_right)
        return c1_alt, c2_alt