# -*- coding: utf-8 -*-
# !/usr/bin/env python3
"""author: Sacha Beniamine.
This module addresses the modeling of inflectional alternation patterns."""
import logging
import re
from collections import defaultdict
from copy import deepcopy
from itertools import groupby, zip_longest
# Package-local modules
from . import alignment
from .contexts import Context
from .quantity import one, optional, some, kleenestar
from .segments import Form, _regex_or, Inventory
# Module-level logger
log = logging.getLogger("Qumin")
def _replace_alternation(matchgroups, replacements, inventory):
""" Replace all matches in matching groups using replacements.
Args:
matches (iterable of str): an iterable of input sequences which match the rule (should cover the entire form)
replacements (iterable of str|None|tuple): an iterable of replacements.
Replacements can be:
- A tuple symbolizing a bijective phonological function
- None if no replacement is to be made (copy matched characters)
- characters by which to replace the match
inventory (segments.Inventory): sound inventory
Returns:
a space separated string
Examples:
In this example,
- the first match, "t a " is copied as is,
- the second match, "t " is transformed by consonant voicing
- the third match, "a " is replaced by "i"
>>> inv = Inventory.from_file("tests/data/frenchipa.csv")
>>> matches = ("t a ", "t ", "a ")
>>> repl = (None, (set("ptk"),set("bdg")), "i")
>>> _replace_alternation(matches, repl, inv)
't a d i '
"""
def iter_replacements():
for chars, repl in zip(matchgroups, replacements):
chars = chars.strip()
t = type(repl)
if repl is None: # no change
yield chars
elif not repl: # repl is the empty string, don't yield
continue
elif t is str: # change by substitution
yield repl
elif t is tuple: # change by phonological func
yield inventory.get_from_transform(chars, repl)
return " ".join(iter_replacements()) + " "
[docs]
def are_all_identical(iterable):
    """Test whether all elements in the iterable are identical.

    Args:
        iterable: a collection of hashable elements (or None).

    Returns:
        bool: True if the collection is non-empty and holds a single
        distinct value, False otherwise (including for empty input or None).

    Example:
        >>> are_all_identical(("a", "a"))
        True
        >>> are_all_identical(("a", "b"))
        False
        >>> are_all_identical(())
        False
    """
    # bool() guarantees a real boolean: a bare `iterable and ...` would
    # return the empty collection (or None) itself for falsy input.
    return bool(iterable) and len(set(iterable)) == 1
def _iter_alternation(alt, inv):
""" Group alternations into sequences of segments or phonological transfomations.
An alternation part is a sequence of strings or frozenset. Each string represents either:
- A segment
- A frozenset representing a class of segment (which forms part of a phonological transformation)
This iterates by grouping contiguous segments together, and classes of segments separately.
Example:
>>> inv = Inventory.from_file("tests/data/frenchipa.csv")
>>> alt_members = _iter_alternation(['a', 'b', 'a', frozenset(('e', 'u'))], inv)
>>> list(alt_members) == [(True, ['a', 'b', 'a']), (False, frozenset({'e', 'u'}))]
True
Args:
alt (iterable of str or frozenset): An alternation part.
Yields:
Iterator of pairs of is_segment, then either a sequence of segments or a frozenset.
"""
for is_segment, group in groupby(alt, lambda x: inv.is_leaf(x)):
if is_segment:
yield is_segment, list(group)
else:
for x in group:
yield is_segment, x
[docs]
class NotApplicable(Exception):
    """Error signalling that a :class:`Pattern` can't be applied to a form."""
[docs]
class Pattern(object):
    r"""Represent the alternation pattern between two forms.

    Applying the pattern to one of the original forms yields the second one.

    As an example, we will use the following alternation
    in a present verb of french:

    ========================== ========================== ==========================
    cells                      Forms                      Transcription
    ========================== ========================== ==========================
    prs.1.sg ⇌ prs.2.pl        j'amène ⇌ vous amenez      amEn ⇌ amənE
    ========================== ========================== ==========================

    Example:
        >>> inv = Inventory.from_file("tests/data/frenchipa.csv")
        >>> cells = ("prs.1.sg", "prs.2.pl")
        >>> forms = (Form("a m E n"), Form("a m Ø n E"))
        >>> p = Pattern.from_forms(cells, forms, inv)
        >>> type(p)
        <class 'qumin.representations.patterns.Pattern'>
        >>> p
        E_ ⇌ Ø_E / am_n_ <0>
        >>> p.apply(Form("a m E n"), cells, inv)
        Form(a m Ø n E)
    """
def __lt__(self, other):
"""Sort on lexicographic order.
There is no reason to sort patterns,
but Pandas wants to do it from time to time,
this is only implemented to avoid Pandas complaining.
Example:
>>> inv = Inventory.from_file("tests/data/frenchipa.csv")
>>> cells = ("prs.1.sg", "prs.2.pl")
>>> forms = (Form("a m E n"), Form("a m Ø n E"))
>>> forms2 = (Form("b w a"), Form("b y v E"))
>>> p1 = Pattern.from_forms(cells, forms, inv)
>>> p2 = Pattern.from_forms(cells, forms2, inv)
>>> p1 < p2
True
"""
return str(self) < str(other)
[docs]
def __init__(self, alternation, context, inv):
""" Constructor for Patterns.
Arguments:
cells (Iterable): Cells labels (str), in the same order.
alternation (dict): Dictionary of cells to alternating material (list of tuples)
context (bool): a Context instance
inv: sounds Inventory
"""
self.score = 0
self.lexemes = set()
self.alternation = alternation
self.context = context
self.cells = tuple(alternation)
self._regex, self._repl = self._create_regex(inv)
self._repr = self._make_str_(inv, features=False)
self._feat_str = self._make_str_(inv, features=True)
self._find_generalized_alt(inv)
[docs]
@classmethod
def from_aligned(cls, cells, alignment, inv):
    """Create a pattern from aligned forms.

    The alignment is read column by column. Columns in which all cells
    share the same value become context; runs of contiguous columns in
    which the cells differ are merged into a single alternation, and a
    blank ("{}") is recorded in the context at that position.

    Arguments:
        cells (Iterable): Cells labels (str), in the same order.
        alignment (Iterable): Aligned forms, as an iterable of columns
            (one value per cell in each column; "" marks a gap).
        inv (Inventory): sound inventory.

    Returns:
        A new pattern, instance of the class this method is called on.
    """
    alternation = []
    context = []
    comparables = iter(alignment)
    elements = next(comparables, None)
    while elements is not None:
        # Consume a run of identical columns: this is context.
        while elements is not None and are_all_identical(elements):
            context.append(elements[0])
            elements = next(comparables, None)
        # Consume a run of differing columns: this is one alternation.
        if elements is not None and not are_all_identical(elements):
            altbuffer = [[x] for x in elements]
            context.append("{}")
            elements = next(comparables, None)
            while elements and not are_all_identical(elements):
                for buffer, new in zip(altbuffer, elements):
                    if buffer[-1] == "":
                        # A gap is overwritten rather than kept before real material.
                        buffer[-1] = new
                    elif new != "":
                        buffer.append(new)
                elements = next(comparables, None)
            alternation.append(altbuffer)
    # Transpose from "per alternation, per cell" to "per cell, per alternation".
    # With no alternation at all, the fill value gives each cell [()].
    alternation = {cell: [tuple(x) for x in alt]
                   for cell, alt
                   in zip_longest(cells,
                                  zip(*alternation),
                                  fillvalue=("",))}
    context = Context([(x, one) if x != "{}" else "{}" for x in context], inv)
    # Use cls (not a hard-coded Pattern) like the other factories do,
    # so that subclasses get instances of their own type.
    return cls(alternation, context, inv)
def __deepcopy__(self, memo):
""" Deep copy of this pattern.
Can't use the constructor because we are avoiding passing the inventory
"""
cls = self.__class__
copy = cls.__new__(cls)
copy.context = deepcopy(self.context, memo)
copy.alternation = deepcopy(self.alternation, memo)
copy.cells = self.cells
copy.score = self.score
copy._repr = self._repr
copy._str = self._str
copy._regex = deepcopy(self._regex, memo)
copy._feat_str = self._feat_str
copy._gen_alt = deepcopy(self._gen_alt, memo)
return copy
[docs]
@classmethod
def new_identity(cls, cells, inv):
    """Identity pattern factory.

    Each cell gets a single empty alternation, and the context is the
    inventory's ``_max`` class repeated any number of times.

    Args:
        cells: Pair of cell for this pattern.
        inv (Inventory): Sound Inventory.

    Returns:
        Pattern: a new identity pattern.

    Example:
        >>> inv = Inventory.from_file("tests/data/frenchipa.csv")
        >>> Pattern.new_identity(('A','B'), inv).is_identity()
        True
    """
    nothing_changes = {cell: [()] for cell in cells}
    anything = Context([(inv._max, kleenestar)], inv)
    return cls(nothing_changes, anything, inv)
[docs]
@classmethod
def from_str(cls, cells, string, inv):
    """Parse an exported pattern.

    To be parsed back, patterns need to be exported by `repr()`, not `str()`.

    The expected layout is ``<left> ⇌ <right> / <context> <score>``, with the
    score written between angle brackets. The parsed score is stored as a float,
    and the generalized alternation of the new pattern is reset to None.

    Note:
        Phonemes in context classes are now separated by ","

    Args:
        cells (tuple of str): Cells labels (str).
        string (str): pattern given as a string.
        inv (Inventory): Sound inventory.

    Returns:
        Pattern: a parsed Pattern object.

    Raises:
        ValueError: if `string` does not follow the exported layout.

    Example:
        >>> inv = Inventory.from_file("tests/data/frenchipa.csv")
        >>> p = Pattern.from_str(('A', 'B'), "ɥ ⇌ yj / {E,O,a,b,d,f,g,i,j,k,l,m,n,p,s,t,u,v,w,y,z,Ø,ŋ,œ̃,ɑ̃,ɔ̃,ɛ̃,ɥ,ɲ,ʁ,ʃ,ʒ}*{b,d,f,g,k,l,m,n,p,s,t,v,z,ŋ,ɲ,ʁ,ʃ,ʒ}_E <58>", inv)
        >>> type(p) is Pattern
        True
        >>> str(p)
        'ɥ ⇌ yj / X*C_E'
        >>> p
        ɥ ⇌ yj / {E,O,a,b,d,f,g,i,j,k,l,m,n,p,s,t,u,v,w,y,z,Ø,ŋ,œ̃,ɑ̃,ɔ̃,ɛ̃,ɥ,ɲ,ʁ,ʃ,ʒ}*{b,d,f,g,k,l,m,n,p,s,t,v,z,ŋ,ɲ,ʁ,ʃ,ʒ}_E <58.0>
        >>> p = Pattern.from_str(('A','B'), "E_ ⇌ Ø_E / am_n_ <0>", inv)
        >>> type(p) is Pattern
        True
        >>> p
        E_ ⇌ Ø_E / am_n_ <0.0>
    """
    # Quantifier suffixes allowed after a context member.
    quantities = {"": one, "?": optional, "+": some, "*": kleenestar}
    # Longest segments come first: regex alternation is ordered, so this
    # keeps a long segment from being read as a shorter one plus leftovers.
    # NOTE(review): segments are not regex-escaped; assumes none of them
    # contains regex metacharacters -- confirm against the inventories in use.
    simple_segs = sorted((s for s in inv._classes if inv.is_leaf(s)),
                         key=len, reverse=True)
    seg = r"(?:{})".format("|".join(simple_segs))
    classes = r"(?:\{[^\}]+\})"

    def is_class(s):
        """Tell whether `s` is written as a brace-delimited, comma-separated class."""
        return s is not None and ("," in s) and (s[0], s[-1]) == ("{", "}")

    def get_class(s):
        """Turn the textual class "{a,b,c}" into a frozenset of its members."""
        return frozenset(s[1:-1].split(","))

    def parse_alternation(string, cells):
        """Parse the "left ⇌ right" part into a dict of cell to list of tuples."""
        regex = r"({classes}|{seg})".format(seg=seg, classes=classes)
        left, right = string.split(" ⇌ ")
        c1, c2 = cells
        alternation = {c1: [], c2: []}
        # Each "_"-separated chunk is one alternating position.
        for segs_l, segs_r in zip_longest(left.split("_"),
                                          right.split("_")):
            segs_l = re.findall(regex, segs_l)
            segs_r = re.findall(regex, segs_r)
            alt_l = []
            alt_r = []
            # Re-align classes: whenever a class faces a plain segment,
            # pad the side holding the class with an empty string at the front,
            # then compare the same indexes again.
            i, j = 0, 0
            while i < len(segs_l) and j < len(segs_r):
                l_class = is_class(segs_l[i]) if i < len(segs_l) else False
                r_class = is_class(segs_r[j]) if j < len(segs_r) else False
                if l_class and not r_class:
                    segs_l = [""] + segs_l
                elif r_class and not l_class:
                    segs_r = [""] + segs_r
                else:
                    i += 1
                    j += 1
            # prepare alternation: classes become frozensets, segments stay strings;
            # when one side is exhausted, only the other side receives material.
            for sl, sr in zip_longest(segs_l, segs_r):
                if sr is None:
                    alt_l.append(sl)
                elif sl is None:
                    alt_r.append(sr)
                else:
                    l_class = is_class(sl)
                    r_class = is_class(sr)
                    if l_class:
                        alt_l.append(get_class(sl))
                    else:
                        alt_l.append(sl)
                    if r_class:
                        alt_r.append(get_class(sr))
                    else:
                        alt_r.append(sr)
            alternation[c1].append(tuple(alt_l))
            alternation[c2].append(tuple(alt_r))
        return alternation

    def parse_context(string):
        """Yield context members: "{}" for a blank, else (segment or class, quantity)."""
        regex = r"({classes}|{seg}|_)([+*?]?)".format(seg=seg, classes=classes)
        for s, q in re.findall(regex, string):
            if (s, q) == ("_", ""):
                yield "{}"
            elif is_class(s):
                yield get_class(s), quantities[q]
            else:
                yield s, quantities[q]

    try:
        # re.match returns None on failure, hence the AttributeError on .groups()
        alt_str, ctxt_str, score_str = re.match(r"(.*) / (.*) ?<([\d.e-]+)>", string).groups()
    except AttributeError as e:
        message = "I can't create a pattern from this: {}. Maybe the pattern has been exported with str and not repr ?".format(
            string)
        raise ValueError(message) from e

    context = Context(list(parse_context(ctxt_str)), inv)
    alternation = parse_alternation(alt_str, cells)
    new = cls(alternation, context, inv)
    new.score = float(score_str)
    # Discard whatever generalization the constructor computed.
    new._gen_alt = None
    return new
def __eq__(self, other):
""" Pattern equality: we simply check that they are both Pattern and their full string representation is identical
Example:
>>> inv = Inventory.from_file("tests/data/frenchipa.csv")
>>> p1 = Pattern.from_str(("A", "B"), "E_ ⇌ Ø_E / am_n_ <0>", inv)
>>> f1 = Form.from_raw("a m E n", inv)
>>> f2 = Form.from_raw("a m ə n E", inv)
>>> p2 = Pattern.from_forms(('A','B'), (f1, f2), inv)
>>> p1 == p2
True
>>> p1 == "E_ ⇌ Ø_E / am_n_ <0>"
False
Args:
other (Pattern): another Pattern
Returns:
Whether the two patterns are identical
"""
return type(self) is Pattern and type(other) is Pattern and str(self) == str(other)
def __hash__(self):
return hash(str(self))
def __repr__(self):
"""Return a repr string, for ex: _ ⇌ E / abEs_ <0.5>.
repr() provides an exportable string, which:
- Lists all sound classes exhaustively
- Comprises also the score
This makes it possible to instantiate back a pattern.
"""
return '{content} <{score}>'.format(content=self._repr, score=self.score)
def __str__(self):
""" Return a str representation, for ex: _ ⇌ E / X+_
str() provides a human readable string which:
- Represents sounds classes in shorthand
- Does not include the score
"""
return self._feat_str
[docs]
def is_identity(self):
""" Checks whether this pattern is an identity pattern.
Example:
>>> inv = Inventory.from_file("tests/data/frenchipa.csv")
>>> p = Pattern.new_identity(("A", "B"), inv)
>>> p.is_identity()
True
"""
return all(self.alternation[x] == [()] for x in self.cells)
def _make_str_(self, inv, features=True, reverse=False):
""" Generic string builder used to construct representations.
"""
alternation = self._format_alt(inv, features=features)
if reverse:
alternation = " ⇌ ".join("_".join(alt) for alt in alternation[::-1])
else:
alternation = " ⇌ ".join("_".join(alt) for alt in alternation)
context = self.context.to_str(inv, mode=int(features) + 1)
return alternation + " / " + context
[docs]
def to_alt(self, inv, exhaustive_blanks=True, use_gen=False, **kwargs):
""" Build a string representing the alternation
Example:
>>> inv = Inventory.from_file("tests/data/frenchipa.csv")
>>> cells = ("prs.1.sg", "prs.2.pl")
>>> forms = (Form("a m E n"), Form("a m Ø n E"))
>>> p = Pattern.from_forms(cells, forms, inv)
>>> p.alternation
{'prs.1.sg': [('E',), ('',)], 'prs.2.pl': [('Ø',), ('E',)]}
>>> p.to_alt(inv)
'_E_ ⇌ _Ø_E'
>>> p.to_alt(inv, exhaustive_blanks=False)
'E_ ⇌ Ø_E'
>>> p.to_alt(inv, use_gen=True)
'_[-arro]_ ⇌ _[+arro]_E'
Arguments:
exhaustive_blanks (bool): Whether initial and final contexts should be marked by a filler.
use_gen (bool): Whether the alternation should use phonological generalizations (when available).
Returns:
A string representing the alternation, with contexts positions replaced by the filler "_".
"""
filler = "_"
def add_ellipsis(alt, initial, final):
if alt == [""]:
return filler
else:
flattened = ["".join(str(x) for x in affix) for affix in alt]
return initial + filler.join(flattened) + final
initial = "" if (not self.context[0].blank or not exhaustive_blanks) else filler
final = "" if (self.context[-1].blank or not exhaustive_blanks) else filler
if use_gen and self._gen_alt:
tmp_alt = self.alternation
self.alternation = self._gen_alt
result = [add_ellipsis(alt, initial, final) for alt in self._format_alt(inv)]
if use_gen and self._gen_alt:
self.alternation = tmp_alt
self._repr = self._make_str_(inv, features=False)
self._feat_str = self._make_str_(inv, features=True)
return " ⇌ ".join(result)
def _iter_alt(self, **kwargs):
"""Generator of formatted alternating material for each cell."""
for cell in self.cells:
formatted = []
for segs in self.alternation[cell]:
formatted.append("".join(segs))
yield formatted
def _create_regex(self, inv):
    """Create regexes and replacement strings for this pattern.

    For each cell, this builds one anchored regex and one list of
    replacements. Both are grown in lockstep: every regex group appended
    here has exactly one entry appended to the replacement list, so that
    groups and replacements can later be walked together.

    Example:
        >>> inv = Inventory.from_file("tests/data/frenchipa.csv")
        >>> cells = ("prs.1.sg", "prs.2.pl")
        >>> forms = (Form("a m E n"), Form("a m Ø n E"))
        >>> p = Pattern.from_forms(cells, forms, inv)
        >>> p
        E_ ⇌ Ø_E / am_n_ <0>
        >>> p._repl # Calls _create_regex if needed
        {'prs.1.sg': [None, 'E', None, ''], 'prs.2.pl': [None, 'Ø', None, 'E']}
        >>> p._regex # Calls _create_regex if needed
        {'prs.1.sg': re.compile('^((?:a )(?:m ))((?:E ))((?:n ))()$'), 'prs.2.pl': re.compile('^((?:a )(?:m ))((?:Ø ))((?:n ))((?:E ))$')}

    Arguments:
        inv (Inventory): sound inventory.

    Returns:
        tuple: a dict of cell to compiled regex, and a dict of cell to list
        of replacements (None, str, or tuple).
    """
    c1, c2 = self.cells

    # Build alternation as list of zipped segments / transformations
    alternances = []
    for left, right in zip(self.alternation[c1], self.alternation[c2]):
        alternances.append(
            list(zip_longest(_iter_alternation(left, inv),
                             _iter_alternation(right, inv),
                             fillvalue=(False, ""))))

    regex = {c1: "", c2: ""}
    repl = {c1: [], c2: []}

    for i, group in enumerate(self.context):
        # Context is shared by both cells and is never replaced (None).
        # NOTE(review): presumably to_str(mode=0) yields a regex fragment with a
        # "{}" placeholder for the blank, emptied here -- confirm in Context.
        c = group.to_str(inv, mode=0).format("")
        regex[c1] += c
        regex[c2] += c
        repl[c1].append(None)
        repl[c2].append(None)
        if group.blank:
            # alternation
            # We build one regex group for each continuous sequence of segments and each transformation
            # NOTE(review): indexing by i assumes the i-th context group lines up
            # with the i-th alternation -- confirm in Context.
            for (is_segments_1, chars_1), (is_segments_2, chars_2) in alternances[i]:
                if is_segments_1 or is_segments_2:
                    # Substitution replacement: pass directly the target segments
                    # (this is a string, possibly empty)
                    repl[c1].append(" ".join(chars_1))
                    repl[c2].append(" ".join(chars_2))
                    # Regex matches these segments as one
                    regex[c1] += "({})".format("".join(inv.regex(x) if x else "" for x in chars_1))
                    regex[c2] += "({})".format("".join(inv.regex(x) if x else "" for x in chars_2))
                else:
                    # Transformation replacement (this is a tuple):
                    # each cell maps the other cell's class onto its own
                    repl[c1].append((chars_2, chars_1))
                    repl[c2].append((chars_1, chars_2))
                    # Regex matches these segments as one group
                    regex[c1] += "({})".format(_regex_or(chars_1))
                    regex[c2] += "({})".format(_regex_or(chars_2))
    return {c: re.compile("^" + regex[c] + "$") for c in regex}, repl
def _find_generalized_alt(self, inv):
"""See if the alternation can be generalized using phonological operations."""
c1, c2 = self.cells
this_alt = {c1: [], c2: []}
gen_any = False
for left, right in zip(self.alternation[c1], self.alternation[c2]):
gen_left = []
gen_right = []
for a, b in zip_longest(left, right, fillvalue=""):
if a != "" and b != "":
A, B = inv.transformation(a, b)
else:
A, B = "", ""
if len(A) > 1 or len(B) > 1:
gen_any = True
gen_left.append(A)
gen_right.append(B)
else:
gen_left.append(a)
gen_right.append(b)
this_alt[c1].append(tuple(gen_left))
this_alt[c2].append(tuple(gen_right))
if gen_any:
self._gen_alt = dict(zip(self.cells, (tuple(this_alt[x]) for x in self.cells)))
self._regex, self._repl = self._create_regex(inv)
else:
self._gen_alt = None
[docs]
def applicable(self, form, cell):
"""Test if this pattern matches a form, i.e. if the pattern is applicable to the form.
Arguments:
form (str): a form.
cell (str): A cell contained in self.cells.
Returns:
`bool`: whether the pattern is applicable to the form from that cell.
"""
try:
regex = self._regex[cell]
return bool(regex.match(form))
except KeyError as err:
raise KeyError("Unknown cell {}."
" This pattern's cells are {}."
"".format(err, " and ".join(self.cells)))
[docs]
def apply(self, form, names, inv, raiseOnFail=True):
"""Apply the pattern to a form.
Arguments:
form : a form, assumed to belong to the cell `names[0]`.
names :
apply to a form of cell `names[0]`
to produce a form of cell `names[1]` (default:`self.cells`).
Patterns being non-oriented, it is better to use the names argument.
inv (segments.Inventory): sound inventory
raiseOnFail (bool):
defaults to True. If true, raise an error when the pattern is not applicable to the form.
If False, return None instead.
Returns:
form belonging the opposite cell.
"""
from_cell, to_cell = names if names else self.cells
reg = self._regex[from_cell]
string, nb_subs = reg.subn(lambda x: _replace_alternation(x.groups(""), self._repl[to_cell], inv), form)
if nb_subs == 0 and (not self.applicable(form, from_cell)):
if raiseOnFail:
raise NotApplicable("The context {} from the pattern {} and cells {} -> {}"
"doesn't match the form \"{}\""
"".format(self._regex[from_cell].pattern, self, from_cell, to_cell, form))
else:
return None
return Form(string)
def _is_max_gen(self, inv):
maxi_seg = inv._max
return all([x in [(maxi_seg, kleenestar), "{}"] for x in self.context])
def _format_alt(self, inv, features=True):
"""Get formatted alternating material for each cell."""
def format_as_chars(left, right):
return ("{{{}}}".format(",".join(sorted(left))),
"{{{}}}".format(",".join(sorted(right))))
def format_as_features(left, right):
feats_left, feats_right = inv.get_transform_features(left, right)
feats_left = "[{}]".format(" ".join(sorted(feats_left)))
feats_right = "[{}]".format(" ".join(sorted(feats_right)))
chars_left, chars_right = format_as_chars(left, right)
if len(feats_left) + len(feats_right) <= len(chars_left) + len(chars_right):
return feats_left, feats_right
return chars_left, chars_right
if features:
format_regular_change = format_as_features
else:
format_regular_change = format_as_chars
c1, c2 = self.cells
alternation = zip(self.alternation[c1], self.alternation[c2])
c1_alt = []
c2_alt = []
for left, right in alternation:
formatted_left = ""
formatted_right = ""
for seg_left, seg_right in zip_longest(left, right, fillvalue=""):
if inv.is_leaf(seg_left) and inv.is_leaf(seg_right):
formatted_left += seg_left
formatted_right += seg_right
else:
l, r = format_regular_change(seg_left, seg_right)
formatted_left += l
formatted_right += r
c1_alt.append(formatted_left)
c2_alt.append(formatted_right)
return c1_alt, c2_alt