# Source code for qumin.representations.segments

# -*- coding: utf-8 -*-
# !/usr/bin/env python
"""author: Sacha Beniamine.

This module addresses the modeling of phonological segments.
"""
import functools
import logging
import re
from itertools import combinations

import numpy as np
import pandas as pd
from concepts import Context
from tqdm import tqdm

# Shared logger for the whole Qumin package.
log = logging.getLogger("Qumin")

# Standard abbreviations for full-length feature names.
_to_short_feature = {
    "anterior": "ant",
    "approximant": "appr",
    "back": "back",
    "click": "click",
    "consonantal": "C",
    "constr gl": "cgl",
    "constricted": "constr",
    "constricted glottis": "cgl",
    "continuant": "cont",
    "coronal": "coro",
    "delayed release": "del.rel",
    "distributed": "dist",
    "dorsal": "dors",
    "front": "front",
    "high": "high",
    "labial": "lab",
    "laryngeal": "laryng",
    "lateral": "lat",
    "long": "long",
    "low": "low",
    "nasal": "nas",
    "pharyngeal": "phar",
    "place": "place",
    "preaspirated": "preasp",
    "preglottalized": "pregl",
    "prenasal": "prenas",
    "round": "round",
    "sibilant": "sib",
    "sonorant": "son",
    "spread": "spread",
    "spread gl": "spre.gl",
    "spread glottis": "sg",
    "strident": "stri",
    "syllabic": "syll",
    "tap": "tap",
    "tense": "tens",
    "voice": "voic",
    "mid": "mid",
    "central": "centr",
    "compact": "compact",
    "diffuse": "diff",
    "abrupt": "abrupt",
    "checked": "check",
    "grave": "grave",
    "acute": "acute",
    "medial": "med",
    "flat": "flat",
    "sharp": "sharp",
    "trill": "tril",
    "labiodental": "labdent",
}

# All standard abbreviations, in the insertion order of the mapping above
# (duplicates such as "cgl" are kept).
_short_features = list(_to_short_feature.values())


def sound_lattice_context(dataframe):
    """Build a formal context from a sound / feature incidence table.

    Args:
        dataframe (:class:`pandas:pandas.DataFrame`):
            A dataframe of sound / features incidence, where

            * ``"-1"`` means unapplicable (such cells are ignored),
            * ``"1"`` means +feature,
            * ``"0"`` means -feature.

    Returns:
        concepts.Context: the created Context
    """

    def _signed_name(column):
        # Dummy columns are named "<feature>=<value>".
        feature, raw_value = column.split("=")
        value = int(float(raw_value))
        if value >= 2:
            # Not a binary value: the dummy column name is kept untouched.
            return column
        return ["-", "+"][value] + feature.replace(" ", "_")

    # Unapplicable cells become missing values, so that get_dummies
    # creates no column for them.
    applicable = dataframe.map(lambda cell: None if cell == "-1" else cell)
    incidence = pd.get_dummies(applicable, prefix_sep="=")
    # The csv serialization read by Context.fromstring marks incidence with "X".
    incidence = incidence.map(lambda cell: "X" if cell == 1 else "")
    incidence.columns = [_signed_name(column) for column in incidence.columns]
    return Context.fromstring(incidence.to_csv(), frmat='csv')
def _regex_or(sounds, sep=" "):
    """Return a non-capturing regex alternation over ``sounds``.

    Sounds are sorted, and each alternative is immediately followed by ``sep``.
    Sounds are inserted verbatim: no regex escaping is performed.

    Args:
        sounds: iterable of sound strings
        sep (str): string appended to each sound. Defaults to a space.

    Returns:
        (str): regex string
    """
    alternatives = [sound + sep for sound in sorted(sounds)]
    return "(?:{})".format("|".join(alternatives))
class Form(str):
    """A form is a string of sounds, separated by spaces.

    If a form is provided as defective, this information is still stored
    as a Form object with empty content. Defectiveness can be tested with:

    >>> inv = Inventory.from_file("tests/data/frenchipa.csv")
    >>> Form('').is_defective()
    True

    By default, we segment by cutting on spaces. If resegment=True (see
    :meth:`from_raw`), we remove spaces, and segment using the sound
    inventory's list of valid phonemes. Sounds might be more than one
    character long.

    Forms are strings, they are segmented at the object creation.

    Attributes:
        tokens (Tuple): Tuple of phonemes contained in this form.
            For defective entries, tokens are an empty tuple.
        id (str): form_id of the corresponding form according to the Paralex package.
            If unknown, `None` will be assigned.
    """

    def __new__(cls, contents, form_id=None):
        # The underlying string of a non-empty form always ends with a space,
        # so that every sound is followed by a separator.
        needs_separator = contents != "" and not contents.endswith(" ")
        return str.__new__(cls, (contents + " ") if needs_separator else contents)

    def __init__(self, string, form_id=None):
        """The constructor assumes everything is already clean and normalized."""
        self.id = form_id
        # filter(None, ...) drops the empty strings produced by split.
        self.tokens = tuple(filter(None, string.split(" ")))

    @classmethod
    def from_raw(cls, string, inventory, form_id=None, resegment=False):
        """Use inventory to build a cleaned and normalized Form.

        Args:
            string: raw string for this form
            inventory (segments.Inventory): sound inventory
            form_id: form identifier
            resegment (bool): defaults to False. Whether to re-segment phoneme tokens.

        Returns:
            a formatted Form

        Raises:
            ValueError: if the normalized form is not entirely made of
                sounds recognized by the inventory.
        """
        if string == "":
            # Defective form: nothing to segment.
            return cls("", form_id)
        segmented = inventory.segment_form(string, resegment=resegment)
        normalized = tuple(inventory._normalization.get(tok, tok) for tok in segmented)
        if inventory._legal_str.fullmatch("".join(normalized)) is None:
            raise ValueError("Unknown sound in: " + repr(string))
        return cls(" ".join(normalized), form_id)

    def __getnewargs__(self):
        # Arguments handed back to __new__ when the object is re-created
        # (e.g. when unpickling).
        return " ".join(self.tokens), self.id

    def is_defective(self):
        """Return whether this form is defective (has empty content)."""
        return self == ''

    def __repr__(self):
        segmented = " ".join(self.tokens) if self else "#DEF#"
        if getattr(self, "id", None):
            return f"Form({segmented}, id='{self.id}')"
        return f"Form({segmented})"

    def __str__(self):
        if not self:
            return '#DEF#'
        return "".join(tok.strip() for tok in self.tokens)
class Inventory(object):
    """The static `segments.Inventory` class describes a sound inventory.

    >>> inv = Inventory.from_file("tests/data/frenchipa.csv")

    Each sound class in the inventory is a concept in a FCA lattice.
    Sound class identifiers are either strings (for phonemes) or frozensets
    (for sound classes). Phonemes are the leaves of the hierarchy.
    Sound classes can be seen as under-determined phonemes, and both phonemes
    and sound classes are handled in the same way.
    For this reason, we call both "sound".

    Attributes:
        context: the FCA context underlying the feature space
        _score_matrix (dict): a dictionary of sound tuples to alignment score
        _gap_score (float): a score for insertions
        _normalization (dict): a dictionary of sounds to their normalized counterparts
        _segmenter (re.Pattern): a compiled regex to segment words into phonemes
        _legal_str (re.Pattern): a compiled regex to recognize words made of known phonemes
        _max (frozenset): the identifier of the supremum in the lattice
        _regexes (dict): a dictionary of sound IDs to regex strings
        _pretty_str (dict): a dictionary of sound IDs to pretty formatted strings
        _features (dict): a dictionary of sound IDs to set of features
        _features_str (dict): a dictionary of sound IDs to a string representing features
        _classes (dict): a dictionary of sound IDs to a list of classes (ancestors)
    """

    def __init__(self, table, shorthands_table, normalization):
        """Build the inventory from the output of :meth:`read_sounds_file`.

        Args:
            table (pd.DataFrame): normalized sound table
            shorthands_table (pd.DataFrame): shorthand definitions, or None
            normalization (dict): sounds to their normalized counterparts
        """
        self._gap_score = None
        self._score_matrix = {}
        self._normalization = normalization
        self.context = sound_lattice_context(table)
        shorthands = Inventory.calc_shorthand(self.context.lattice, shorthands_table)

        log.info('Building classes of segments...')
        self._regexes = {}
        self._pretty_str = {}
        self._classes = {}
        self._features = {}
        self._features_str = {}
        for extent, intent in tqdm(self.context.lattice):
            # Concepts with an empty extent describe no sound at all.
            if extent:
                Inventory._add_segment(self, extent, intent, shorthands)

        self.check_validity(self.context.lattice, table)
        self._max = max(self._classes, key=len)

        simple_sounds = [s for s in self._classes if self.is_leaf(s)]
        # Longest sounds first, so that the alternation prefers "tʃ" over "t".
        all_sounds = sorted(simple_sounds + list(self._normalization), key=len, reverse=True)
        # Sound names are literal strings: escape them so that names holding
        # regex metacharacters (e.g. "?", "." or "\") are matched verbatim.
        alternatives = "|".join(re.escape(sound) for sound in all_sounds)
        self._segmenter = re.compile("(" + alternatives + ")")
        self._legal_str = re.compile("(" + alternatives + ")+")

    @classmethod
    def from_file(cls, filename):
        """Initialize the inventory from a sounds file.

        Args:
            filename: path to a csv or tsv file with distinctive features

        Returns:
            Inventory: the inventory described by the file.
        """
        table, shorthands_table, normalization = cls.read_sounds_file(filename)
        return cls(table, shorthands_table, normalization)

    def check_validity(self, lattice, table):
        """Check validity of this sound inventory for Qumin.

        Identifies when some segments are ancestors of others
        (some segments only differ from others through underspecification)

        Args:
            lattice: concept lattice
            table: table of sounds

        Raises:
            Exception: if the lattice node of at least one sound
                also contains other sounds.
        """
        not_actual_leaves = []
        for leaf in table.index:
            lattice_node = frozenset(lattice[(leaf,)].extent)
            if len(lattice_node) > 1:
                not_actual_leaves.append((leaf, lattice_node))

        if not not_actual_leaves:
            return

        alert = []
        for leaf, lattice_node in not_actual_leaves:
            alert.append("\n\t" + leaf + " is the same node as " + str(lattice_node))
            alert.append("\n\t\t" + self.infos(lattice_node))
            for other in sorted(set(lattice_node) - {leaf}):
                # The descriptor must be an iterable of sounds: passing the
                # bare string would look up each of its characters instead.
                alert.append("\n\t\t" + self.infos(self.get((other,))))
        raise Exception("Warning, some segments are ancestors of other segments:" + "".join(alert))

    @classmethod
    def calc_shorthand(cls, lattice, shorthands):
        """Calculate shorthand names for some lattice nodes.

        Ex: ##C## in the sounds table might be a shorthand "C" for all consonants.

        Args:
            lattice: concept lattice
            shorthands: table or shorthand definitions

        Returns:
            a dictionary of intents to their shorthand names
        """
        if shorthands is None:
            return {}

        shorthand_context = sound_lattice_context(shorthands)
        # Shorthands which have not been attributed to an intent yet.
        stack = shorthands.index.tolist()
        names = {}
        for extent, intent in shorthand_context.lattice:
            full_intent = lattice[intent].intent
            for sh in extent:
                if sh in stack:
                    shorthand_name = sh.strip("#")
                    if shorthand_name:
                        names[full_intent] = shorthand_name
                    stack.remove(sh)
            if not stack:
                break
        return names

    @classmethod
    def read_sounds_file(cls, filename):
        """Read a sound file from file.

        Args:
            filename: path to the file

        Returns:
            (table, shorthands, normalization)
            table (pd.DataFrame): normalized sound table
            shorthands (pd.DataFrame): table holding shorthands for some concepts
                (None if the file defines no shorthand)
            normalization (dict): dictionary of normalizations for identical rows.

        Raises:
            ValueError: if the ``sound_id`` column is missing, if a sound has
                no name or an illegal name, or if no usable sound remains.
        """
        log.info("Reading table %s", filename)
        table = pd.read_table(filename, header=0, dtype=str,
                              index_col=False, sep=',', encoding="utf-8")
        sound_id = "sound_id"
        if sound_id not in table.columns:
            raise ValueError("Paralex sound tables must have a sound_id column.")

        legacy_cols = ["value", "UNICODE", "ALIAS", "Seg."]  # Legacy columns
        unused_cols = ["label", "tier", "CLTS_id"]  # Unused Paralex columns
        deprecated_cols = table.columns.intersection(legacy_cols)
        if not deprecated_cols.empty:
            log.warning("Usage of columns %s is deprecated. Edit your sounds file !",
                        ", ".join(deprecated_cols))
        table = table.drop(columns=[col for col in legacy_cols + unused_cols
                                    if col in table.columns])

        shorten_feature_names(table)

        # Missing names must be filled *before* the string conversion:
        # otherwise they may be turned into the literal string "nan" and
        # escape the legality check below.
        table[sound_id] = table[sound_id].fillna("").astype(str)
        # Missing feature values mean "unapplicable".
        table = table.fillna({col: "-1" for col in table.columns if col != sound_id})

        # Checking segments names legality
        for seg in table[sound_id]:
            if seg == "":
                raise ValueError("One of your segments doesn't have a name !")
            if seg.strip("#") == "":
                raise ValueError("The symbol \"#\" is reserved and can only "
                                 "be used in a shorthand name (#V# for a vowel, etc)")

        # Separate shorthand table
        shorthand_selection = table[sound_id].str.match("^#.+#$")
        shorthands = None
        if shorthand_selection.any():
            # Non in-place operations: these frames are slices of `table`.
            shorthands = table[shorthand_selection].set_index(sound_id).map(str)
            table = table[~shorthand_selection]
        table = table.set_index(sound_id)

        if table.empty:
            raise ValueError('It seems that the paralex sounds table '
                             'has no usable sound definitions for Qumin.')

        log.info("Normalizing identical rows")
        attributes = list(table.columns)
        # Adds a "Normalized" column to the table, in place.
        normalization = normalize(table, attributes)
        table = table.set_index("Normalized").drop_duplicates()
        log.debug("Normalization map: %s", normalization)

        table = table.map(str)
        return table, shorthands, normalization

    def _add_segment(self, extent, intent, shorthands):
        """Adds a single lattice concept to the inventory.

        A concept is a sound class. If there is a single sound in the class,
        the node represents that specific phoneme.

        Args:
            extent: list of phonemes in the sound class
            intent: list of features for this concept
            shorthands: dictionary of shorthand names
        """
        # Define the shortest expression of this segment if possible
        shorthand = shorthands.get(intent, None)
        if len(intent) == 0:
            shorthand = "X"
        elif shorthand is None and len(extent) > 1:
            minimals = next(self.context.lattice[extent].attributes())
            if minimals:
                shorthand = "[{}]".format(" ".join(minimals))

        concept = self.context.lattice[extent]
        ancestors = ["|".join(sorted(c.extent)) for c in concept.upset()]

        # Phonemes are identified by their name, sound classes by a frozenset.
        seg_id = frozenset(extent) if len(extent) > 1 else extent[0]
        ordered = sorted(extent)

        self._regexes[seg_id] = _regex_or(ordered, sep=" ")
        if len(extent) == 1:
            self._pretty_str[seg_id] = seg_id
        else:
            self._pretty_str[seg_id] = "{" + ",".join(ordered) + "}"
        self._classes[seg_id] = set(ancestors)
        self._features[seg_id] = set(intent)
        self._features_str[seg_id] = shorthand or "[{}]".format(" ".join(sorted(intent)))

    def regex(self, sound):
        """Returns a regex representing a sound.

        Args:
            sound: identifier of a sound

        Returns:
            (str): regex string
        """
        return self._regexes[sound]

    def pretty_str(self, sound, **kwargs):
        """Returns a pretty string representing a sound.

        Args:
            sound: identifier of a sound

        Returns:
            (str): pretty string
        """
        return self._pretty_str[sound]

    def features(self, sound, **kwargs):
        """Returns a set of features representing a sound.

        Args:
            sound: identifier of a sound

        Returns:
            (set): features
        """
        return self._features[sound]

    def features_str(self, sound, **kwargs):
        """Returns a string which describes the features of a sound.

        Args:
            sound: identifier of a sound

        Returns:
            (str): features string
        """
        return self._features_str[sound]

    def shortest(self, sound, **kwargs):
        """Returns a string which describes the sound in as few characters as possible.

        Args:
            sound: identifier of a sound

        Returns:
            (str): short string
        """
        return min((self.pretty_str(sound), self.features_str(sound)), key=len)

    @staticmethod
    def is_leaf(sound):
        """Returns whether this sound is a leaf (a phoneme, rather than a sound class)

        Args:
            sound: identifier of a sound

        Returns:
            (bool): True if the identifier is a string.
        """
        return isinstance(sound, str)

    def infos(self, sound):
        """String giving all useful information on a sound.

        Args:
            sound: identifier of a sound

        Returns:
            pretty string and features of a sound.
        """
        return self.pretty_str(sound) + " = " + self._features_str[sound]

    def inf(self, a, b):
        """Checks if a is a descendant of b.

        a < b iff b has children and either a is a string which is part of b,
        or a is a subset of b.
        """
        return (not self.is_leaf(b)) and ((a in b) or (not self.is_leaf(a) and a < b))

    def similarity(self, a, b):
        """Computes phonological similarity (Frisch, 2004)

        Measure from "Similarity avoidance and the OCP" , Frisch, S. A.;
        Pierrehumbert, J. B. & Broe, M. B.
        *Natural Language & Linguistic Theory*, Springer, 2004, 22, 179-228, p. 198.

            We compute similarity by comparing the number of shared and unshared
            natural classes of two consonants, using the equation in (7).
            This equation is a direct extension of the Pierrehumbert (1993)
            feature similarity metric to the case of natural classes.

            (7) :math:`Similarity = \\frac{\\text{Shared natural classes}}{\\text{Shared natural classes } + \\text{Non-shared natural classes}}`
        """
        if a == b:
            return 1
        ca = self._classes[a]
        cb = self._classes[b]
        return len(ca & cb) / len(ca | cb)

    def init_dissimilarity_matrix(self, gap_prop=0.5, **kwargs):
        """Computes score matrix with dissimilarity scores.

        The gap score is set to the median substitution cost between two
        distinct phonemes, multiplied by ``gap_prop``.

        Args:
            gap_prop (float): proportion of the median cost used as gap score.
        """
        # TODO: should this be delegated to morphalign ?
        # TODO: should this code all on integers ?
        costs = []
        simple_sounds = [s for s in self._classes if self.is_leaf(s)]
        for a, b in combinations(simple_sounds, 2):
            cost = 1 - self.similarity(a, b)
            self._score_matrix[(a, b)] = self._score_matrix[(b, a)] = cost
            costs.append(cost)
        self._gap_score = np.quantile(np.array(costs), 0.5) * gap_prop
        for a in simple_sounds:
            self._score_matrix[(a, a)] = 0

    def insert_cost(self, *_):
        """Returns the constant insertion/deletion cost"""
        return self._gap_score

    def sub_cost(self, a, b):
        """Returns the cost of aligning sounds `a` and `b`

        Args:
            a: sound identifier
            b: sound identifier

        Returns:
            (float): substitution cost
        """
        return self._score_matrix[(a, b)]

    # NOTE(review): lru_cache on a method keys on `self` and keeps every
    # instance alive as long as it is cached. Kept as is, since descriptors
    # must be hashable either way and callers may rely on the cache.
    @functools.lru_cache(maxsize=128)
    def get(self, descriptor):
        """Get a sound using the lattice.

        Args:
            descriptor: (hashable) iterable of phonemes OR iterable of features

        Returns:
            (str or frozenset) sound identifier

        Raises:
            ValueError: if the descriptor is unknown to the lattice.
        """
        try:
            extent = self.context.lattice[descriptor].extent
        except KeyError as err:
            raise ValueError("Unknown sound descriptor: " + repr(descriptor)) from err
        if len(extent) == 1:
            return extent[0]
        return frozenset(extent)

    def meet(self, *args):
        """Finds the lowest common ancestors of segments from their identifiers.

        Args:
            several sound identifiers

        Returns:
            lowest common ancestor identifier
        """
        segments = set()
        for segment in args:
            if self.is_leaf(segment):
                segments.add(segment)
            else:
                segments |= segment
        return self.get(frozenset(segments))

    @functools.lru_cache(maxsize=128)
    def transformation(self, a, b):
        """Find a transformation between a and b.

        The transformation is a pair of two maximal sets of segments
        related by a bijective phonological function.

        This function takes a pair of sound identifiers and
        calculates the function which relates these two segments.
        It then finds and returns the two maximal sets of segments
        related by this function.

        Example:
            In French, t -> s can be expressed by a phonological function
            which changes [-cont] and [-rel. ret] to [+cont] and [+rel. ret]

            These other segments are related by the same change:
            d -> z
            b -> v
            p -> f

            >>> inv = Inventory.from_file("tests/data/frenchipa.csv")
            >>> a,b = inv.transformation("t","s")
            >>> a == frozenset({'d', 't', 'b', 'p'})
            True
            >>> b == frozenset({'s', 'z', 'f', 'v'})
            True

        Arguments:
            a,b (str): Segment identifiers.

        Returns:
            tuple of frozenset: two sets of sounds.
        """

        def select_if_reciprocal(inst, segs, left, right):
            # Keep the sounds which map to a phoneme when swapping `left`
            # for `right`, and map back to themselves with the reverse swap.
            selected = []
            for x in inst.id_to_frozenset(segs):
                y = inst.get(frozenset((inst.features(x) - left) | right))
                if y and inst.is_leaf(y):
                    x_back = inst.get(frozenset((inst.features(y) - right) | left))
                    if x == x_back:
                        selected.append(x)
            return frozenset(selected)

        # TODO: warning: this need not be a lattice node
        left, right = self.get_transform_features(a, b)
        A, B = self.get(left), self.get(right)
        A = select_if_reciprocal(self, A, left, right)
        B = select_if_reciprocal(self, B, right, left)
        return A, B

    def id_to_frozenset(self, sound_id):
        """Return the frozenset of phonemes covered by a sound identifier."""
        if self.is_leaf(sound_id):
            return frozenset({sound_id})
        return frozenset(sound_id)

    def get_transform_features(self, left, right):
        """Get the features corresponding to a transformation.

        Arguments:
            left (frozenset): set of phonemes
            right (frozenset): set of phonemes

        Returns:
            tuple of frozenset: features specific to `left`,
            and features specific to `right`.

        Example:
            >>> inv = Inventory.from_file("tests/data/frenchipa.csv")
            >>> inv.get_transform_features({"b","d"}, {"p","t"})
            (frozenset({'+voi'}), frozenset({'-voi'}))
        """
        t1 = self.features(self.get(self.id_to_frozenset(left)))
        t2 = self.features(self.get(self.id_to_frozenset(right)))
        return frozenset(t1 - t2), frozenset(t2 - t1)

    def get_from_transform(self, a, transform):
        """Get a segment from another according to a transformation tuple.

        Arguments:
            a (str): Segment alias
            transform (tuple): Couple of two segment IDs

        Example:
            >>> inv = Inventory.from_file("tests/data/frenchipa.csv")
            >>> inv.get_from_transform("d",
            ...                          (frozenset({"d","t"}),
            ...                           frozenset({"s","z"})))
            'z'
        """
        a = self.features(a)
        f1, f2 = self.get_transform_features(*transform)
        return self.get(frozenset((a - f1) | f2))

    def show_pool(self):
        """Return a string description of the whole segment pool."""
        return "\n".join(self.infos(seg) for seg in sorted(self._classes, key=len))

    def segment_form(self, wordform, resegment=False):
        """Segment a form into phonemes (either following spaces, or using sound inventory)

        Args:
            wordform (str): phonemic form
            resegment (bool): Whether to ignore spaces in phon forms
                and re-compute phonemic segmentation

        Returns:
            list of phonemes

        Raises:
            ValueError: if the form holds no space, is more than one character
                long, and is not a single known sound (unless resegment is True).
        """
        wordform = wordform.strip(" ")
        if wordform == "":
            return []
        if len(wordform) == 1:
            return [wordform]

        if resegment:
            excluded = {"", " "}
            joined = wordform.replace(" ", "")
            return [s for s in self._segmenter.split(joined) if s not in excluded]

        # If there's no space, check that the word is only 1 phoneme long.
        # Sounds awaiting normalization are valid phonemes too, although
        # they are not objects of the lattice.
        if " " not in wordform and wordform not in self._normalization:
            try:
                self.get((wordform,))  # check that this entire wordform is a valid phoneme
            except ValueError as err:
                raise ValueError(f"Forms are not space separated, eg. {wordform}, "
                                 f"please pass resegment=True or provide Paralex-compliant segmented forms.") from err
        return [phon for phon in wordform.split(" ") if phon]
def normalize(ipa, features):
    """Assign a normalized segment to groups of segments with identical rows.

    This function takes a segments table and adds **in place** a "Normalized"
    column. This column contains a common value for each segment with
    identical feature values: the name of the first segment of the group.
    The function also returns a translation table mapping indexes
    to normalized segments.

    ============ ============ ==============
    Index        ..features..  Normalized
    ============ ============ ==============
    ɛ            [...]         E
    e            [...]         E
    ============ ============ ==============

    Arguments:
        ipa (:class:`pandas:pandas.DataFrame`): Dataframe of segments.
            Columns are features, indexes are segments.
        features (list): Feature columns' names.

    Returns:
        dict: translation table from the segment's name to its normalized name
        (segments which are their own normalized form are left out).

    Raises:
        ValueError: if a segment is defined by several rows.
    """
    # The feature columns never change below: select them only once.
    feature_table = ipa[features]

    def find_identical_rows(segment):
        seg_features = feature_table.loc[segment, :]
        # A label shared by several rows selects a 2-dimensional table
        # rather than a single row.
        if seg_features.ndim > 1:
            raise ValueError("You have multiple definitions for {}\n{}".format(segment, seg_features))
        return (feature_table == seg_features).all(axis=1)

    ipa["Normalized"] = ""
    for segment in feature_table.drop_duplicates().index:
        same_features_as_seg = find_identical_rows(segment)
        # Only name groups which have not been normalized yet.
        if (ipa.loc[same_features_as_seg, "Normalized"] == "").all():
            ipa.loc[same_features_as_seg, "Normalized"] = segment

    return {seg: norm for seg, norm in zip(ipa.index, ipa["Normalized"]) if seg != norm}
def shorten_feature_names(table):
    """Replace **in place** the feature names of a sounds table by short names.

    The ``sound_id`` column and names of at most three characters are kept
    as is. Standard feature names are replaced by their standard abbreviation
    (in upper case if the lowercased name is the standard one). Other names
    are abbreviated on the fly, avoiding the names which are already in use.

    Args:
        table (:class:`pandas:pandas.DataFrame`): sounds table,
            which columns are ``sound_id`` and feature names.

    Raises:
        ValueError: if the first row of the table looks like
            a second row of headers.
    """
    # An empty table has no first row to inspect.
    if len(table) > 0:
        headers = list(table.iloc[0])
        if "Seg." in headers or "sound_id" in headers:
            raise ValueError("Using a second row of headers is not supported anymore.")

    def is_kept(name):
        # Not a feature name, or already short enough.
        return name == "sound_id" or len(name) <= 3

    # Names kept verbatim are reserved from the start: an abbreviation made
    # on the fly must not duplicate a column which comes later in the table.
    kept_names = {name for name in table.columns if is_kept(name)}

    short_features_names = []
    for name in table.columns:
        if is_kept(name):
            short_features_names.append(name)
        elif name in _to_short_feature:  # Check standard names
            short_features_names.append(_to_short_feature[name])
        elif name.lower() in _to_short_feature:  # Uppercase
            short_features_names.append(_to_short_feature[name.lower()].upper())
        else:
            # Make an abbreviation on the fly: the shortest available prefix
            # of at least three characters.
            reserved_names = set(_short_features) | set(short_features_names) | kept_names
            prefixes = (name[:i] for i in range(3, len(name) + 1))
            new_name = next((p for p in prefixes if p not in reserved_names), None)
            if new_name is None:
                # Fallback strategy: append a unique integer
                key = 1
                new_name = name[:3] + str(key)
                while new_name in reserved_names:
                    key += 1
                    new_name = name[:3] + str(key)
            short_features_names.append(new_name)
    table.columns = short_features_names