# -*- coding: utf-8 -*-
# !/usr/bin/env python
"""author: Sacha Beniamine.
This module addresses the modelisation of phonological segments.
"""
import functools
import logging
import re
from itertools import combinations
import numpy as np
import pandas as pd
from concepts import Context
from tqdm import tqdm
log = logging.getLogger("Qumin")
_to_short_feature = {'anterior': 'ant', 'approximant': 'appr', 'back': 'back', 'click': 'click', 'consonantal': 'C',
'constr gl': 'cgl', 'constricted': 'constr', 'constricted glottis': 'cgl', 'continuant': 'cont',
'coronal': 'coro', 'delayed release': 'del.rel', 'distributed': 'dist', 'dorsal': 'dors',
'front': 'front', 'high': 'high', 'labial': 'lab', 'laryngeal': 'laryng', 'lateral': 'lat',
'long': 'long', 'low': 'low', 'nasal': 'nas', 'pharyngeal': 'phar', 'place': 'place',
'preaspirated': 'preasp', 'preglottalized': 'pregl', 'prenasal': 'prenas', 'round': 'round',
'sibilant': 'sib', 'sonorant': 'son', 'spread': 'spread', 'spread gl': 'spre.gl',
'spread glottis': 'sg', 'strident': 'stri', 'syllabic': 'syll', 'tap': 'tap', 'tense': 'tens',
'voice': 'voic', 'mid': 'mid', 'central': 'centr', 'compact': 'compact', 'diffuse': 'diff',
'abrupt': 'abrupt', 'checked': 'check', 'grave': 'grave', 'acute': 'acute', 'medial': 'med',
'flat': 'flat', 'sharp': 'sharp', 'trill': 'tril', 'labiodental': 'labdent'}
_short_features = [y for _, y in _to_short_feature.items()]
[docs]
def sound_lattice_context(dataframe):
    """ Create a Context from a dataframe of properties.

    Each ``column=value`` pair of the input becomes one attribute of the
    context. Values 0 and 1 are rendered as ``-feature`` / ``+feature``
    (spaces in the feature name are replaced by underscores); any other
    value keeps the raw ``feature=value`` label.

    Args:
        dataframe (:class:`pandas:pandas.DataFrame`): A dataframe of sound / features incidence.
            "-1" means unapplicable (the cell is ignored)
            1 means +feature
            0 means -feature

    Returns:
        concepts.Context: the created Context
    """
    signs = ("-", "+")

    def format_feature(column):
        # Only the last "=" separates the value: it is the separator added
        # by get_dummies, so a feature name may itself contain "=".
        key, val = column.rsplit("=", 1)
        value = int(float(val))
        # Guard against negative values: a bare `signs[value]` would
        # silently render e.g. -1 as "+" through negative indexing.
        if 0 <= value < 2:
            return signs[value] + key.replace(" ", "_")
        return column

    # Unapplicable cells become missing values, so that get_dummies
    # creates no attribute for them.
    dataframe = dataframe.map(lambda x: None if x == "-1" else x)
    dummies = pd.get_dummies(dataframe, prefix_sep="=")
    # The csv context format marks an incidence with "X".
    dummies = dummies.map(lambda x: "X" if x == 1 else "")
    dummies.columns = [format_feature(c) for c in dummies.columns]
    return Context.fromstring(dummies.to_csv(), frmat='csv')
def _regex_or(sounds, sep=" "):
return "(?:" + "|".join(x + sep for x in sorted(sounds)) + ")"
[docs]
class Inventory(object):
    """The static `segments.Inventory` class describes a sound inventory.

        >>> inv = Inventory.from_file("tests/data/frenchipa.csv")

    Each sound class in the inventory is a concept in a FCA lattice.
    Sound class identifiers are either strings (for phonemes)
    or frozensets (for sound classes). Phonemes are the leaves of the hierarchy.
    Sound classes can be seen as under-determined phonemes, and both phonemes and sound
    classes are handled in the same way. For this reason, we call both "sound".

    Attributes:
        context: the FCA context underlying the feature space
        _score_matrix (dict): a dictionary of sound tuples to alignment score
        _gap_score (float): a score for insertions
        _normalization (dict): a dictionary of sounds to their normalized counterparts
        _segmenter (re.Pattern): a compiled regex to segment words into phonemes
        _legal_str (re.Pattern): a compiled regex to recognize words made of known phonemes
        _max (frozenset): the identifier of the supremum in the lattice
        _regexes (dict): a dictionary of sound IDs to regex strings
        _pretty_str (dict): a dictionary of sound IDs to pretty formatted strings
        _features (dict): a dictionary of sound IDs to set of features
        _features_str (dict): a dictionary of sound IDs to a string representing features
        _classes (dict): a dictionary of sound IDs to a set of classes (ancestors)
    """

    def __init__(self, table, shorthands_table, normalization):
        """ Build the inventory from already parsed tables.

        Args:
            table (:class:`pandas:pandas.DataFrame`): sound / features incidence table,
                indexed by sound.
            shorthands_table: table of shorthand definitions, or None.
            normalization (dict): sounds to their normalized counterparts.
        """
        self._gap_score = None
        self._score_matrix = {}
        self._normalization = normalization
        self.context = sound_lattice_context(table)
        shorthands = Inventory.calc_shorthand(self.context.lattice, shorthands_table)

        log.info('Building classes of segments...')
        self._regexes = {}
        self._pretty_str = {}
        self._classes = {}
        self._features = {}
        self._features_str = {}
        for extent, intent in tqdm(self.context.lattice):
            if extent:  # A concept with an empty extent describes no sound.
                Inventory._add_segment(self, extent, intent, shorthands)

        self.check_validity(self.context.lattice, table)
        self._max = max(self._classes, key=len)

        simple_sounds = [s for s in self._classes if self.is_leaf(s)]
        # Longest names first, so that the alternation prefers the longest match.
        all_sounds = sorted(simple_sounds + list(self._normalization),
                            key=len, reverse=True)
        # Sound names are literals: escape them so that a name containing a
        # regex metacharacter (".", "|", "*", ...) cannot corrupt the pattern.
        alternatives = "|".join(re.escape(s) for s in all_sounds)
        self._segmenter = re.compile("(" + alternatives + ")")
        self._legal_str = re.compile("(" + alternatives + ")+")

    @classmethod
    def from_file(cls, filename):
        """ Initializes the inventory

        Args:
            filename: path to a comma-separated file with distinctive features
                (it is read by :meth:`read_sounds_file`).
        """
        table, shorthands_table, normalization = cls.read_sounds_file(filename)
        return cls(table, shorthands_table, normalization)

    def check_validity(self, lattice, table):
        """ Check validity of this sound inventory for Qumin.

        Identifies when some segments are ancestors of others
        (some segments only differ from others through underspecification)

        Args:
            lattice: concept lattice
            table: table of sounds

        Raises:
            ValueError: if the lattice node of some sound holds several sounds.
        """
        not_actual_leaves = []
        for leaf in table.index:
            lattice_node = frozenset(lattice[(leaf,)].extent)
            if len(lattice_node) > 1:
                not_actual_leaves.append((leaf, lattice_node))

        if not not_actual_leaves:
            return

        lines = []
        for leaf, lattice_node in not_actual_leaves:
            lines.append("\n\t" + leaf + " is the same node as " + str(lattice_node))
            lines.append("\n\t\t" + self.infos(lattice_node))
            for other in sorted(lattice_node - {leaf}):
                # Wrap the sound in a tuple, as done for `leaf` above: a bare
                # string would be read as an iterable of one-character sounds.
                lines.append("\n\t\t" + self.infos(self.get((other,))))
        raise ValueError("Warning, some segments are ancestors of other segments:"
                         + "".join(lines))

    @classmethod
    def calc_shorthand(cls, lattice, shorthands):
        """ Calculate shorthand names for some lattice nodes.

        Ex: ##C## in the sounds table might be a shorthand "C" for all consonants.

        Args:
            lattice: concept lattice
            shorthands: table of shorthand definitions, or None

        Returns:
            a dictionary of intents to their shorthand names
        """
        if shorthands is None:
            return {}

        shorthand_context = sound_lattice_context(shorthands)
        pending = set(shorthands.index)  # Shorthands which are not placed yet
        names = {}
        for extent, intent in shorthand_context.lattice:
            # Intent, in the sounds lattice, of the concept described by the shorthand's features
            full_intent = lattice[intent].intent
            for sh in extent:
                if sh in pending:
                    pending.discard(sh)
                    shorthand_name = sh.strip("#")
                    if shorthand_name:
                        names[full_intent] = shorthand_name
            if not pending:
                break
        return names

    @classmethod
    def read_sounds_file(cls, filename):
        """ Read a sound file from file.

        Args:
            filename: path to the file

        Returns: (table, shorthands, normalization)
            table (pd.DataFrame): normalized sound table
            shorthands (pd.DataFrame): table holding shorthands for some concepts,
                or None if the file defines no shorthand
            normalization (dict): dictionary of normalizations for identical rows.

        Raises:
            ValueError: if the sound_id column is missing, if a segment has no
                name or a name made only of "#", or if no sound definition remains.
        """
        log.info("Reading table %s", filename)
        table = pd.read_table(filename, header=0, dtype=str,
                              index_col=False, sep=',',
                              encoding="utf-8")
        sound_id = "sound_id"
        if sound_id not in table.columns:
            raise ValueError("Paralex sound tables must have a sound_id column.")

        legacy_cols = ["value", "UNICODE", "ALIAS", "Seg."]  # Legacy columns
        unused_cols = ["label", "tier", "CLTS_id"]  # Unused Paralex columns
        deprecated_cols = table.columns.intersection(legacy_cols)
        if not deprecated_cols.empty:
            log.warning("Usage of columns %s is deprecated. Edit your sounds file !",
                        ", ".join(deprecated_cols))
        table = table.drop(columns=[c for c in legacy_cols + unused_cols
                                    if c in table.columns])

        shorten_feature_names(table)

        # Fill missing values *before* casting names to str: casting first can
        # turn a missing name into the string "nan", which would then escape
        # the empty name check below.
        na_vals = {c: "-1" for c in table.columns}
        na_vals[sound_id] = ""
        table = table.fillna(na_vals)
        table[sound_id] = table[sound_id].astype(str)

        # Checking segments names legality
        for seg in table[sound_id]:
            if seg == "":
                raise ValueError("One of your segments doesn't have a name !")
            if seg.strip("#") == "":
                raise ValueError("The symbol \"#\" is reserved and can only "
                                 "be used in a shorthand name (#V# for a vowel, etc)")

        # Separate shorthand table
        shorthand_selection = table[sound_id].str.match("^#.+#$")
        shorthands = None
        if shorthand_selection.any():
            shorthands = table[shorthand_selection].set_index(sound_id).map(str)
            table = table[~shorthand_selection]

        # Not in place: this yields a fresh frame, so the "Normalized" column
        # added by `normalize` is never written to a slice of another frame.
        table = table.set_index(sound_id)
        if table.empty:
            raise ValueError('It seems that the paralex sounds table '
                             'has no usable sound definitions for Qumin.')

        log.info("Normalizing identical rows")
        attributes = list(table.columns)
        normalization = normalize(table, attributes)
        table = table.set_index("Normalized").drop_duplicates()
        log.debug("Normalization map: %s", normalization)

        table = table.map(str)
        return table, shorthands, normalization

    def _add_segment(self, extent, intent, shorthands):
        """ Adds a single lattice concept to the inventory.

        A concept is a sound class. If there is a single sound in the class,
        the node represents that specific phoneme.

        Args:
            extent: list of phonemes in the sound class
            intent: list of features for this concept
            shorthands: dictionary of shorthand names
        """
        concept = self.context.lattice[extent]

        # Define the shortest expression of this segment if possible
        shorthand = shorthands.get(intent, None)
        if len(intent) == 0:
            shorthand = "X"
        elif shorthand is None and len(extent) > 1:
            minimals = next(concept.attributes(), None)
            if minimals:
                shorthand = "[{}]".format(" ".join(minimals))

        seg_id = frozenset(extent) if len(extent) > 1 else extent[0]
        ordered = sorted(extent)

        self._regexes[seg_id] = _regex_or(ordered, sep=" ")
        if len(extent) == 1:
            self._pretty_str[seg_id] = seg_id
        else:
            self._pretty_str[seg_id] = "{" + ",".join(ordered) + "}"
        # Each class (the concept itself and its ancestors) is named by its sorted extent.
        self._classes[seg_id] = {"|".join(sorted(c.extent)) for c in concept.upset()}
        self._features[seg_id] = set(intent)
        self._features_str[seg_id] = shorthand or "[{}]".format(" ".join(sorted(intent)))

    def regex(self, sound):
        """ Returns a regex representing a sound.

        Args:
            sound: identifier of a sound

        Returns:
            (str): regex string
        """
        return self._regexes[sound]

    def pretty_str(self, sound, **kwargs):
        """ Returns a pretty string representing a sound.

        Args:
            sound: identifier of a sound

        Returns:
            (str): pretty string
        """
        return self._pretty_str[sound]

    def features(self, sound, **kwargs):
        """ Returns a set of features representing a sound.

        Args:
            sound: identifier of a sound

        Returns:
            (set): features
        """
        return self._features[sound]

    def features_str(self, sound, **kwargs):
        """ Returns a string which describes the features of a sound.

        Args:
            sound: identifier of a sound

        Returns:
            (str): features string
        """
        return self._features_str[sound]

    def shortest(self, sound, **kwargs):
        """ Returns a string which describes the sound in as few characters as possible.

        Args:
            sound: identifier of a sound

        Returns:
            (str): the shorter of the pretty string and the features string
            (the pretty string in case of a tie)
        """
        return min((self.pretty_str(sound), self.features_str(sound)), key=len)

    @staticmethod
    def is_leaf(sound):
        """ Returns whether this sound is a leaf (a phoneme, rather than a sound class)

        Args:
            sound: identifier of a sound

        Returns:
            (bool): True if the identifier is a string.
        """
        return isinstance(sound, str)

    def infos(self, sound):
        """ String giving all useful information on a sound.

        Args:
            sound: identifier of a sound

        Returns:
            pretty string and features of a sound.
        """
        return self.pretty_str(sound) + " = " + self._features_str[sound]

    def inf(self, a, b):
        """ Checks if a is a descendant of b.

        a < b iff b has children and either a is a string
        which is part of b, or a is a subset of b.
        """
        return (not self.is_leaf(b)) and ((a in b) or (not self.is_leaf(a) and a < b))

    def similarity(self, a, b):
        """Computes phonological similarity (Frisch, 2004)

        Measure from "Similarity avoidance and the OCP" , Frisch, S. A.; Pierrehumbert, J. B. & Broe,
        M. B. *Natural Language & Linguistic Theory*, Springer, 2004, 22, 179-228, p. 198.

            We compute similarity by comparing the number of shared and unshared natural classes
            of two consonants, using the equation in (7). This equation is a direct extension
            of the Pierrehumbert (1993) feature similarity metric to the case of natural classes.

            (7) :math:`Similarity = \\frac{\\text{Shared natural classes}}{\\text{Shared natural classes } + \\text{Non-shared natural classes}}`
        """
        if a == b:
            return 1
        ca = self._classes[a]
        cb = self._classes[b]
        return len(ca & cb) / len(ca | cb)

    def init_dissimilarity_matrix(self, gap_prop=0.5, **kwargs):
        """Computes score matrix with dissimilarity scores.

        The cost of substituting two distinct phonemes is one minus their
        similarity; the gap score is the median of these costs, times `gap_prop`.

        Args:
            gap_prop (float): proportion of the median substitution cost
                used as insertion/deletion cost.
        """
        # TODO: should this be delegated to morphalign ?
        # TODO: should this code all on integers ?
        costs = []
        simple_sounds = [s for s in self._classes if self.is_leaf(s)]
        for a, b in combinations(simple_sounds, 2):
            cost = 1 - self.similarity(a, b)
            self._score_matrix[(a, b)] = self._score_matrix[(b, a)] = cost
            costs.append(cost)
        self._gap_score = np.quantile(np.array(costs), 0.5) * gap_prop
        for a in simple_sounds:
            self._score_matrix[(a, a)] = 0

    def insert_cost(self, *_):
        """Returns the constant insertion/deletion cost"""
        return self._gap_score

    def sub_cost(self, a, b):
        """ Returns the cost of aligning sounds `a` and `b`

        Args:
            a: sound identifier
            b: sound identifier

        Returns: (float): substitution cost
        """
        return self._score_matrix[(a, b)]

    # NOTE(review): lru_cache on a method keys on `self` and keeps instances
    # alive as long as they are cached; kept as is to preserve the interface.
    @functools.lru_cache(maxsize=128)
    def get(self, descriptor):
        """ Get a sound using the lattice.

        Args:
            descriptor: iterable of phonemes OR iterable of features
                (it must be hashable, as results are cached)

        Returns: (str or frozenset) sound identifier

        Raises:
            ValueError: if the lattice does not know the descriptor.
        """
        try:
            s = self.context.lattice[descriptor].extent
        except KeyError as err:
            raise ValueError("Unknown sound descriptor: " + repr(descriptor)) from err
        if len(s) == 1:
            return s[0]
        return frozenset(s)

    def meet(self, *args):
        """Finds the lowest common ancestors of segments from their identifiers.

        Args: several sound identifiers

        Returns:
            lowest common ancestor identifier
        """
        segments = set()
        for segment in args:
            if self.is_leaf(segment):
                segments.add(segment)
            else:
                segments |= segment
        return self.get(frozenset(segments))

    def id_to_frozenset(self, sound_id):
        """ Returns the sound identifier as a frozenset of phonemes.

        Args:
            sound_id: identifier of a sound

        Returns:
            (frozenset): a singleton for a phoneme, the class members otherwise.
        """
        if self.is_leaf(sound_id):
            return frozenset({sound_id})
        return frozenset(sound_id)

    def show_pool(self):
        """Return a string description of the whole segment pool."""
        return "\n".join(self.infos(seg) for seg in sorted(self._classes, key=len))
[docs]
def normalize(ipa, features):
    """Assign a normalized segment to groups of segments with identical rows.

    This function takes a segments table
    and adds **in place** a "Normalized" column.
    This column contains a common value
    for each segment with identical boolean values.
    The function also returns a translation table
    mapping indexes to normalized segments.

    Note: the index are expected to be one char length.

    ============ ============ ==============
       Index     ..features..   Normalized
    ============ ============ ==============
         ɛ          [...]           E
         e          [...]           E
    ============ ============ ==============

    Arguments:
        ipa (:class:`pandas:pandas.DataFrame`):
            Dataframe of segments. Columns are features,
            UNICODE code point representation and segment names,
            indexes are segments.
        features (list): Feature columns' names.

    Returns:
        dict: translation table from
        the segment's name to its normalized name.

    Raises:
        ValueError: if a segment appears several times in the index.
    """
    ipa["Normalized"] = ""
    # The feature columns do not change during the loop: select them once
    # rather than copying them on every iteration.
    feature_table = ipa[features]

    def find_identical_rows(segment):
        seg_features = feature_table.loc[segment, :]
        # A duplicated index label selects several rows (a DataFrame) instead
        # of a single one (a Series): report it explicitly rather than relying
        # on the comparison below to fail.
        if isinstance(seg_features, pd.DataFrame):
            raise ValueError("You have multiple definitions for {}\n{}".format(segment,
                                                                               seg_features))
        return (feature_table == seg_features).all(axis=1)

    # The first segment of each group of identical rows names the whole group.
    for segment in feature_table.drop_duplicates().index:
        same_features_as_seg = find_identical_rows(segment)
        if (ipa.loc[same_features_as_seg, "Normalized"] == "").all():
            ipa.loc[same_features_as_seg, "Normalized"] = segment

    # Segments which are their own normalized form need no translation.
    return {seg: norm for seg, norm in zip(ipa.index, ipa["Normalized"])
            if seg != norm}
[docs]
def shorten_feature_names(table):
    """Replace **in place** the feature names in the columns of a sounds table by abbreviations.

    The "sound_id" column and names of three characters or fewer are kept.
    Standard feature names get their standard abbreviation (in upper case
    if the name only matches a standard name once lower-cased). Other names
    are abbreviated on the fly, avoiding the standard abbreviations and the
    names already attributed to previous columns.

    Arguments:
        table (:class:`pandas:pandas.DataFrame`): sounds table, as read from the file.

    Raises:
        ValueError: if the first row looks like a second row of headers.
    """
    # A table without rows has no first row to inspect. Skipping the check
    # avoids an IndexError, and leaves it to the caller to report emptiness.
    if len(table) > 0:
        headers = list(table.iloc[0])
        if "Seg." in headers or "sound_id" in headers:
            raise ValueError("Using a second row of headers is not supported anymore.")

    short_features_names = []
    for name in table.columns:
        if name == "sound_id" or len(name) <= 3:  # Not a feature name
            short_features_names.append(name)
        elif name in _to_short_feature:  # Check standard names
            short_features_names.append(_to_short_feature[name])
        elif name.lower() in _to_short_feature:  # Uppercase
            short_features_names.append(_to_short_feature[name.lower()].upper())
        else:
            # Make an abbreviation on the fly: the shortest prefix of at least
            # three characters which is not reserved yet.
            reserved_names = set(_short_features).union(short_features_names)
            prefixes = (name[:i] for i in range(3, len(name) + 1))
            new_name = next((p for p in prefixes if p not in reserved_names), None)
            if new_name is None:  # Fallback strategy: append a unique integer
                key = 1
                new_name = name[:3] + str(key)
                while new_name in reserved_names:
                    key += 1
                    new_name = name[:3] + str(key)
            short_features_names.append(new_name)
    table.columns = short_features_names