# Source code for qumin.representations.segments

# -*- coding: utf-8 -*-
# !/usr/bin/env python
"""author: Sacha Beniamine.

This module addresses the modelisation of phonological segments.
"""
import functools
import logging
import re
from itertools import combinations

import numpy as np
import pandas as pd
from concepts import Context
from tqdm import tqdm

log = logging.getLogger("Qumin")

# Standard feature names and the abbreviation used for each of them.
_to_short_feature = {'anterior': 'ant', 'approximant': 'appr', 'back': 'back', 'click': 'click', 'consonantal': 'C',
                     'constr gl': 'cgl', 'constricted': 'constr', 'constricted glottis': 'cgl', 'continuant': 'cont',
                     'coronal': 'coro', 'delayed release': 'del.rel', 'distributed': 'dist', 'dorsal': 'dors',
                     'front': 'front', 'high': 'high', 'labial': 'lab', 'laryngeal': 'laryng', 'lateral': 'lat',
                     'long': 'long', 'low': 'low', 'nasal': 'nas', 'pharyngeal': 'phar', 'place': 'place',
                     'preaspirated': 'preasp', 'preglottalized': 'pregl', 'prenasal': 'prenas', 'round': 'round',
                     'sibilant': 'sib', 'sonorant': 'son', 'spread': 'spread', 'spread gl': 'spre.gl',
                     'spread glottis': 'sg', 'strident': 'stri', 'syllabic': 'syll', 'tap': 'tap', 'tense': 'tens',
                     'voice': 'voic', 'mid': 'mid', 'central': 'centr', 'compact': 'compact', 'diffuse': 'diff',
                     'abrupt': 'abrupt', 'checked': 'check', 'grave': 'grave', 'acute': 'acute', 'medial': 'med',
                     'flat': 'flat', 'sharp': 'sharp', 'trill': 'tril', 'labiodental': 'labdent'}
# Every abbreviation of the mapping above, in the mapping's order (duplicates kept).
_short_features = list(_to_short_feature.values())



[docs]
def sound_lattice_context(dataframe):
    """ Build a formal Context out of a sounds / features table.

    Cells equal to the string "-1" are blanked out before the table is
    one-hot encoded, so they do not produce any property in the context.

    Args:
        dataframe (:class:`pandas:pandas.DataFrame`): A dataframe of sound / features incidence.
            -1 means not applicable
            1 means +feature
            0 means -feature

    Returns:
        concepts.Context: the created Context
    """

    def feature_formatter(columns):
        # Dummy columns are named "<feature>=<value>": values below 2 are
        # rendered as a signed feature, any other value keeps the raw label.
        for column in columns:
            key, val = column.split("=")
            value = int(float(val))
            if value >= 2:
                yield column
            else:
                yield ("-", "+")[value] + key.replace(" ", "_")

    without_na = dataframe.map(lambda cell: None if cell == "-1" else cell)
    incidence = pd.get_dummies(without_na, prefix_sep="=")
    incidence = incidence.map(lambda cell: "X" if cell == 1 else "")
    incidence.columns = feature_formatter(incidence.columns)
    return Context.fromstring(incidence.to_csv(), frmat='csv')



def _regex_or(sounds, sep=" "):
    """ Build a non-capturing regex alternation matching any one of the sounds.

    Each alternative is the sound followed by `sep`. Sound names are escaped
    so that names containing regex metacharacters (".", "|", "?", ...) are
    matched literally instead of altering the pattern.

    Args:
        sounds: iterable of sound names (str)
        sep (str): text appended verbatim after each sound. Defaults to a space.

    Returns:
        (str): regex string, with alternatives in sorted order
    """
    return "(?:" + "|".join(re.escape(x) + sep for x in sorted(sounds)) + ")"


[docs]
class Form(str):
    """ A form is a string of sounds, separated by spaces.
    If a form is provided as defective, this information is still stored
    as a Form object with empty content. Defectiveness can be tested with:


        >>> inv = Inventory.from_file("tests/data/frenchipa.csv")
        >>> Form('').is_defective()
        True

    By default, we segment by cutting on spaces.
    If resegment=True, we remove spaces, and segment using the sound inventory's list of valid phonemes.

    Sounds might be more than one character long.
    Forms are strings, they are segmented at the object creation.

    Attributes:
        tokens (Tuple): Tuple of phonemes contained in this form. For defective entries,
            tokens are an empty tuple.
        id (str): form_id of the corresponding form according to the Paralex package.
            If unknown, `None` will be assigned.
    """

    def __new__(cls, contents, form_id=None):
        if contents != "" and not contents.endswith(" "):
            contents = contents + " "
        return str.__new__(cls, contents)


[docs]
    def __init__(self, string, form_id=None):
        """ The constructor assumes everything is already clean and normalized"""
        self.id = form_id
        self.tokens = tuple([tok for tok in string.split(" ") if tok])  # Ignore final empty string



[docs]
    @classmethod
    def from_raw(cls, string, inventory, form_id=None, resegment=False):
        """ Use inventory to build a cleaned and normalized Form.

        Args:
            string: raw string for this form
            inventory (segments.Inventory): sound inventory
            form_id: form identifier
            resegment (bool): defaults to False. Whether to re-segment phoneme tokens.

        Returns: a formatted Form
        """
        contents = ""
        if string != "":
            tokens = inventory.segment_form(string, resegment=resegment)
            tokens = tuple(inventory._normalization.get(c, c) for c in tokens)
            if inventory._legal_str.fullmatch("".join(tokens)) is None:
                raise ValueError("Unknown sound in: " + repr(string))
            contents = " ".join(tokens)
        return cls(contents, form_id)


    def __getnewargs__(self):
        return (" ".join(self.tokens), self.id)


[docs]
    def is_defective(self):
        return self == ''


    def __repr__(self):
        segmented = " ".join(self.tokens) if self else "#DEF#"
        return f"Form({segmented}, id='{self.id}')" if hasattr(self, "id") and self.id else f"Form({segmented})"

    def __str__(self):
        return "".join([x.strip() for x in self.tokens]) if self else '#DEF#'




[docs]
class Inventory(object):
    """The static `segments.Inventory` class describes a sound inventory.


    >>> inv = Inventory.from_file("tests/data/frenchipa.csv")

    Each sound class in the inventory is a concept in a FCA lattice.
    Sound class identifiers are either strings (for phonemes)
    or frozensets (for sound classes). Phonemes are the leaves of the hierarchy.

    Sound classes can be seen as under-determined phonemes, and both phonemes and sound
    classes are handled in the same way. For this reason, we call both "sound".

    Attributes:
        context: the FCA context underlying the feature space
        _score_matrix (dict): a dictionary of sound tuples to alignment score
        _gap_score (float): a score for insertions
        _normalization (dict): a dictionary of sounds to their normalized counterparts
        _segmenter (re.Pattern): a compiled regex to segment words into phonemes
        _legal_str (re.Pattern): a compiled regex to recognize words made of known phonemes
        _max (frozenset): the identifier of the supremum in the lattice
        _regexes (dict): a dictionary of sound IDs to regex strings
        _pretty_str (dict): a dictionary of sound IDs to pretty formatted strings
        _features (dict): a dictionary of sound IDs to set of features
        _features_str (dict): a dictionary of sound IDs to a string representing features
        _classes (dict): a dictionary of sound IDs to a list of classes (ancestors)

    """

    def __init__(self, table, shorthands_table, normalization):
        """ Build the inventory from a table of sounds.

        Creates the FCA context, registers every concept with a non-empty
        extent as a sound, checks the validity of the inventory, then compiles
        the regexes used to segment and to validate forms.

        Args:
            table (:class:`pandas:pandas.DataFrame`): sounds / features table
            shorthands_table: table of shorthand definitions, or `None`
            normalization (dict): dictionary of sounds to their normalized counterparts
        """
        self._gap_score = None
        self._score_matrix = {}
        self._normalization = normalization

        self.context = sound_lattice_context(table)
        shorthands = Inventory.calc_shorthand(self.context.lattice, shorthands_table)

        log.info('Building classes of segments...')
        self._regexes = {}
        self._pretty_str = {}
        self._classes = {}
        self._features = {}
        self._features_str = {}
        for extent, intent in tqdm(self.context.lattice):
            if extent:
                Inventory._add_segment(self, extent, intent, shorthands)

        self.check_validity(self.context.lattice, table)

        # The supremum is the class holding the most phonemes. A leaf is a single
        # phoneme whatever the length of its name, so it must count as one:
        # comparing raw len() would mix string lengths with set cardinalities.
        self._max = max(self._classes,
                        key=lambda sound: 1 if self.is_leaf(sound) else len(sound))

        simple_sounds = [s for s in self._classes if self.is_leaf(s)]
        # Longest names first, so that the alternation prefers "ts" over "t"
        all_sounds = sorted(simple_sounds + list(self._normalization),
                            key=len, reverse=True)
        # Escape the names: a sound written with a regex metacharacter
        # must be matched literally.
        alternation = "|".join(re.escape(sound) for sound in all_sounds)
        self._segmenter = re.compile("(" + alternation + ")")
        self._legal_str = re.compile("(" + alternation + ")+")


[docs]
    @classmethod
    def from_file(cls, filename):
        """ Alternate constructor: build an inventory from a sounds file.

        Args:
            filename: path to a csv or tsv file with distinctive features
        """
        return cls(*cls.read_sounds_file(filename))



[docs]
    def check_validity(self, lattice, table):
        """  Check validity of this sound inventory for Qumin.

        Identifies when some segments are ancestors of others
        (some segments only differ from others through underspecification)

        Args:
            lattice: concept lattice
            table: table of sounds

        Raises: Exception
        """
        not_actual_leaves = []
        for leaf in table.index:
            lattice_node = frozenset(lattice[(leaf,)].extent)
            if len(lattice_node) > 1:
                not_actual_leaves.append((leaf, lattice_node))

        if not_actual_leaves:
            alert = ""
            for leaf, lattice_node in not_actual_leaves:
                other = set(lattice_node) - {leaf}
                alert += "\n\t" + leaf + " is the same node as " + str(lattice_node)
                alert += "\n\t\t" + self.infos(lattice_node)
                for o in other:
                    alert += "\n\t\t" + self.infos(self.get(o))
            raise Exception("Warning, some segments are  ancestors of other segments:" + alert)



[docs]
    @classmethod
    def calc_shorthand(cls, lattice, shorthands):
        """ Calculate shorthand names for some lattice nodes.

        Ex: ##C## in the sounds table might be a shorthand "C" for all consonants.

        Args:
            lattice: concept lattice
            shorthands: table or shorthand definitions

        Returns:
            a dictionary of intents to their shorthand names
        """
        if shorthands is not None:
            shorthand_context = sound_lattice_context(shorthands)

            stack = shorthands.index.tolist()
            shorthands = {}

            for e, i in shorthand_context.lattice:
                full_intent = lattice[i].intent
                for sh in e:
                    if sh in stack:
                        shorthand_name = sh.strip("#")
                        if shorthand_name:
                            shorthands[full_intent] = sh.strip("#")
                            stack.remove(sh)
                if not stack:
                    break
        else:
            shorthands = {}
        return shorthands



[docs]
    @classmethod
    def read_sounds_file(cls, filename):
        """ Read a sounds table from a file.

        The file is parsed as comma-separated text with every column read as
        strings. Legacy and unused columns are dropped, feature names are
        shortened, missing feature values become "-1", shorthand rows
        (ids of the form ``#name#``) are moved to their own table, and rows
        with identical feature values are merged under a normalized name.

        Args:
            filename: path to the file

        Returns: (table, shorthands, normalization)
            table (pd.DataFrame): normalized sound table
            shorthands (pd.DataFrame): table holding shorthands for some concepts,
                or `None` if the file defines no shorthand
            normalization (dict): dictionary of normalizations for identical rows.

        Raises:
            ValueError: if the sound_id column is missing, if a sound has no name
                or a name made only of "#", or if no usable sound remains.
        """
        log.info("Reading table %s", filename)

        table = pd.read_table(filename, header=0, dtype=str,
                              index_col=False, sep=',',
                              encoding="utf-8")

        sound_id = "sound_id"
        if sound_id not in table.columns:
            raise ValueError("Paralex sound tables must have a sound_id column.")

        legacy_cols = ["value", "UNICODE", "ALIAS", "Seg."]
        unused_cols = ["label", "tier", "CLTS_id"]  # Unused Paralex columns
        deprecated_cols = table.columns.intersection(legacy_cols)
        if not deprecated_cols.empty:
            log.warning("Usage of columns %s is deprecated. Edit your sounds file !",
                        ", ".join(deprecated_cols))

        table = table.drop(columns=[col for col in legacy_cols + unused_cols
                                    if col in table.columns])

        shorten_feature_names(table)

        # Fill missing values before casting the ids to str: casting first can turn
        # a missing id into the literal string "nan", which the check below would accept.
        na_vals = {c: "-1" for c in table.columns}
        na_vals[sound_id] = ""
        table = table.fillna(na_vals)
        table[sound_id] = table[sound_id].astype(str)

        # Checking segments names legality
        for seg in table[sound_id]:
            if seg == "":
                raise ValueError("One of your segments doesn't have a name !")
            if seg.strip("#") == "":
                raise ValueError("The symbol \"#\" is reserved and can only "
                                 "be used in a shorthand name (#V# for a vowel, etc)")

        # Separate shorthand table
        shorthand_selection = table[sound_id].str.match("^#.+#$")
        shorthands = None
        if shorthand_selection.any():
            # set_index returns new frames, so later in-place edits never touch a slice
            shorthands = table[shorthand_selection].set_index(sound_id)
            shorthands = shorthands.map(str)  # Why is this necessary ?
            table = table[~shorthand_selection]
        table = table.set_index(sound_id)

        if table.empty:
            raise ValueError('It seems that the paralex sounds table '
                             'has no usable sound definitions for Qumin.')

        log.info("Normalizing identical rows")
        attributes = list(table.columns)
        normalization = normalize(table, attributes)  # adds the "Normalized" column in place
        table = table.set_index("Normalized").drop_duplicates()
        log.debug("Normalization map: %s", normalization)

        table = table.map(str)

        return table, shorthands, normalization


    def _add_segment(self,  extent, intent, shorthands):
        """ Register one lattice concept as a sound of the inventory.

        A concept is a sound class. When its extent holds a single sound,
        the concept stands for that phoneme and is identified by its name;
        otherwise it is identified by the frozenset of its sounds.

        Args:
            extent: list of phonemes in the sound class
            intent: list of features for this concept
            shorthands: dictionary of shorthand names
        """
        # Shortest possible description of the features of this segment
        shorthand = shorthands.get(intent, None)
        concept = self.context.lattice[extent]
        if len(intent) == 0:
            shorthand = "X"
        elif shorthand is None and len(extent) > 1:
            minimals = next(concept.attributes())
            if minimals:
                shorthand = "[{}]".format(" ".join(minimals))

        is_class = len(extent) > 1
        seg_id = frozenset(extent) if is_class else extent[0]
        sorted_sounds = sorted(extent)

        self._regexes[seg_id] = _regex_or(sorted_sounds, sep=" ")
        self._pretty_str[seg_id] = "{" + ",".join(sorted_sounds) + "}" if is_class else seg_id
        # Each ancestor (including the concept itself) is named after its sorted extent
        self._classes[seg_id] = {"|".join(sorted(ancestor.extent)) for ancestor in concept.upset()}
        self._features[seg_id] = set(intent)
        self._features_str[seg_id] = shorthand or "[{}]".format(" ".join(sorted(intent)))


[docs]
    def regex(self, sound):
        """ Returns a regex representing a sound.

        Args:
            sound: identifier of a sound

        Returns:
            (str): regex string
        """
        return self._regexes[sound]



[docs]
    def pretty_str(self, sound, **kwargs):
        """ Returns a pretty string representing a sound.

        Args:
            sound: identifier of a sound

        Returns:
            (str): pretty string
        """
        return self._pretty_str[sound]



[docs]
    def features(self, sound, **kwargs):
        """ Returns a set of features representing a sound.

        Args:
            sound: identifier of a sound

        Returns:
            (set): features
        """
        return self._features[sound]



[docs]
    def features_str(self, sound, **kwargs):
        """ Returns a string which described the features of a sound.

        Args:
            sound: identifier of a sound

        Returns:
            (str): features string
        """
        return self._features_str[sound]



[docs]
    def shortest(self, sound, **kwargs):
        """ Returns a string which describes the sound in as little characters as possible.

        Args:
            sound: identifier of a sound

        Returns:
            (str): short string
        """
        return min((self.pretty_str(sound), self.features_str(sound)), key=len)



[docs]
    @staticmethod
    def is_leaf(sound):
        """ Returns whether this sound is a leaf (a phoneme, rather than a sound class)

        Args:
            sound: identifier of a sound

        Returns:

        """
        return isinstance(sound, str)



[docs]
    def infos(self, sound):
        """ String giving all useful information on a sound.

        Args:
            sound: identifier of a sound

        Returns:
            pretty string and features of a sound.

        """
        return self.pretty_str(sound) + " = " + self._features_str[sound]



[docs]
    def inf(self, a, b):
        """ Checks if a is a descendant of b.

        a < b iff b has children and either a is a string
        which is part of b, or a is a subset of b.
        """
        return (not self.is_leaf(b)) and ((a in b) or (not self.is_leaf(a) and a < b))



[docs]
    def similarity(self, a, b):
        """Computes phonological similarity  (Frisch, 2004)

        Measure from "Similarity avoidance and the OCP" , Frisch, S. A.; Pierrehumbert, J. B. & Broe,
        M. B. *Natural Language & Linguistic Theory*, Springer, 2004, 22, 179-228, p. 198.

        We compute similarity by comparing the number of shared and unshared natural classes
        of two consonants, using the equation in (7). This equation is a direct extension
        of the Pierrehumbert (1993) feature similarity metric to the case of natural classes.

        (7) :math:`Similarity = \\frac{\\text{Shared natural classes}}{\\text{Shared natural classes } + \\text{Non-shared natural classes}}`
        """
        if a == b:
            return 1
        ca = self._classes[a]
        cb = self._classes[b]
        return len(ca & cb) / len(ca | cb)



[docs]
    def init_dissimilarity_matrix(self, gap_prop=0.5, **kwargs):
        """Computes score matrix with dissimilarity scores."""
        # TODO: should this be delegated to morphalign ?
        # TODO: should this code all on integers ?
        costs = []
        simple_sounds = [s for s in self._classes if self.is_leaf(s)]
        for a, b in combinations(simple_sounds, 2):
            cost = 1 - self.similarity(a, b)
            self._score_matrix[(a, b)] = self._score_matrix[(b, a)] = cost
            costs.append(cost)

        self._gap_score = np.quantile(np.array(costs), 0.5) * gap_prop
        for a in simple_sounds:
            self._score_matrix[(a, a)] = 0



[docs]
    def insert_cost(self, *_):
        """Returns the constant insertion/deletion cost"""
        return self._gap_score



[docs]
    def sub_cost(self, a, b):
        """ Returns the cost of aligning sounds `a` and `b`

        Args:
            a: sound identifier
            b: sound identifier

        Returns: (float): substitution cost

        """
        return self._score_matrix[(a, b)]




[docs]
    @functools.lru_cache(maxsize=128)
    def get(self, descriptor):
        """ Get a sound using the lattice.

        Args:
            descriptor: iterable of phonemes OR iterable of features

        Returns: (str or frozenset) sound identifier

        """
        try:
            s = self.context.lattice[descriptor].extent
            if len(s) == 1:
                return s[0]
            return frozenset(s)
        except KeyError:
            raise ValueError("Unknown sound descriptor: " + repr(descriptor))



[docs]
    def meet(self, *args):
        """Finds the lowest common ancestors of segments from their identifiers.

        Args: several sound identifiers

        Returns:
            lowest common ancestor identifier
        """
        segments = set()
        for segment in args:
            if self.is_leaf(segment):
                segments.add(segment)
            else:
                segments |= segment
        return self.get(frozenset(segments))



[docs]
    @functools.lru_cache(maxsize=128)
    def transformation(self, a, b):
        """Find a transformation between a and b.

        The transformation is a pair of two maximal sets of segments related by a bijective phonological function.

        This function takes a pair of sound identifiers and calculates the function which relates
        these two segments. It then finds and returns the two maximal sets of segments related by this function.

        Example:
            In French, t -> s can be expressed by a phonological function
            which changes [-cont] and [-rel. ret] to [+cont] and [+rel. ret]

            These other segments are related by the same change:
            d -> z
            b -> v
            p -> f

            >>> inv = Inventory.from_file("tests/data/frenchipa.csv")
            >>> a,b = inv.transformation("t","s")
            >>> a == frozenset({'d', 't', 'b', 'p'})
            True
            >>> b == frozenset({'s', 'z', 'f', 'v'})
            True

        Arguments:
            a,b (str): Segment identifiers.

        Returns:
            tuple of frozenset: two sets of sounds.
        """

        def select_if_reciprocal(inst, segs, left, right):
            tmp = []
            for x in inst.id_to_frozenset(segs):
                y = inst.get(frozenset((inst.features(x) - left) | right))
                if y and type(y) is str:
                    x_back = inst.get(frozenset((inst.features(y) - right) | left))
                    if x == x_back:
                        tmp.append(x)
            return frozenset(tmp)  # TODO: warning: this need not be a lattice node

        left, right = self.get_transform_features(a, b)
        A, B = self.get(left), self.get(right)
        A = select_if_reciprocal(self, A, left, right)
        B = select_if_reciprocal(self, B, right, left)
        return A, B



[docs]
    def id_to_frozenset(self, sound_id):
        if self.is_leaf(sound_id):
            return frozenset({sound_id})
        return frozenset(sound_id)



[docs]
    def get_transform_features(self, left, right):
        """ Get the features corresponding to a transformation.

        Arguments:
            left (frozenset): set of phonemes
            right (frozenset): set of phonemes

        Example:
            >>> inv = Inventory.from_file("tests/data/frenchipa.csv")
            >>> inv.get_transform_features({"b","d"}, {"p","t"})
            (frozenset({'+voi'}), frozenset({'-voi'}))
        """

        t1 = self.features(self.get(self.id_to_frozenset(left)))
        t2 = self.features(self.get(self.id_to_frozenset(right)))
        f1 = t1 - t2
        f2 = t2 - t1
        return frozenset(f1), frozenset(f2)



[docs]
    def get_from_transform(self, a, transform):
        """ Get a segment from another according to a transformation tuple.

        Arguments:
            a (str): Segment alias
            transform (tuple): Couple of two segment IDs

        Example:
            >>> inv = Inventory.from_file("tests/data/frenchipa.csv")
            >>> inv.get_from_transform("d",
            ...                                     (frozenset({"d","t"}),
            ...                                     frozenset({"s","z"})))
            'z'
        """
        a = self.features(a)
        f1, f2 = self.get_transform_features(*transform)
        return self.get(frozenset((a - f1) | f2))



[docs]
    def show_pool(self):
        """Return a string description of the whole segment pool."""
        return "\n".join([self.infos(seg)
                          for seg in sorted(self._classes, key=lambda x: len(x))])



[docs]
    def segment_form(self, wordform, resegment=False):
        """ Segment a form into phonemes (either following spaces, or using sound inventory)

        Args:
            wordform (str): phonemic form
            resegment (bool): Whether to ignore spaces in phon forms and re-compute phonemic segmentation

        Returns:
            list of phonemes
        """
        wordform = wordform.strip(" ")
        if wordform == "": return []
        if len(wordform) == 1: return [wordform]
        if resegment:
            excluded = {"", " "}
            joined = wordform.replace(" ", "")
            return [s for s in self._segmenter.split(joined) if s not in excluded]

        # If there's no space, check that the word is only 1 phoneme long
        if " " not in wordform:
            try:
                self.get((wordform,))  # check that this entire wordform is a valid phoneme
            except ValueError:
                raise ValueError(f"Forms are not space separated, eg. {wordform}, "
                                 f"please pass resegment=True or provide Paralex-compliant segmented forms.")
        return [phon for phon in wordform.split(" ") if phon]





[docs]
def normalize(ipa, features):
    """Give a common normalized name to segments whose rows are identical.

    This function takes a segments table
    and adds **in place** a "Normalized" column.
    Segments which have the same values for all features
    share the same value in this column: the name of the
    first of them in the table.
    The function also returns a translation table
    mapping indexes to normalized segments.

    Note: the index are expected to be one char length.

    ============ ============ ==============
    Index        ..features..  Normalized
    ============ ============ ==============
    ɛ               [...]       E
    e               [...]       E
    ============ ============ ==============

    Arguments:
        ipa (:class:`pandas:pandas.DataFrame`):
            Dataframe of segments. Columns are features,
            UNICODE code point representation and segment names,
            indexes are segments.
        features (list): Feature columns' names.

    Returns:
        dict: translation table from
            the segment's name to its normalized name.
            Segments which are their own normalized name are left out.
    """

    def find_identical_rows(segment, table):
        # Boolean mask of the rows which have exactly the features of `segment`
        seg_features = table.loc[segment, :]
        try:
            return (table == seg_features).all(axis=1)
        except ValueError:
            if seg_features.shape[0] > 1:
                raise ValueError("You have multiple definitions for {}\n{}".format(segment,
                                                                                   seg_features))

    ipa["Normalized"] = ""
    feature_table = ipa[features]

    for segment in feature_table.drop_duplicates().index:
        twins = find_identical_rows(segment, feature_table)

        # Only name the group if none of its members has been named yet
        if (ipa.loc[twins, "Normalized"] == "").all():
            ipa.loc[twins, "Normalized"] = segment

    return {seg: norm for seg, norm in ipa["Normalized"].items() if seg != norm}




[docs]
def shorten_feature_names(table):
    """ Replace **in place** the feature names of a sounds table by abbreviations.

    The "sound_id" column and names of at most 3 characters are kept as is.
    Standard names are replaced by their standard abbreviation (in upper case
    if the name only matched once lowercased). For any other name, the shortest
    prefix of at least 3 characters which is not already in use is chosen, and
    if every prefix is taken, an integer is appended to the first 3 characters.

    Args:
        table (:class:`pandas:pandas.DataFrame`): sounds table, its columns are renamed.

    Raises:
        ValueError: if the first row looks like a second row of headers.
    """
    # A table without any row has no first row to inspect: rename its columns
    # anyway, and leave it to the caller to report that the table is empty.
    headers = list(table.iloc[0]) if len(table) > 0 else []
    if "Seg." in headers or "sound_id" in headers:
        raise ValueError("Using a second row of headers is not supported anymore.")

    short_features_names = []
    for name in table.columns:
        if name == "sound_id" or len(name) <= 3:  # Not a feature name
            short_features_names.append(name)
        elif name in _to_short_feature:  # Check standard names
            short_features_names.append(_to_short_feature[name])
        elif name.lower() in _to_short_feature:  # Uppercase
            short_features_names.append(_to_short_feature[name.lower()].upper())
        else:
            # Make an abbreviation on the fly by shortening the label
            reserved_names = set(_short_features).union(short_features_names)
            prefixes = (name[:i] for i in range(3, len(name) + 1))
            new_name = next((p for p in prefixes if p not in reserved_names), None)
            if new_name is None:  # Fallback strategy: append a unique integer
                key = 1
                new_name = name[:3] + str(key)
                while new_name in reserved_names:
                    key += 1
                    new_name = name[:3] + str(key)
            short_features_names.append(new_name)
    table.columns = short_features_names