Source code for qumin.representations.alignment

# -*- coding: utf-8 -*-
# !/usr/bin/env python
"""author: Sacha Beniamine.

This module is used to align sequences.
"""
from itertools import zip_longest

import numpy as np



[docs]
def commonprefix(*args):
    """Given a list of strings, returns the longest common prefix"""
    if not args:
        return None
    shortest, *args = sorted(args, key=len)
    zipped = zip_longest(*args)
    for i, c in enumerate(shortest):
        zipped_i = next(zipped)
        if any([s != c for s in zipped_i]):
            return shortest[:i]
    return shortest




[docs]
def commonsuffix(*args):
    """Given a list of strings, returns the longest common suffix"""
    return commonprefix(*(x[::-1] for x in args))[::-1]



def _all_min(iterable):
    """Return all minimum elements from an iterable according to the first element of its tuples."""
    minimums = []
    minkeyval = iterable[-1][0]
    for x in iterable:
        keyval = x[0]
        if keyval < minkeyval:
            minimums = [x]
            minkeyval = keyval
        elif keyval == minkeyval:
            minimums.append(x)
    return minimums



[docs]
def edits_ins_cost(*_):
    return 1




[docs]
def edits_sub_cost(a, b):
    return int(a != b)




[docs]
def multi_sub_cost(a, b):
    return int(b not in a)




[docs]
def align_multi(*strings, **kwargs):
    """ Levenshtein-style alignment over arguments, two by two."""
    if len(strings) == 1:
        return [(elem,) for elem in strings[0]]

    kwargs["fillvalue"] = kwargs.get("fillvalue", "")
    kwargs["insert_cost"] = edits_ins_cost
    kwargs["sub_cost"] = multi_sub_cost

    def flatten_alignment(alignment):
        for a, b in alignment:
            try:
                yield a | {b}
            except TypeError:  # a is the fillvalue
                yield {a, b}

    first_seq = [{s} for s in strings[0]]
    aligned = first_seq

    for i in range(1, len(strings)):
        aligned = list(flatten_alignment(align_auto(aligned, strings[i], **kwargs)[0]))
    return aligned




[docs]
def align_auto(s1, s2, insert_cost, sub_cost, distance_only=False, fillvalue="", **kwargs):
    """Return all the best alignments of two words according to some edit distance matrix.

    Arguments:
        s1 (str): first word to align
        s2 (str): second word to align
        insert_cost (Callable): A function which takes one value and returns an insertion cost
        sub_cost (Callable): A function which takes two values and returns a substitution cost
        distance_only (bool): defaults to False. If True, returns only the best distance. If False, returns an alignment.
        fillvalue: (optional) the value with which to pad when iterable have varying lengths. Default:  "".

    Returns:
        Either an alignment (a `list` of `list` of zipped tuples), or a distance (if `distance_only` is True).
    """
    m = len(s1)
    n = len(s2)
    paths = np.empty((m + 1, n + 1), dtype=list)
    paths[0, 0] = [(0, (0, 0), ("", ""))]

    for i, a in enumerate(s1):
        paths[i + 1, 0] = [(paths[i, 0][0][0] + insert_cost(a), (i, 0), (a, fillvalue))]

    for j, b in enumerate(s2):
        paths[0, j + 1] = [(paths[0, j][0][0] + insert_cost(b), (0, j), (fillvalue, b))]

    for i in range(1, m + 1):
        a = s1[i - 1]

        for j in range(1, n + 1):
            b = s2[j - 1]
            subcost = paths[i - 1, j - 1][0][0] + sub_cost(a, b)
            insb = paths[i, j - 1][0][0] + insert_cost(b)
            insa = paths[i - 1, j][0][0] + insert_cost(a)

            paths[i, j] = _all_min([(subcost, (i - 1, j - 1), (a, b)),
                                    (insb, (i, j - 1), (fillvalue, b)),
                                    (insa, (i - 1, j), (a, fillvalue))])
    if distance_only:
        return paths[-1][-1][0][0]
    else:
        return _multibacktrack(paths)



def _multibacktrack(paths):
    max_i, max_j = paths.shape
    stack = [([],  # empty alignment
              (max_i - 1, max_j - 1),  # start at last cell
              set())  # No cell is forbidden.
             ]
    solutions = []
    while stack:
        current_path, (i, j), visited = stack.pop(0)
        if (i, j) in visited:
            continue  # abandon this path, it is redundant
        else:
            visited.add((i, j))
            if i == 0 and j == 0:
                solutions.append(current_path)
            else:
                # ins_del = {(i - 1, j), (i, j - 1)}
                for step in paths[i, j]:
                    _, idxs, action = step
                    # Share the same visited set unless perfect match
                    new_visited = visited if action[0] != action[1] else set()
                    stack.append(([action] + current_path,
                                  idxs,
                                  new_visited))
    return solutions



[docs]
def align_left(*args, **kwargs):
    """Align left all arguments (wrapper around zip_longest).

    Examples:
        >>> align_left("mɪs","mas")
        [('m', 'm'), ('ɪ', 'a'), ('s', 's')]
        >>> align_left("mɪs","mɪst")
        [('m', 'm'), ('ɪ', 'ɪ'), ('s', 's'), ('', 't')]
        >>> align_left("mɪs","amɪs")
        [('m', 'a'), ('ɪ', 'm'), ('s', 'ɪ'), ('', 's')]
        >>> align_left("mɪst","amɪs")
        [('m', 'a'), ('ɪ', 'm'), ('s', 'ɪ'), ('t', 's')]


    Arguments:
        *args: any number of iterables >= 2
        fillvalue: the value with which to pad when iterable have varying lengths. Default:  "".

    Returns:
        a `list` of zipped tuples, left aligned.
    """
    if "fillvalue" not in kwargs:
        kwargs["fillvalue"] = ""
    return list(zip_longest(*args, **kwargs))




[docs]
def align_right(*iterables, **kwargs):
    """Align right all arguments. Zip longest with right alignment.

    Examples:
        >>> align_right("mɪs","mas")
        [('m', 'm'), ('ɪ', 'a'), ('s', 's')]
        >>> align_right("mɪs","mɪst")
        [('', 'm'), ('m', 'ɪ'), ('ɪ', 's'), ('s', 't')]
        >>> align_right("mɪs","amɪs")
        [('', 'a'), ('m', 'm'), ('ɪ', 'ɪ'), ('s', 's')]
        >>> align_right("mɪst","amɪs")
        [('m', 'a'), ('ɪ', 'm'), ('s', 'ɪ'), ('t', 's')]


    Arguments:
        *iterables: any number of iterables >= 2
        fillvalue: the value with which to pad when iterable have varying lengths. Default:  "".

    Returns:
        a `list` of zipped tuples, right aligned.
    """
    if "fillvalue" not in kwargs:
        kwargs["fillvalue"] = ""
    reverse = list(zip_longest(*[x[::-1] for x in iterables], **kwargs))
    return reverse[::-1]




[docs]
def align_baseline(*args, **kwargs):
    """ Simple alignment intended as an inflectional baseline. (Albright & Hayes 2002)

    single change, either suffixal, or suffixal, or infixal.
    This doesn't work well when there is both a prefix and a suffix.
    Used as a baseline for evaluation of the auto-aligned patterns.

    see "Modeling English Past Tense Intuitions with Minimal Generalization", Albright, A. & Hayes, B.
    *Proceedings of the ACL-02 Workshop on Morphological and Phonological Learning* - Volume 6,
    Association for Computational Linguistics, 2002, 58-69, page 2 :

        "The exact procedure for finding a word-specific
        rule is as follows: given an input pair (X, Y), the
        model first finds the maximal left-side substring
        shared by the two forms (e.g., #mɪs), to create the
        C term (left side context). The model then exam-
        ines the remaining material and finds the maximal
        substring shared on the right side, to create the D
        term (right side context). The remaining material is
        the change; the non-shared string from the first
        form is the A term, and from the second form is the
        B term."

    Examples:
        >>> align_baseline("mɪs","mas")
        [('m', 'm'), ('ɪ', 'a'), ('s', 's')]
        >>> align_baseline("mɪs","mɪst")
        [('m', 'm'), ('ɪ', 'ɪ'), ('s', 's'), ('', 't')]
        >>> align_baseline("mɪs","amɪs")
        [('', 'a'), ('m', 'm'), ('ɪ', 'ɪ'), ('s', 's')]
        >>> align_baseline("mɪst","amɪs")
        [('m', 'a'), ('ɪ', 'm'), ('s', 'ɪ'), ('t', 's')]


    Arguments:
        *args: any number of iterables >= 2
        fillvalue: the value with which to pad when iterable have varying lengths. Default:  "".

    Returns:
        a `list` of zipped tuples.
    """

    fillvalue = kwargs.get("fillvalue", "")
    la = len(args)

    C = commonprefix(*args)
    rest = [x[len(C):] for x in args]
    D = commonsuffix(*rest)

    if C:
        C = list(zip(*[C for _ in range(la)]))
    else:
        C = []
    if D:
        AB = [x[:-len(D)] for x in rest]
        D = list(zip(*[D for _ in range(la)]))
    else:
        AB = rest
        D = []

    AB = list(zip_longest(*AB, fillvalue=fillvalue))

    return C + AB + D