Source code for qumin.representations.alignment

# -*- coding: utf-8 -*-
# !/usr/bin/env python
"""author: Sacha Beniamine.

This module is used to align sequences.
"""
from itertools import zip_longest

import numpy as np


[docs] def commonprefix(*args): """Given a list of strings, returns the longest common prefix""" if not args: return None shortest, *args = sorted(args, key=len) zipped = zip_longest(*args) for i, c in enumerate(shortest): zipped_i = next(zipped) if any([s != c for s in zipped_i]): return shortest[:i] return shortest
[docs] def commonsuffix(*args): """Given a list of strings, returns the longest common suffix""" return commonprefix(*(x[::-1] for x in args))[::-1]
def _all_min(iterable): """Return all minimum elements from an iterable according to the first element of its tuples.""" minimums = [] minkeyval = iterable[-1][0] for x in iterable: keyval = x[0] if keyval < minkeyval: minimums = [x] minkeyval = keyval elif keyval == minkeyval: minimums.append(x) return minimums
[docs] def edits_ins_cost(*_): return 1
[docs] def edits_sub_cost(a, b): return int(a != b)
[docs] def multi_sub_cost(a, b): return int(b not in a)
[docs] def align_multi(*strings, **kwargs): """ Levenshtein-style alignment over arguments, two by two.""" if len(strings) == 1: return [(elem,) for elem in strings[0]] kwargs["fillvalue"] = kwargs.get("fillvalue", "") kwargs["insert_cost"] = edits_ins_cost kwargs["sub_cost"] = multi_sub_cost def flatten_alignment(alignment): for a, b in alignment: try: yield a | {b} except TypeError: # a is the fillvalue yield {a, b} first_seq = [{s} for s in strings[0]] aligned = first_seq for i in range(1, len(strings)): aligned = list(flatten_alignment(align_auto(aligned, strings[i], **kwargs)[0])) return aligned
[docs] def align_auto(s1, s2, insert_cost, sub_cost, distance_only=False, fillvalue="", **kwargs): """Return all the best alignments of two words according to some edit distance matrix. Arguments: s1 (str): first word to align s2 (str): second word to align insert_cost (Callable): A function which takes one value and returns an insertion cost sub_cost (Callable): A function which takes two values and returns a substitution cost distance_only (bool): defaults to False. If True, returns only the best distance. If False, returns an alignment. fillvalue: (optional) the value with which to pad when iterable have varying lengths. Default: "". Returns: Either an alignment (a `list` of `list` of zipped tuples), or a distance (if `distance_only` is True). """ m = len(s1) n = len(s2) paths = np.empty((m + 1, n + 1), dtype=list) paths[0, 0] = [(0, (0, 0), ("", ""))] for i, a in enumerate(s1): paths[i + 1, 0] = [(paths[i, 0][0][0] + insert_cost(a), (i, 0), (a, fillvalue))] for j, b in enumerate(s2): paths[0, j + 1] = [(paths[0, j][0][0] + insert_cost(b), (0, j), (fillvalue, b))] for i in range(1, m + 1): a = s1[i - 1] for j in range(1, n + 1): b = s2[j - 1] subcost = paths[i - 1, j - 1][0][0] + sub_cost(a, b) insb = paths[i, j - 1][0][0] + insert_cost(b) insa = paths[i - 1, j][0][0] + insert_cost(a) paths[i, j] = _all_min([(subcost, (i - 1, j - 1), (a, b)), (insb, (i, j - 1), (fillvalue, b)), (insa, (i - 1, j), (a, fillvalue))]) if distance_only: return paths[-1][-1][0][0] else: return _multibacktrack(paths)
def _multibacktrack(paths): max_i, max_j = paths.shape stack = [([], # empty alignment (max_i - 1, max_j - 1), # start at last cell set()) # No cell is forbidden. ] solutions = [] while stack: current_path, (i, j), visited = stack.pop(0) if (i, j) in visited: continue # abandon this path, it is redundant else: visited.add((i, j)) if i == 0 and j == 0: solutions.append(current_path) else: # ins_del = {(i - 1, j), (i, j - 1)} for step in paths[i, j]: _, idxs, action = step # Share the same visited set unless perfect match new_visited = visited if action[0] != action[1] else set() stack.append(([action] + current_path, idxs, new_visited)) return solutions
[docs] def align_left(*args, **kwargs): """Align left all arguments (wrapper around zip_longest). Examples: >>> align_left("mɪs","mas") [('m', 'm'), ('ɪ', 'a'), ('s', 's')] >>> align_left("mɪs","mɪst") [('m', 'm'), ('ɪ', 'ɪ'), ('s', 's'), ('', 't')] >>> align_left("mɪs","amɪs") [('m', 'a'), ('ɪ', 'm'), ('s', 'ɪ'), ('', 's')] >>> align_left("mɪst","amɪs") [('m', 'a'), ('ɪ', 'm'), ('s', 'ɪ'), ('t', 's')] Arguments: *args: any number of iterables >= 2 fillvalue: the value with which to pad when iterable have varying lengths. Default: "". Returns: a `list` of zipped tuples, left aligned. """ if "fillvalue" not in kwargs: kwargs["fillvalue"] = "" return list(zip_longest(*args, **kwargs))
[docs] def align_right(*iterables, **kwargs): """Align right all arguments. Zip longest with right alignment. Examples: >>> align_right("mɪs","mas") [('m', 'm'), ('ɪ', 'a'), ('s', 's')] >>> align_right("mɪs","mɪst") [('', 'm'), ('m', 'ɪ'), ('ɪ', 's'), ('s', 't')] >>> align_right("mɪs","amɪs") [('', 'a'), ('m', 'm'), ('ɪ', 'ɪ'), ('s', 's')] >>> align_right("mɪst","amɪs") [('m', 'a'), ('ɪ', 'm'), ('s', 'ɪ'), ('t', 's')] Arguments: *iterables: any number of iterables >= 2 fillvalue: the value with which to pad when iterable have varying lengths. Default: "". Returns: a `list` of zipped tuples, right aligned. """ if "fillvalue" not in kwargs: kwargs["fillvalue"] = "" reverse = list(zip_longest(*[x[::-1] for x in iterables], **kwargs)) return reverse[::-1]
[docs] def align_baseline(*args, **kwargs): """ Simple alignment intended as an inflectional baseline. (Albright & Hayes 2002) single change, either suffixal, or suffixal, or infixal. This doesn't work well when there is both a prefix and a suffix. Used as a baseline for evaluation of the auto-aligned patterns. see "Modeling English Past Tense Intuitions with Minimal Generalization", Albright, A. & Hayes, B. *Proceedings of the ACL-02 Workshop on Morphological and Phonological Learning* - Volume 6, Association for Computational Linguistics, 2002, 58-69, page 2 : "The exact procedure for finding a word-specific rule is as follows: given an input pair (X, Y), the model first finds the maximal left-side substring shared by the two forms (e.g., #mɪs), to create the C term (left side context). The model then exam- ines the remaining material and finds the maximal substring shared on the right side, to create the D term (right side context). The remaining material is the change; the non-shared string from the first form is the A term, and from the second form is the B term." Examples: >>> align_baseline("mɪs","mas") [('m', 'm'), ('ɪ', 'a'), ('s', 's')] >>> align_baseline("mɪs","mɪst") [('m', 'm'), ('ɪ', 'ɪ'), ('s', 's'), ('', 't')] >>> align_baseline("mɪs","amɪs") [('', 'a'), ('m', 'm'), ('ɪ', 'ɪ'), ('s', 's')] >>> align_baseline("mɪst","amɪs") [('m', 'a'), ('ɪ', 'm'), ('s', 'ɪ'), ('t', 's')] Arguments: *args: any number of iterables >= 2 fillvalue: the value with which to pad when iterable have varying lengths. Default: "". Returns: a `list` of zipped tuples. """ fillvalue = kwargs.get("fillvalue", "") la = len(args) C = commonprefix(*args) rest = [x[len(C):] for x in args] D = commonsuffix(*rest) if C: C = list(zip(*[C for _ in range(la)])) else: C = [] if D: AB = [x[:-len(D)] for x in rest] D = list(zip(*[D for _ in range(la)])) else: AB = rest D = [] AB = list(zip_longest(*AB, fillvalue=fillvalue)) return C + AB + D