Source code for qumin.representations.frequencies

# -*- coding: utf-8 -*-
# !/usr/bin/env python
"""author: Jules Bouton.

Class for frequency management.
"""

import pandas as pd
from tqdm import tqdm
import paralex as px
import frictionless as fl
import logging
# Register tqdm's progress-aware methods (e.g. `progress_apply`) on pandas objects.
tqdm.pandas()

# Module-level logger, registered under the "Qumin" logger name.
log = logging.getLogger("Qumin")



[docs]
class Frequencies(object):
    """Frequency management for a Paralex dataset. Frequencies are built for forms,
    lexemes and cells.

    The parsed frequency columns or tables should conform to the Paralex principles:
        - An empty value means that there is no measure available
        - A zero value means that there is a measure, which is zero

    When aggregating across rows, any empty cell yields a uniform distribution
    for the whole set of rows, whereas zeros are taken into account. This behaviour
    can be disabled for some functions by passing skipna=True.

    Examples:

        >>> p = fl.Package('tests/data/TestPackage/test.package.json')
        >>> f = Frequencies(p)
        >>> print(f.info().to_markdown())
        | Table   | Source      |   Records |   Sum(f) |   Mean(f) |
        |:--------|:------------|----------:|---------:|----------:|
        | forms   | forms_table |        22 |      519 |   27.3158 |
        | lexemes | forms_table |         4 |      519 |  129.75   |
        | cells   | forms_table |         4 |      519 |  129.75   |


    Attributes:
        p (frictionless.Package): package to analyze
        col_names (List[str]): names of the columns that can be used as filters
            (`lexeme`, `cell`, `form`).
        source (Dict[str, str]): source used by default for each table.
            Contains either a value for the source field of a Paralex frequency table,
            or the name of the table used to extract the frequency.
        forms (:class:`pandas:pandas.DataFrame`):
            Table of frequency values associated to a form_id.
        lexemes (:class:`pandas:pandas.DataFrame`):
            Table of frequency values associated to a lexeme_id.
        cells (:class:`pandas:pandas.DataFrame`):
            Table of frequency values associated to a cell_id.
        """

    # Package being analyzed; assigned in the constructor.
    p = None
    # Columns accepted as filter keys, also used to detect duplicated forms.
    col_names = ["lexeme", "cell", "form"]
    # Default source for each table; None until a source has been selected.
    source = {"cells": None,
              "lexemes": None,
              "forms": None}


[docs]
    def __init__(self, package, *args, source=False, **kwargs):
        """Constructor for Frequencies. We gather and store frequencies
        for forms, lexemes and cells. Behaviour is the following:

        - If `force_uniform` is `True`, we use the paradigms table to generate a Uniform distribution.
        - If not, we try to get a frequency column from the tables: form, lexemes, cell
        - If any of those is missing, we use the frequencies table.
        - If we can't use the frequency table, we fall back to a uniform.

        Arguments:
            package (frictionless.Package): package to analyze
            source (Dict[str, str]): name of the source to use when several are available.
            **kwargs: keyword arguments for frequency reading methods.
        """

        self.p = package

        if source:
            self.source.update(source)

        self._read_aggregate_frequencies("forms", *args, **kwargs)
        self._read_aggregate_frequencies("lexemes", *args, **kwargs)
        self._read_aggregate_frequencies("cells", *args, **kwargs)


    def _read_aggregate_frequencies(self, name, force_uniform=False):
        """
        Recover frequency information for forms, cells or lexemes.

        The resulting table is stored as the attribute `name` of this object
        (`self.forms`, `self.lexemes` or `self.cells`) and the selected source
        is recorded in `self.source`. Nothing is returned.

        Arguments:
            name(str): Frequency table to build. Either forms, cells or lexemes.
            force_uniform (bool): Whether to replace everywhere real frequencies
                by empty uniform distributions. Defaults to False

        Raises:
            ValueError: if the frequencies table has to be used for forms
                but has no `form` column.
        """
        if self.p.has_resource(name):
            # The identifier column is the singular table name + "_id" (e.g. form_id).
            table = px.read_table(name, self.p).set_index(name[:-1] + "_id")
        else: # This can be name="lexemes" or "cells" -- the forms table can't not be there !
            log.warning(f"Table for {name} couldn't be found in this dataset.")
            # Placeholder table indexed by the lexemes/cells found in the forms table.
            # Both columns are overwritten below, since force_uniform is set to True.
            table = pd.DataFrame({"source":pd.NA,
                                  "value":"empty"},
                                 index=self.forms.loc[:, name[:-1]].unique())
            force_uniform = True

        # There are 4 different situations:
        # 1. Reading frequencies from the given table.
        if not force_uniform and "frequency" in table.columns:
            log.info(f'{name}: Frequencies in the table. Reading them.')
            table['source'] = f'{name}_table'
            table.rename({"frequency": "value"}, axis=1, inplace=True)
            self.source[name] = name + '_table'

        # 2. For forms, try to read from the frequencies table.
        elif not force_uniform and name == 'forms' and self.p.has_resource("frequencies"):
            log.info('No frequencies in the paradigms table, looking for a frequency table.')
            freq = px.read_table('frequencies', self.p, index_col='freq_id',
                                 usecols=['freq_id', 'form', 'value', 'source'])

            freq_col = freq.columns
            if "form" not in freq_col:
                raise ValueError("No form column in the frequency table."
                                 "I can't build frequency information for forms."
                                 "You should probably pass uniform=True "
                                 "or report this issue.")

            if "source" not in freq_col:
                freq['source'] = 'frequencies_table'
                self.source['forms'] = 'frequencies_table'
            elif self.source['forms'] is None:
                # No source was requested: default to the first one found in the table.
                self.source['forms'] = list(freq['source'].unique())[0]
                log.info(f"No default source provided for frequencies. Using {self.source['forms']}")

            # Index the frequencies by their `form` column, so that they can be
            # matched against the index of the forms table (form_id).
            # NOTE(review): rows are not filtered on the selected source here;
            # presumably each form has a single row in this table -- confirm.
            freq.set_index('form', inplace=True)

            missing_idx = ~table.index.isin(freq.index)
            if missing_idx.any():
                log.warning(f"The frequencies table does not contain "
                            f"a row for every form_id row."
                            f"Missing:\n{table.loc[missing_idx].head()}")

            table.loc[:, ['value', 'source']] = freq.loc[:, ['value', 'source']]

        # 3. For cells and lexemes build from the forms table.
        # TODO read directly from the frequencies table if possible
        elif not force_uniform and name != 'forms' and (self.has_frequencies('forms')):
            log.info(f'{name}: No frequencies in the {name} table, building from the forms table.')
            # Sum the forms frequencies for each lexeme (or cell).
            freq = self.forms.groupby(name[:-1]).value.sum()
            table.loc[freq.index, "value"] = freq.values
            table['source'] = 'forms_table'
            self.source[name] = 'forms_table'

        # 4. Building a fake uniform frequency distribution.
        else:
            if not force_uniform:
                log.warning(f"Frequency information for {name} couldn't be found "
                            "in this dataset.")
            log.info(f'{name}: Building empty frequencies.')
            # Empty values everywhere: no measure is available for any row.
            table['source'] = 'empty'
            table['value'] = pd.NA
            self.source[name] = 'empty'

        if name == "forms":
            # Check for duplicate overabundant phon_forms and sum the frequencies.
            # This handles cases where the orth_form is different and has two records.
            # Paradigms should be read only once, and this code shouldn't be redundant with
            # the main script. This should be fixed elsewhere. TODO
            table['form'] = table.phon_form
            dup = table.duplicated(subset=self.col_names, keep=False)
            if dup.any():
                # Give each duplicated row the total of its group, then keep
                # only the first row of each group.
                table.loc[dup, 'value'] = \
                    table.loc[dup].groupby(self.col_names).value.transform('sum')
                table.drop_duplicates(subset=self.col_names, inplace=True)
            cols = ['cell', 'lexeme', 'value', 'source']

        else:
            cols = ['value', 'source']

        # We save the resulting table
        table.sort_index(inplace=True)
        # The index is renamed to the singular table name (form, lexeme or cell)
        # and its values are converted to strings.
        table.index.name = name[:-1]
        table.index = table.index.astype('str')
        setattr(self, name, table[cols])


[docs]
    def drop_unused(self, paradigms):
        """
        If the paradigms table implied some sampling / filtering,
        make sure that the frequencies are also sampled.
        """

        self.forms = self.forms[self.forms.index.astype(str).isin(paradigms.index)]

        # TODO it would be nice to recompute the lexeme/cell frequencies
        # based on the forms that we kept.


[docs]
    def get_absolute_freq(self, mean=False, group_on=False, skipna=False, **kwargs):
        """
        Return the frequency of an item for a given source

        The frequency of an item is defined as the sum of the frequencies of this item
        across all rows.

        Examples:

            >>> p = fl.Package('tests/data/TestPackage/test.package.json')
            >>> f = Frequencies(p)
            >>> f.get_absolute_freq(filters={'lexeme':'q'}, group_on="index", skipna=True)
            form
            11    12.0
            12     6.0
            14    20.0
            18     NaN
            23    20.0
            Name: value, dtype: float64
            >>> float(f.get_absolute_freq(filters={'lexeme':'q'}))
            nan
            >>> float(f.get_absolute_freq(filters={'cell':'third'}, mean=True, skipna=True))
            20.0
            >>> f.get_absolute_freq(group_on=['lexeme'])
            lexeme
            k    203.0
            p      NaN
            q      NaN
            s     63.0
            Name: value, dtype: float64

        Todo:
            Replace if mean/else by an aggfunc parameter, once skipna will be supported
                by pandas functions.

        Arguments:
            group_on (List[str]): columns for which absolute frequencies should be computed.
                If `False`, aggregates across all records.
            mean (bool): Defaults to False. If True, returns a mean instead of a sum.
            skipna(bool): Defaults to False. Skip `nan` values for sums or means.

        Returns:
            `pandas.Series`: a Series which contains the output values.
                The index is either the original one, or the grouping columns.
        """

        # Filter using keys from mapping dict
        sublist = self._filter_frequencies(**kwargs)

        if group_on == "index":
            return sublist.value
        elif group_on is False:
            groups = [True] * len(sublist)
        else:
            groups = group_on

        if mean:
            def func(x): return x.mean(skipna=skipna)
        else:
            def func(x): return x.sum(skipna=skipna)

        result = sublist.groupby(by=groups, group_keys=False).value.apply(func)

        if group_on is False:
            return result.iloc[0]
        else:
            return result



[docs]
    def get_relative_freq(self, group_on=False, uniform_duplicates=False, **kwargs):
        """
        Returns the relative frequencies of a set of rows according to a set of grouping columns.
        If any of the values is empty, we generate a Uniform distribution for this group.

        Note:
            To avoid long computations, we use C implementations.
            Unfortunately, `skipna` is not yet implemented in `GroupBy.sum`. For this reason,
            we use a more complex pipeline of C functions.

        Todo:
            Replace the pipeline by a much simpler .transform(sum, skipna=False), once possible.

        Examples:

            >>> p = fl.Package('tests/data/TestPackage/test.package.json')
            >>> f = Frequencies(p)
            >>> f.get_relative_freq(filters={'lexeme': 'p', 'cell':'first'}, group_on=["lexeme"])['result'].values
            array([0.05882353, 0.94117647])
            >>> f.get_relative_freq(filters={'lexeme': 's', 'cell':'second'}, group_on=["lexeme"])['result'].values
            array([0., 1.])
            >>> f.get_relative_freq(filters={'cell':"third"}, group_on=["cell"])['result'].values
            array([0.25, 0.25, 0.25, 0.25])
            >>> f.get_relative_freq(filters={'lexeme':'p'}, group_on=["lexeme", "cell"])['result'].values
            array([0.05882353, 0.94117647, 1.        , 1.        , 1.        ])
            >>> f.get_relative_freq(filters={'lexeme':'s', 'cell': 'first'}, group_on=["lexeme", "cell"]).result.values
            array([0.33333333, 0.33333333, 0.33333333])

        Arguments:
            group_on (List[str]): column on which relative frequencies should be computed
            uniform_duplicates (bool): Whether to give a uniform weight to duplicate items
                or a relative weight based on tokens.

        Returns:
            `pandas.DataFrame`: a DataFrame which contains a `result` column with the output value.
                The index is the original one. The grouping columns are also provided.
        """

        # Filter using keys from mapping dict
        sublist = self._filter_frequencies(**kwargs)

        if group_on is False:
            groups = [True] * len(sublist)
            col_names = list()
        else:
            groups = group_on
            col_names = list(group_on)

        # 1. We first get the nb of items in each group
        sublist['result'] = sublist\
            .groupby(groups, sort=False).value\
            .transform("size")

        sublist.result = sublist.result.astype('float64')

        # 2. If there are any NaN values, we give a uniform frequency to the group
        any_nan = sublist.groupby(groups).value.transform(lambda x: x.isna().any())

        # 3. If a whole group contains zeros, we give a uniform frequency to the group
        all_zero = sublist.groupby(groups).value.transform(lambda x: (x == 0).all())

        # Apply 2 and 3 or apply everywhere if uniform_duplicates
        selector = (sublist.result != 1) & (any_nan | all_zero | uniform_duplicates)
        sublist.loc[selector, 'result'] = 1/sublist.loc[selector, 'result']

        # 4. If all values are filled and if the group is bigger than one, we sum the frequencies
        selector = sublist.result > 1

        if selector.any():
            if group_on is False:
                groups = selector

            sublist.loc[selector, 'result'] = sublist.loc[selector, 'value']/sublist.loc[selector]\
                .groupby(groups, sort=False).value.transform('sum')

        return sublist[col_names + ["result"]]


    def _filter_frequencies(self, data="forms", source=None, filters={}, inplace=False):
        """Filters the dataframe based on a set of filters
        provided as a dictionary.

        Arguments:
            filters (dict): a mapping of the following kind `{"lexeme": value,
                "cell": value, "form": value}`.
            data(str): name of one of the three tables (forms, lexemes, cells)
            source (str): the name of the source to use. If nothing is provided,
                the default source is selected.
            inplace (bool): whether the filter should operate in place or not. Defaults to False.
        """
        missing = set(filters.keys())-set(self.col_names)
        if missing:
            log.warning("You passed some column names that don't exist. They will be ignored: %s",
                        ", ".join(missing))

        def _listify(x):
            """Ensure that passed values of mapping are list-like objects"""
            if isinstance(x, str):
                x = [x]
            else:
                try:
                    iter(x)
                except TypeError:
                    x = [x]
                else:
                    x = list(x)
            return x

        mapping = {k: _listify(v) for k, v in filters.items() if v is not None and k not in missing}

        if source is None:
            source = self.source[data]
        if source is not False:
            mapping["source"] = [source]

        freq = getattr(self, data).copy()
        idx_name = freq.index.name
        freq.reset_index(inplace=True)

        def _selector(mapping):
            """Avoid repetition of this complex line"""
            if mapping:
                return freq.loc[freq[list(mapping)].isin(mapping).all(axis=1)]\
                    .copy().set_index(idx_name)
            return freq.set_index(idx_name)

        if inplace:
            setattr(self, data, _selector(mapping))
        else:
            return _selector(mapping)


[docs]
    def has_frequencies(self, table="forms"):
        """
        Returns True if the requested contains real frequencies.

        Parameters:
            table (str): name of the table to test.
        """

        return self.source[table] != "empty"



[docs]
    def info(self):
        """Returns a convenient DataFrame with summary statistics.

        Returns:
            `pandas.DataFrame`: A summary of statistics about this Frequencies handler.
        """
        metrics = []
        for i in ['forms', 'lexemes', 'cells']:
            data = getattr(self, i)
            metrics.append([i, self.source[i], len(data),
                            data.value.sum(), data.value.mean()])
        return pd.DataFrame(metrics, columns=['Table', 'Source', 'Records', 'Sum(f)', 'Mean(f)'])\
            .set_index('Table')




if __name__ == "__main__":
    # Run the examples embedded in the docstrings when executed as a script.
    from doctest import testmod
    testmod()