# -*- coding: utf-8 -*-
# !/usr/bin/env python
"""author: Jules Bouton.
Class for frequency management.
"""
import pandas as pd
from tqdm import tqdm
import paralex as px
import frictionless as fl
import logging
# Register tqdm's pandas integration (progress-bar variants of the pandas apply methods).
tqdm.pandas()
# Logger shared by this module, registered under the "Qumin" name.
log = logging.getLogger("Qumin")
class Frequencies(object):
    """Frequency management for a Paralex dataset.

    Frequencies are built for forms, lexemes and cells.

    The parsed frequency columns or tables should conform to the Paralex principles:

    - An empty value means that there is no measure available
    - A zero value means that there is a measure, which is zero

    When aggregating across rows, any empty cell yields a uniform distribution
    for the whole set of rows, whereas zeros are taken into account. This behaviour
    can be disabled for some functions by passing skipna=True.

    Examples:
        >>> p = fl.Package('tests/data/TestPackage/test.package.json')
        >>> f = Frequencies(p)
        >>> print(f.info().to_markdown())
        | Table   | Source      |   Records |   Sum(f) |   Mean(f) |
        |:--------|:------------|----------:|---------:|----------:|
        | forms   | forms_table |        22 |      519 |   27.3158 |
        | lexemes | forms_table |         4 |      519 |  129.75   |
        | cells   | forms_table |         4 |      519 |  129.75   |

    Attributes:
        p (frictionless.Package): package to analyze
        col_names (List[str]): names of the columns that can be used as filters.
        source (Dict[str, str]): source used by default for each table.
            Contains either a value for the source field of a Paralex frequency table,
            or the name of the table used to extract the frequency.
            The class-level dict only holds the defaults; each instance owns a copy.
        forms (:class:`pandas:pandas.DataFrame`):
            Table of frequency values associated to a form_id.
        lexemes (:class:`pandas:pandas.DataFrame`):
            Table of frequency values associated to a lexeme_id.
        cells (:class:`pandas:pandas.DataFrame`):
            Table of frequency values associated to a cell_id.
    """

    p = None
    col_names = ["lexeme", "cell", "form"]
    # Default (unset) sources. This dict is a template: __init__ copies it so that
    # instances never write into the shared class attribute.
    source = {"cells": None,
              "lexemes": None,
              "forms": None}

    def __init__(self, package, *args, source=False, **kwargs):
        """Constructor for Frequencies.

        We gather and store frequencies for forms, lexemes and cells.
        Behaviour is the following:

        - If `force_uniform` is `True`, we build empty frequencies (which are
          later interpreted as a Uniform distribution).
        - If not, we try to get a frequency column from the tables: forms, lexemes, cells
        - If any of those is missing, we use the frequencies table (forms) or
          aggregate the forms frequencies (lexemes, cells).
        - If none of this is possible, we fall back to empty frequencies.

        Arguments:
            package (frictionless.Package): package to analyze
            *args: positional arguments forwarded to the frequency reading method.
            source (Dict[str, str]): name of the source to use when several are available.
                Any falsy value keeps the defaults.
            **kwargs: keyword arguments for the frequency reading method
                (e.g. ``force_uniform``).
        """
        self.p = package
        # Work on a per-instance copy: the reading methods write into this dict,
        # and mutating the class attribute would leak sources between instances.
        self.source = dict(self.source)
        if source:
            self.source.update(source)

        # Forms must be read first: lexemes and cells may be derived from them.
        for name in ("forms", "lexemes", "cells"):
            self._read_aggregate_frequencies(name, *args, **kwargs)

    def _read_aggregate_frequencies(self, name, force_uniform=False):
        """Recover frequency information for forms, cells or lexemes.

        The resulting table is stored as the attribute `name` of this object
        and the source which was used is recorded in ``self.source[name]``.

        Arguments:
            name (str): Frequency table to build. Either forms, cells or lexemes.
            force_uniform (bool): Whether to replace everywhere real frequencies
                by empty uniform distributions. Defaults to False

        Raises:
            ValueError: if the frequencies table has to be used for forms
                but has no `form` column.
        """
        singular = name[:-1]  # "forms" -> "form", also the name of the id column prefix

        if self.p.has_resource(name):
            table = px.read_table(name, self.p).set_index(singular + "_id")
        else:  # This can be name="lexemes" or "cells" -- the forms table can't not be there !
            log.warning(f"Table for {name} couldn't be found in this dataset.")
            # Placeholder columns only: force_uniform sends us to case 4 below,
            # which overwrites both of them.
            table = pd.DataFrame({"source": "empty",
                                  "value": pd.NA},
                                 index=self.forms.loc[:, singular].unique())
            force_uniform = True

        # There are 4 different situations:

        # 1. Reading frequencies from the given table.
        if not force_uniform and "frequency" in table.columns:
            log.info(f'{name}: Frequencies in the table. Reading them.')
            table['source'] = f'{name}_table'
            table.rename({"frequency": "value"}, axis=1, inplace=True)
            self.source[name] = name + '_table'

        # 2. For forms, try to read from the frequencies table.
        elif not force_uniform and name == 'forms' and self.p.has_resource("frequencies"):
            log.info('No frequencies in the paradigms table, looking for a frequency table.')
            freq = px.read_table('frequencies', self.p, index_col='freq_id',
                                 usecols=['freq_id', 'form', 'value', 'source'])
            freq_col = freq.columns
            if "form" not in freq_col:
                raise ValueError("No form column in the frequency table. "
                                 "I can't build frequency information for forms. "
                                 "You should probably pass force_uniform=True "
                                 "or report this issue.")
            if "source" not in freq_col:
                freq['source'] = 'frequencies_table'
                self.source['forms'] = 'frequencies_table'
            elif self.source['forms'] is None:
                self.source['forms'] = list(freq['source'].unique())[0]
                log.info(f"No default source provided for frequencies. Using {self.source['forms']}")

            # We use the form_id column to match both dataframes
            freq.set_index('form', inplace=True)
            missing_idx = ~table.index.isin(freq.index)
            if missing_idx.any():
                log.warning(f"The frequencies table does not contain "
                            f"a row for every form_id row. "
                            f"Missing:\n{table.loc[missing_idx].head()}")
            # NOTE(review): this assumes that every form of the frequencies table exists
            # in the forms table and appears only once -- confirm for multi-source tables.
            table.loc[freq.index, ['value', 'source']] = freq[['value', 'source']]

        # 3. For cells and lexemes build from the forms table.
        # TODO read directly from the frequencies table if possible
        elif not force_uniform and name != 'forms' and self.has_frequencies('forms'):
            log.info(f'{name}: No frequencies in the {name} table, building from the forms table.')
            freq = self.forms.groupby(singular).value.sum()
            table.loc[freq.index, "value"] = freq.values
            table['source'] = 'forms_table'
            self.source[name] = 'forms_table'

        # 4. Building a fake uniform frequency distribution.
        else:
            if not force_uniform:
                log.warning(f"Frequency information for {name} couldn't be found "
                            "in this dataset.")
            log.info(f'{name}: Building empty frequencies.')
            table['source'] = 'empty'
            table['value'] = pd.NA
            self.source[name] = 'empty'

        if name == "forms":
            # Check for duplicate overabundant phon_forms and sum the frequencies.
            # This handles cases where the orth_form is different and has two records.
            # Paradigms should be read only once, and this code shouldn't be redundant with
            # the main script. This should be fixed elsewhere. TODO
            table['form'] = table.phon_form
            dup = table.duplicated(subset=self.col_names, keep=False)
            if dup.any():
                # NOTE(review): 'sum' skips empty values, so duplicates which are all
                # empty end up with 0 instead of staying empty -- confirm this is intended.
                table.loc[dup, 'value'] = \
                    table.loc[dup].groupby(self.col_names).value.transform('sum')
                table.drop_duplicates(subset=self.col_names, inplace=True)
            cols = ['cell', 'lexeme', 'value', 'source']
        else:
            cols = ['value', 'source']

        # We save the resulting table
        table.sort_index(inplace=True)
        table.index.name = singular
        table.index = table.index.astype('str')
        setattr(self, name, table[cols])

    def drop_unused(self, paradigms):
        """
        If the paradigms table implied some sampling / filtering,
        make sure that the frequencies are also sampled.

        Arguments:
            paradigms (pandas.DataFrame): paradigms table. Only the forms whose
                identifier is found in its index are kept.
        """
        kept = self.forms.index.astype(str).isin(paradigms.index)
        self.forms = self.forms[kept]
        # TODO it would be nice to recompute the lexeme/cell frequencies
        # based on the forms that we kept.

    def get_absolute_freq(self, mean=False, group_on=False, skipna=False, **kwargs):
        """
        Return the frequency of an item for a given source

        The frequency of an item is defined as the sum of the frequencies of this item
        across all rows.

        Examples:
            >>> p = fl.Package('tests/data/TestPackage/test.package.json')
            >>> f = Frequencies(p)
            >>> f.get_absolute_freq(filters={'lexeme':'q'}, group_on="index", skipna=True)
            form
            11    12.0
            12     6.0
            14    20.0
            18     NaN
            23    20.0
            Name: value, dtype: float64
            >>> float(f.get_absolute_freq(filters={'lexeme':'q'}))
            nan
            >>> float(f.get_absolute_freq(filters={'cell':'third'}, mean=True, skipna=True))
            20.0
            >>> f.get_absolute_freq(group_on=['lexeme'])
            lexeme
            k    203.0
            p      NaN
            q      NaN
            s     63.0
            Name: value, dtype: float64

        Todo:
            Replace if mean/else by an aggfunc parameter, once skipna will be supported
            by pandas functions.

        Arguments:
            group_on (List[str]): columns for which absolute frequencies should be computed.
                If `False`, aggregates across all records.
                If ``"index"``, the filtered values are returned without aggregation.
            mean (bool): Defaults to False. If True, returns a mean instead of a sum.
            skipna(bool): Defaults to False. Skip `nan` values for sums or means.
            **kwargs: passed to :meth:`_filter_frequencies` (``data``, ``source``, ``filters``).

        Returns:
            `pandas.Series`: a Series which contains the output values.
            The index is either the original one, or the grouping columns.
            When `group_on` is `False`, a single scalar is returned.
        """
        # Filter using keys from mapping dict
        sublist = self._filter_frequencies(**kwargs)

        if isinstance(group_on, str) and group_on == "index":
            return sublist.value

        def func(x):
            return x.mean(skipna=skipna) if mean else x.sum(skipna=skipna)

        if group_on is False:
            # Aggregating the whole column directly is equivalent to grouping on a
            # constant key, and it also works when the selection is empty.
            return func(sublist.value)

        return sublist.groupby(by=group_on, group_keys=False).value.apply(func)

    def get_relative_freq(self, group_on=False, uniform_duplicates=False, **kwargs):
        """
        Returns the relative frequencies of a set of rows according to a set of grouping columns.

        If any of the values is empty, we generate a Uniform distribution for this group.

        Note:
            To avoid long computations, we use C implementations.
            Unfortunately, `skipna` is not yet implemented in `GroupBy.sum`. For this reason,
            we use a more complex pipeline of C functions.

        Todo:
            Replace the pipeline by a much simpler .transform(sum, skipna=False), once possible.

        Examples:
            >>> p = fl.Package('tests/data/TestPackage/test.package.json')
            >>> f = Frequencies(p)
            >>> f.get_relative_freq(filters={'lexeme': 'p', 'cell':'first'}, group_on=["lexeme"])['result'].values
            array([0.05882353, 0.94117647])
            >>> f.get_relative_freq(filters={'lexeme': 's', 'cell':'second'}, group_on=["lexeme"])['result'].values
            array([0., 1.])
            >>> f.get_relative_freq(filters={'cell':"third"}, group_on=["cell"])['result'].values
            array([0.25, 0.25, 0.25, 0.25])
            >>> f.get_relative_freq(filters={'lexeme':'p'}, group_on=["lexeme", "cell"])['result'].values
            array([0.05882353, 0.94117647, 1.        , 1.        , 1.        ])
            >>> f.get_relative_freq(filters={'lexeme':'s', 'cell': 'first'}, group_on=["lexeme", "cell"]).result.values
            array([0.33333333, 0.33333333, 0.33333333])

        Arguments:
            group_on (List[str]): column on which relative frequencies should be computed.
                A single column name is also accepted. If `False`, all the
                selected rows form a single group.
            uniform_duplicates (bool): Whether to give a uniform weight to duplicate items
                or a relative weight based on tokens.
            **kwargs: passed to :meth:`_filter_frequencies` (``data``, ``source``, ``filters``).

        Returns:
            `pandas.DataFrame`: a DataFrame which contains a `result` column with the output value.
            The index is the original one. The grouping columns are also provided.
        """
        # Filter using keys from mapping dict
        sublist = self._filter_frequencies(**kwargs)

        if isinstance(group_on, str):
            # list("lexeme") would split the name into characters.
            group_on = [group_on]

        if group_on is False:
            # A constant key puts every row in the same group.
            groups = [True] * len(sublist)
            col_names = []
        else:
            groups = group_on
            col_names = list(group_on)

        if sublist.empty:
            # Nothing to normalise; grouping an empty selection is not reliable.
            sublist['result'] = pd.Series(index=sublist.index, dtype='float64')
            return sublist[col_names + ["result"]]

        # 1. We first get the nb of items in each group
        sublist['result'] = sublist\
            .groupby(groups, sort=False).value\
            .transform("size")
        sublist.result = sublist.result.astype('float64')

        # 2. If there are any NaN values, we give a uniform frequency to the group
        any_nan = sublist.groupby(groups).value.transform(lambda x: x.isna().any())

        # 3. If a whole group contains zeros, we give a uniform frequency to the group
        all_zero = sublist.groupby(groups).value.transform(lambda x: (x == 0).all())

        # Apply 2 and 3 or apply everywhere if uniform_duplicates.
        # Groups of size one are skipped: their relative frequency is already 1.
        selector = (sublist.result != 1) & (any_nan | all_zero | uniform_duplicates)
        sublist.loc[selector, 'result'] = 1/sublist.loc[selector, 'result']

        # 4. If all values are filled and if the group is bigger than one, we sum the frequencies.
        # Rows which still hold a group size (> 1) were not handled above.
        selector = sublist.result > 1
        if selector.any():
            if group_on is False:
                # The constant list key has the length of the full selection;
                # the selector itself is aligned on the index of the subset.
                groups = selector
            sublist.loc[selector, 'result'] = sublist.loc[selector, 'value']/sublist.loc[selector]\
                .groupby(groups, sort=False).value.transform('sum')

        return sublist[col_names + ["result"]]

    def _filter_frequencies(self, data="forms", source=None, filters=None, inplace=False):
        """Filters the dataframe based on a set of filters
        provided as a dictionary.

        Arguments:
            data(str): name of one of the three tables (forms, lexemes, cells)
            source (str): the name of the source to use. If nothing is provided,
                the default source is selected. Pass `False` to disable
                the filter on sources.
            filters (dict): a mapping of the following kind `{"lexeme": value,
                "cell": value, "form": value}`. Values can be scalars or iterables.
                Keys which are not in `col_names` are ignored with a warning,
                and so are `None` values. Defaults to no filter.
            inplace (bool): whether the filter should operate in place or not. Defaults to False.

        Returns:
            `pandas.DataFrame`: a filtered copy of the table, or `None` if `inplace` is True.
        """
        if filters is None:
            filters = {}

        missing = set(filters) - set(self.col_names)
        if missing:
            log.warning("You passed some column names that don't exist. They will be ignored: %s",
                        ", ".join(sorted(map(str, missing))))

        def _listify(x):
            """Ensure that passed values of mapping are list-like objects"""
            if isinstance(x, str):
                return [x]
            try:
                return list(x)
            except TypeError:
                # Not iterable: a single scalar value.
                return [x]

        mapping = {k: _listify(v) for k, v in filters.items()
                   if v is not None and k not in missing}

        if source is None:
            source = self.source[data]
        if source is not False:
            mapping["source"] = [source]

        table = getattr(self, data)
        idx_name = table.index.name
        # reset_index returns a new frame and exposes the index as a regular column,
        # which allows filtering on it like on any other column.
        freq = table.reset_index()

        if mapping:
            keep = freq[list(mapping)].isin(mapping).all(axis=1)
            selected = freq.loc[keep].copy().set_index(idx_name)
        else:
            selected = freq.set_index(idx_name)

        if inplace:
            setattr(self, data, selected)
        else:
            return selected

    def has_frequencies(self, table="forms"):
        """
        Returns True if the requested table contains real frequencies.

        Parameters:
            table (str): name of the table to test.

        Returns:
            bool: False if the source recorded for this table is ``"empty"``.
        """
        return self.source[table] != "empty"

    def info(self):
        """Returns a convenient DataFrame with summary statistics.

        Returns:
            `pandas.DataFrame`: A summary of statistics about this Frequencies handler.
            One row per table, with its source, number of records, sum and mean.
        """
        metrics = []
        for name in ('forms', 'lexemes', 'cells'):
            data = getattr(self, name)
            metrics.append([name, self.source[name], len(data),
                            data.value.sum(), data.value.mean()])
        columns = ['Table', 'Source', 'Records', 'Sum(f)', 'Mean(f)']
        return pd.DataFrame(metrics, columns=columns).set_index('Table')
if __name__ == "__main__":
    # Run the docstring examples of this module when it is executed as a script.
    import doctest

    doctest.testmod()