# Source code for qumin.predictability

# -*- coding: utf-8 -*-
# !/usr/bin/python3

import numpy as np
import pandas as pd


def P(x, weights=None, subset=None):
    """
    Compute the probability distribution over the unique values of `x`.

    Without weights, every token of `x` carries the same probability mass,
    so the probability of a value is its relative frequency. With weights,
    the probability of a value is the summed weight of its tokens divided
    by the total weight.

    Example:
        >>> P(pd.Series(["A", "B", "B"]))
        A    0.333333
        B    0.666667
        Name: proportion, dtype: float64
        >>> P(pd.Series(["A", "B", "B"]), weights=pd.Series([2, 1, 1]))
        A    0.5
        B    0.5
        dtype: float64

    Arguments:
        x (:class:`pandas.core.series.Series`): A series of data.
        weights (:class:`pandas.core.series.Series`): A series of weights,
            grouped by the values of `x`.
        subset (Iterable): Indexer applied to `x` (and to `weights`, when
            given) before computing the distribution, e.g. a boolean mask.

    Returns:
        :class:`pandas.core.series.Series`: A Series indexed by the unique
        elements of `x`, whose values are their probabilities.
    """
    # Restrict the data first so that a single code path handles both the
    # full and the subset cases.
    if subset is not None:
        x = x[subset]
        if weights is not None:
            weights = weights[subset]

    # Unweighted: plain relative frequencies, in unsorted order.
    if weights is None:
        return x.value_counts(normalize=True, sort=False)

    # Weighted: share of the total weight held by each value of x.
    return weights.groupby(x).sum() / weights.sum()
def cond_P(A, B, subset=None):
    """
    Compute the conditional probability distribution P(A|B) of two series.

    The values of `A` are grouped by the values of `B`, and the grouped
    object is handed to :func:`P`.

    Arguments:
        A (:class:`pandas.core.series.Series`): A series of data.
        B (:class:`pandas.core.series.Series`): A series of data.
        subset (Iterable): Indexer applied to both `A` and `B` before
            grouping, to restrict the distribution to a subset of values.

    Return:
        :class:`pandas.core.series.Series`: A Series with two index levels.
        The first level comes from the elements of B, the second from the
        elements of A. The values are the P(A|B).
    """
    # Narrow both series up front so the grouping below is written once.
    if subset is not None:
        A, B = A[subset], B[subset]

    # sort=False keeps the groups in their order of first appearance.
    return P(A.groupby(B, sort=False))
def cond_entropy(A, B, **kwargs):
    """
    Compute the conditional entropy of A knowing B, two series of data points.

    The result is obtained as H(A, B) - H(B). Joint events are built with
    the element-wise ``A + B``, so the values of both series are expected
    to be of the same type, typically tuples.

    Arguments:
        A (:class:`pandas.core.series.Series`): A series of data.
        B (:class:`pandas.core.series.Series`): A series of data.
        **kwargs: Forwarded unchanged to both calls to :func:`P`.

    Return:
        H(A|B)
    """
    joint_distribution = P(A + B, **kwargs)
    known_distribution = P(B, **kwargs)
    return entropy(joint_distribution) - entropy(known_distribution)
def entropy(A):
    """
    Compute the entropy, in bits, of a series of probabilities.

    Only strictly positive values are kept before taking the base-2
    logarithm; null probabilities contribute nothing to the sum, so
    dropping them does not change the result.

    Arguments:
        A (:class:`pandas.core.series.Series`): A series of numeric values.

    Return:
        H(A)
    """
    # Filter once: log2 is only evaluated on strictly positive values.
    positive = A[A > 0]
    return -(positive * np.log2(positive)).sum()
def cond_entropy_slow(df, classes, subset=None):
    """
    Compute the conditional entropy through a slower method
    (iterating across all groups).

    For each group of rows sharing the same value in `classes`, the entropy
    of the `pattern` distribution (weighted by `f_pair`) is multiplied by
    the group's summed `f_pred`. These per-group values are summed and
    divided by the summed `f_pred` of the whole table.

    Arguments:
        df (pandas.DataFrame): the patterns distribution. The columns
            `pattern`, `f_pair` and `f_pred` are read.
        classes (pandas.Series): the known features that are used to group
            the patterns.
        subset: Accepted but not used by this function.

    Return:
        The weighted average of the per-group entropies.
    """

    def weighted_group_entropy(group):
        # Entropy of the patterns inside one class, scaled by the
        # frequency mass of that class.
        pattern_distribution = P(group.pattern, weights=group.f_pair)
        return entropy(pattern_distribution) * group.f_pred.sum()

    per_group = df.groupby(classes).apply(weighted_group_entropy,
                                          include_groups=False)

    # Adding to 0 turns a possible negative zero into a plain zero.
    return 0 + per_group.sum() / df.f_pred.sum()
def cond_psuccess(df, classes):
    """
    Compute the conditional probability of success.

    Within each group of rows sharing the same value in `classes`, every
    row receives the probability of its own `pattern` in that group
    (weighted by `f_pair`). `f_pred` and these probabilities are then
    summed per (`form_x`, `lexeme`) pair, and the result is the average of
    the summed probabilities weighted by the summed `f_pred`.

    Arguments:
        df (pandas.DataFrame): the patterns distribution. The columns
            `pattern`, `f_pair`, `f_pred`, `form_x` and `lexeme` are read.
        classes (pandas.Series): the known features that are used to group
            the patterns.

    Return:
        The frequency-weighted probability of success.
    """

    def add_group_psuccess(group):
        # Look up, for every row, the probability of its own pattern
        # inside the current class.
        pattern_probabilities = P(group.pattern, weights=group.f_pair)
        group["psuccess"] = group.pattern.map(pattern_probabilities)
        return group

    by_class = df.groupby(classes, as_index=False, group_keys=False)
    scored = by_class.apply(add_group_psuccess, include_groups=False)

    # Sum frequencies and success probabilities for each predictor form.
    per_form = (
        scored
        .groupby(['form_x', 'lexeme'], observed=False)[['f_pred', 'psuccess']]
        .sum()
    )

    weighted_success = per_form.psuccess * per_form.f_pred
    # Adding to 0 turns a possible negative zero into a plain zero.
    return 0 + (weighted_success / per_form.f_pred.sum()).sum()