Source code for qumin.predictability
# -*- coding: utf-8 -*-
# !/usr/bin/python3
import numpy as np
import pandas as pd
[docs]
def P(x, weights=None, subset=None):
    """
    Return the probability distribution of the unique elements of a :class:`pandas.core.series.Series`.

    Without weights, every token of `x` counts equally, so the probability of a
    value is its relative frequency. With weights, the probability of a value is
    the sum of the weights of its tokens divided by the sum of all weights.

    Example:
        >>> P(pd.Series(["A", "B", "B"]))
        A    0.333333
        B    0.666667
        Name: proportion, dtype: float64
        >>> P(pd.Series(["A", "B", "B"]), weights=pd.Series([2, 1, 1]))
        A    0.5
        B    0.5
        dtype: float64

    Arguments:
        x (:class:`pandas.core.series.Series`): A series of data.
        weights (:class:`pandas.core.series.Series`): A series of weights.
        subset (Iterable): Only give the distribution for a subset of values.
            It is used as an indexer on `x` (and on `weights` when given).

    Returns:
        :class:`pandas.core.series.Series`: A Series whose index holds the unique
            elements of x and whose values are their probabilities in x.
    """
    # Narrow the inputs to the requested subset up front; afterwards the only
    # remaining decision is whether the distribution is weighted or not.
    if subset is not None:
        x = x[subset]
        if weights is not None:
            weights = weights[subset]

    if weights is None:
        return x.value_counts(normalize=True, sort=False)

    # Weighted case: share of the total weight carried by each value of x.
    return weights.groupby(x).sum() / weights.sum()
[docs]
def cond_P(A, B, subset=None):
    """Return the conditional probability distribution P(A|B) for elements in two :class:`pandas.core.series.Series`.

    The values of `A` are grouped by the values of `B` (groups kept in order of
    appearance), and the grouped object is handed to :func:`P`.

    Arguments:
        A (:class:`pandas.core.series.Series`): A series of data.
        B (:class:`pandas.core.series.Series`): A series of data.
        subset (Iterable): Only give the distribution for a subset of values.
            It is used as an indexer on both `A` and `B`.

    Return:
        :class:`pandas.core.series.Series`: A Series with two indexes.
            The first index is from the elements of B, the second from the elements of A.
            The values are the P(A|B).
    """
    if subset is not None:
        A, B = A[subset], B[subset]
    return P(A.groupby(B, sort=False))
[docs]
def cond_entropy(A, B, **kwargs):
    """Calculate the conditional entropy of A knowing B, two series of data points.

    Computed as H(A, B) - H(B). The joint events are built with `A + B`, which
    presupposes that values in both series are of the same type, typically tuples.

    Arguments:
        A (:class:`pandas.core.series.Series`): A series of data.
        B (:class:`pandas.core.series.Series`): A series of data.
        **kwargs: Forwarded unchanged to both calls to :func:`P`.

    Return:
        H(A|B)
    """
    joint_entropy = entropy(P(A + B, **kwargs))
    known_entropy = entropy(P(B, **kwargs))
    return joint_entropy - known_entropy
[docs]
def entropy(A):
    """Calculate the entropy (in bits) for a series of probabilities.

    Entries that are not strictly positive are dropped before taking the
    logarithm; null probabilities contribute nothing to the sum, so this does
    not affect the result of the computation.

    Arguments:
        A (:class:`pandas.core.series.Series`): A series of numeric values.

    Return:
        H(A)
    """
    positive = A[A > 0]
    return -(positive * np.log2(positive)).sum()
[docs]
def cond_entropy_slow(df, classes, subset=None):
    """
    Calculate the conditional entropy through a slower method (with iterations across all groups).

    For every group of rows sharing the same `classes` value, the entropy of the
    `pattern` column (weighted by `f_pair`) is multiplied by the group's total
    `f_pred`. The sum over groups is then divided by the total `f_pred` of `df`.

    Arguments:
        df (pandas.DataFrame): the patterns distribution. The columns `pattern`,
            `f_pair` and `f_pred` are read.
        classes (pandas.Series): the known features that are used to group the patterns.
        subset: accepted but not used by this function.

    Return:
        The conditional entropy, using token frequencies to weight the patterns.
    """
    def weighted_group_entropy(cell):
        pattern_probs = P(cell.pattern, weights=cell.f_pair)
        return entropy(pattern_probs) * cell.f_pred.sum()

    per_group = df.groupby(classes).apply(weighted_group_entropy, include_groups=False)
    # The leading "0 +" is kept from the original; presumably it turns a
    # negative zero into a plain 0.0.
    return 0 + per_group.sum() / df.f_pred.sum()
[docs]
def cond_psuccess(df, classes):
    """
    Calculate the conditional probability of success.

    Within every group of rows sharing the same `classes` value, each row gets a
    `psuccess` equal to the probability of its `pattern` in that group (weighted
    by `f_pair`). `f_pred` and `psuccess` are then summed per
    (`form_x`, `lexeme`) pair, and the result is the `f_pred`-weighted mean of
    `psuccess` over those pairs.

    Arguments:
        df (pandas.DataFrame): the patterns distribution. The columns `pattern`,
            `f_pair`, `f_pred`, `form_x` and `lexeme` are read.
        classes (pandas.Series): the known features that are used to group the patterns.

    Return:
        The probability of success, using token frequencies to weight the patterns.
    """
    def compute_group_psuccess(group):
        pattern_probs = P(group.pattern, weights=group.f_pair)
        # Build a new frame rather than assigning into `group`: pandas does not
        # support user functions that mutate the object handed to groupby.apply.
        return group.assign(psuccess=group.pattern.map(pattern_probs))

    per_target = (
        df.groupby(classes, as_index=False, group_keys=False)
        .apply(compute_group_psuccess, include_groups=False)
        .groupby(['form_x', 'lexeme'], observed=False)[['f_pred', 'psuccess']]
        .sum()
    )
    # The leading "0 +" is kept from the original; presumably it turns a
    # negative zero into a plain 0.0.
    return 0 + ((per_target.psuccess * per_target.f_pred) / per_target.f_pred.sum()).sum()