Source code for qumin.utils.config
from typing import Optional, List, Any
from enum import Enum
from omegaconf.omegaconf import SI
from hydra.core.config_store import ConfigStore
from dataclasses import dataclass, field
[docs]
@dataclass(kw_only=True)
class HeatmapDisplayConfig:
"""
Set to True/False to show or hide detailed information on the heatmap
Arguments:
n_pairs: Whether to display the number of pairs.
freq_margins: Whether to display frequency margins on heatmaps.
"""
n_pairs: bool = True
freq_margins: bool = True
[docs]
@dataclass(kw_only=True)
class HeatmapConfig:
    """
    Options controlling how heatmaps are drawn.

    Arguments:
        label: Lexeme column to use as label
            (for microclass heatmap, eg. inflection_class).
        cmap: Colormap name.
        exhaustive_labels: Force seaborn to print all labels. By default,
            seaborn shows only some labels on the heatmap for readability.
        dense: Use initials instead of full labels
            (only for entropy heatmap).
        annotate: Display values on the heatmap
            (only for entropy heatmap).
        order: Priority list for sorting features (for entropy heatmap),
            ex: [number, case]. If no features-values file is available,
            the key `cells` can be used to provide an ordered list of cells
            to display. The special value "autosort" sorts by cell
            similarity.
        cols: List of features to show in columns (for zones heatmap),
            ex: [Mode, Tense]. All other features will constitute rows.
        display: Options to switch on/off additional heatmaps.
    """

    label: Optional[str] = None
    cmap: Optional[str] = None
    exhaustive_labels: bool = False
    dense: bool = False
    annotate: bool = False
    order: Optional[Any] = None
    cols: Optional[Any] = None
    # A factory is used so that every instance owns its own display options.
    display: HeatmapDisplayConfig = field(
        default_factory=HeatmapDisplayConfig,
    )
[docs]
@dataclass(kw_only=True)
class TokenFreqConfig:
"""
Whether to use token frequencies for...
Arguments:
patterns: The probability of the patterns.
predictors: The probability of the predictor classes and of the predictor forms.
overabundant: The weighting of overabundant cellmates
cells: The weighting of the measures across different cells.
"""
patterns: bool = False
predictors: bool = False
overabundant: bool = False
cells: bool = False
[docs]
@dataclass(kw_only=True)
class PredictabilityConfig:
    """
    Configuration for entropy calculations.

    Arguments:
        vis: Whether to create a heatmap of the metrics and of
            interpredictability zones.
        n: Compute entropy for prediction with n predictors.
        features: Feature column in the Lexeme table.
            Features will be considered known in conditional
            probabilities: P(X~Y|X,f1,f2...)
        importResults: Import previous entropy computation results.
            With any file, used to compute the entropy heatmap;
            with n-1 predictors, allows for acceleration on nPreds
            entropy computation.
        token_freq: Whether to use token frequencies for the various
            computations.
    """

    vis: bool = True
    # Mutable default: built by a factory so instances never share the list.
    n: List[int] = field(default_factory=lambda: [1])
    features: Optional[Any] = None
    importResults: Optional[str] = None
    token_freq: TokenFreqConfig = field(
        default_factory=TokenFreqConfig,
    )
[docs]
@dataclass(kw_only=True)
class OverabundantPatternsConfig:
"""
Configuration for the processing of overabundant forms.
Arguments:
keep: Whether to keep overabundant entries
freq: Whether to prioritize overabundant forms by frequency (fallback on file order)
tags: Tags to prefer when dropping overabundance (fallback on freq)
"""
keep: bool = False
freq: bool = True
tags: Optional[Any] = None
[docs]
class Kind(str, Enum):
    """
    Algorithm kinds available for the patterns.

    Members are also ``str`` instances, equal to their own value.

    Arguments:
        phon: phonological distance
        edits: simple edit distance
    """

    phon = "phon"
    edits = "edits"
[docs]
@dataclass(kw_only=True)
class PatternsConfig:
    """
    Settings for the ``patterns`` action.

    Arguments:
        kind: Options are (see docs): phon, edits
        defective: Whether to keep defective entries.
        gap_proportion: Proportion of the median score used to set
            the gap score.
        optim_mem: Attempt to use a little bit less memory.
        overabundant: Configuration for overabundance.
    """

    kind: Kind = Kind.phon
    defective: bool = False
    gap_proportion: float = 0.4
    optim_mem: bool = False
    overabundant: OverabundantPatternsConfig = field(
        default_factory=OverabundantPatternsConfig,
    )
[docs]
@dataclass(kw_only=True)
class LatticeConfig():
"""
Configuration for the ``lattice`` action.
Arguments:
shorten: Drop redundant columns altogether.
Useful for big contexts, but loses information.
The lattice shape and stats will be the same.
Avoid using with --html
aoc: Only attribute and object concepts
html: Export to html
ctxt: Export as a context
stat: Output stats about the lattice
pdf: Export as pdf
png: Export as png
"""
shorten: bool = False
aoc: bool = False
html: bool = False
ctxt: bool = False
stat: bool = False
pdf: bool = True
png: bool = False
[docs]
class Actions(str, Enum):
    """
    Actions that can be requested. Each action triggers a different script.

    .. versionchanged:: 3.2.0
        Actions ``H`` and ``ent_heatmap`` are replaced by ``pred``
        and ``pred_heatmap``.
    """

    patterns = "patterns"
    pred = "pred"
    lattice = "lattice"
    macroclasses = "macroclasses"
    heatmap = "heatmap"
    pred_heatmap = "pred_heatmap"

    # Names replaced in 3.2.0, still accepted for now.
    # TODO remove in 4.0.0.
    H = "H"
    ent_heatmap = "ent_heatmap"
[docs]
@dataclass(kw_only=True)
class QuminConfig:
    """
    Top-level configuration.

    All fields are keyword-only; ``data`` is the only one without a default.

    Arguments:
        action: Action, one of: patterns, pred (H is deprecated), lattice,
            macroclasses, heatmap, pred_heatmap (ent_heatmap is deprecated)
        data: Path to paralex.package.json paradigms, segments
        patterns: Path to pattern computation metadata.
            If null, will compute patterns.
        pos: Parts of speech to use (subset)
        lexemes: Lexemes to use (subset), path to a file with one
            lexeme id per row
        cells: Cells to use (subset)
        sample_lexemes: A number of lexemes to sample, for debug purposes.
        sample_cells: A number of cells to sample, for debug purposes.
            Samples by frequency if possible, otherwise randomly.
        force_random: Whether to force random sampling.
        seed: Random seed for reproducible random effects.
        force: Whether to overpass RAM usage security (2GB)
        cpus: Number of cpus to use for big computations.
            Defaults to 1. 0 sets the number of available cpus
            to the maximum - 2.
            WARNING: cpus > 1 is unavailable for now in Windows and Mac.
        resegment: Whether to resegment phonological forms, i.e. ignore
            spaces in phon forms and re-compute phonemic segmentation.
        checkSegments: Whether to control if all forms contain
            licit segments.
        pats: Configuration for the ``patterns`` action.
        lattice: Configuration for the ``lattice`` action.
        heatmap: Configuration for the ``pred_heatmap`` action.
        pred: Configuration for the ``pred`` action.

    .. versionchanged:: 3.2.0
        Namespace ``entropy`` is replaced by ``pred``.
    """

    # What to run and on which data.
    action: Actions = Actions.patterns
    data: str
    patterns: Optional[str] = None

    # Subsetting and sampling of the input.
    pos: Optional[Any] = None
    lexemes: Optional[str] = None
    cells: Optional[Any] = None
    sample_lexemes: Optional[int] = None
    sample_cells: Optional[int] = None
    force_random: bool = False
    seed: int = 1

    # Resources and preprocessing.
    force: bool = False
    cpus: int = 1
    resegment: bool = False
    checkSegments: bool = True

    # Per-action sub-configurations, each built by its own factory.
    pats: PatternsConfig = field(default_factory=PatternsConfig)
    lattice: LatticeConfig = field(default_factory=LatticeConfig)
    heatmap: HeatmapConfig = field(default_factory=HeatmapConfig)
    pred: PredictabilityConfig = field(
        default_factory=PredictabilityConfig,
    )

    # Former name of the ``pred`` namespace, kept as an interpolation
    # towards ``pred`` through the ``oc.deprecated`` resolver.
    # TODO remove in 4.0.0.
    entropy: PredictabilityConfig = SI("${oc.deprecated:pred}")
[docs]
def register_config():
    """
    Register the ``QuminConfig`` class in the config store.

    The class is stored as a node under the name ``qumin_core``.
    """
    ConfigStore.instance().store(name="qumin_core", node=QuminConfig)