# Source code for qumin.utils.config

from typing import Optional, List, Any
from enum import Enum
from omegaconf.omegaconf import SI
from hydra.core.config_store import ConfigStore
from dataclasses import dataclass, field


[docs] @dataclass(kw_only=True) class HeatmapDisplayConfig: """ Set to True/False to show or hide detailed information on the heatmap Arguments: n_pairs: Whether to display the number of pairs. freq_margins: Whether to display frequency margins on heatmaps. """ n_pairs: bool = True freq_margins: bool = True
@dataclass(kw_only=True)
class HeatmapConfig:
    """
    Configuration for drawing heatmaps.

    Arguments:
        label: Lexeme column to use as label
            (for microclass heatmap, eg. inflection_class).
        cmap: Colormap name.
        exhaustive_labels: By default, seaborn shows only some labels on the
            heatmap for readability. This forces seaborn to print all labels.
        dense: Use initials instead of full labels (only for entropy heatmap).
        annotate: Display values on the heatmap (only for entropy heatmap).
        order: Priority list for sorting features (for entropy heatmap),
            ex: [number, case]. If no features-values file is available,
            you can use the key `cells` to provide an ordered list of cells
            to display. Special value "autosort" sorts by cell similarity.
        cols: List of features to show in columns (for zones heatmap),
            ex: [Mode, Tense]. All other features will constitute rows.
        display: Options to show or hide additional details on heatmaps
            (see HeatmapDisplayConfig).
    """
    label: Optional[str] = None
    cmap: Optional[str] = None  # Colormap name
    exhaustive_labels: bool = False
    dense: bool = False
    annotate: bool = False
    # Typed as Any: the docstring allows a list, the `cells` key,
    # or the special string "autosort".
    order: Optional[Any] = None
    cols: Optional[Any] = None
    # default_factory gives each config instance its own nested object.
    display: HeatmapDisplayConfig = field(default_factory=HeatmapDisplayConfig)
[docs] @dataclass(kw_only=True) class TokenFreqConfig: """ Whether to use token frequencies for... Arguments: patterns: The probability of the patterns. predictors: The probability of the predictor classes and of the predictor forms. overabundant: The weighting of overabundant cellmates cells: The weighting of the measures across different cells. """ patterns: bool = False predictors: bool = False overabundant: bool = False cells: bool = False
@dataclass(kw_only=True)
class PredictabilityConfig:
    """
    Configuration for entropy calculations.

    Arguments:
        vis: Whether to create a heatmap of the metrics
            and of interpredictability zones.
        n: Compute entropy for prediction with n predictors.
        features: Feature column in the Lexeme table.
            Features will be considered known in conditional probabilities:
            P(X~Y|X,f1,f2...)
        importResults: Import previous entropy computation results.
            With any file, can be used to compute the entropy heatmap;
            with results for n-1 predictors, allows for acceleration of the
            entropy computation with n predictors.
        token_freq: Whether to use token frequencies (see TokenFreqConfig).
    """
    vis: bool = True
    # Mutable default, so it must be built per instance through a factory.
    n: List[int] = field(default_factory=lambda: [1])
    features: Optional[Any] = None
    importResults: Optional[str] = None
    token_freq: TokenFreqConfig = field(default_factory=TokenFreqConfig)
[docs] @dataclass(kw_only=True) class OverabundantPatternsConfig: """ Configuration for the processing of overabundant forms. Arguments: keep: Whether to keep overabundant entries freq: Whether to prioritize overabundant forms by frequency (fallback on file order) tags: Tags to prefer when dropping overabundance (fallback on freq) """ keep: bool = False freq: bool = True tags: Optional[Any] = None
class Kind(str, Enum):
    """
    Kind of algorithm for the patterns.

    Members are also ``str`` instances, so they compare equal to
    their plain string value.

    Members:
        phon: phonological distance
        edits: simple edit distance
    """
    phon = "phon"
    edits = "edits"
@dataclass(kw_only=True)
class PatternsConfig:
    """
    Configuration for the ``patterns`` action.

    Arguments:
        kind: Options are (see docs): phon, edits.
        defective: Whether to keep defective entries.
        gap_proportion: Proportion of the median score
            used to set the gap score.
        optim_mem: Attempt to use a little bit less memory.
        overabundant: Configuration for overabundance.
    """
    kind: Kind = Kind.phon
    defective: bool = False
    gap_proportion: float = .4
    optim_mem: bool = False
    # default_factory gives each config instance its own nested object.
    overabundant: OverabundantPatternsConfig = field(
        default_factory=OverabundantPatternsConfig)
[docs] @dataclass(kw_only=True) class LatticeConfig(): """ Configuration for the ``lattice`` action. Arguments: shorten: Drop redundant columns altogether. Useful for big contexts, but loses information. The lattice shape and stats will be the same. Avoid using with --html aoc: Only attribute and object concepts html: Export to html ctxt: Export as a context stat: Output stats about the lattice pdf: Export as pdf png: Export as png """ shorten: bool = False aoc: bool = False html: bool = False ctxt: bool = False stat: bool = False pdf: bool = True png: bool = False
class Actions(str, Enum):
    """
    Available actions. Each action triggers a different script.

    Members are also ``str`` instances, so they compare equal to
    their plain string value.

    .. versionchanged:: 3.2.0
        Actions ``H`` and ``ent_heatmap`` are replaced by
        ``pred`` and ``pred_heatmap``.
    """
    patterns = "patterns"
    pred = "pred"
    lattice = "lattice"
    macroclasses = "macroclasses"
    heatmap = "heatmap"
    pred_heatmap = "pred_heatmap"

    # TODO remove in 4.0.0.
    # Deprecated names, replaced by ``pred`` and ``pred_heatmap``.
    H = "H"
    ent_heatmap = "ent_heatmap"
@dataclass(kw_only=True)
class QuminConfig:
    """
    Top-level configuration.

    Arguments:
        action: Action, one of: patterns, pred (H is deprecated), lattice,
            macroclasses, heatmap, pred_heatmap (ent_heatmap is deprecated)
        data: Path to paralex.package.json paradigms, segments
        patterns: Path to pattern computation metadata.
            If null, will compute patterns.
        pos: Parts of speech to use (subset)
        lexemes: Lexemes to use (subset),
            path to a file with one lexeme id per row
        cells: Cells to use (subset)
        sample_lexemes: A number of lexemes to sample, for debug purposes.
        sample_cells: A number of cells to sample, for debug purposes.
            Samples by frequency if possible, otherwise randomly.
        force_random: Whether to force random sampling.
        seed: Random seed for reproducible random effects.
        force: Whether to overpass RAM usage security (2GB)
        cpus: Number of cpus to use for big computations. Defaults to 1.
            0 sets the number of available cpus to the maximum - 2.
            WARNING: cpus > 1 is unavailable for now in Windows and Mac.
        resegment: Whether to resegment phonological forms, i.e. ignore
            spaces in phon forms and re-compute phonemic segmentation.
        checkSegments: Whether to control if all forms contain
            licit segments.
        pats: Configuration for the ``patterns`` action.
        lattice: Configuration for the ``lattice`` action.
        heatmap: Configuration for the ``pred_heatmap`` action.
        pred: Configuration for the ``pred`` action.
        entropy: Deprecated namespace, interpolated from ``pred``.

    .. versionchanged:: 3.2.0
        Namespace ``entropy`` is replaced by ``pred``.
    """
    action: Actions = Actions.patterns
    # Required: no default. Legal after a defaulted field
    # only because the dataclass is kw_only.
    data: str
    patterns: Optional[str] = None
    pos: Optional[Any] = None
    lexemes: Optional[str] = None
    cells: Optional[Any] = None
    sample_lexemes: Optional[int] = None
    sample_cells: Optional[int] = None
    force_random: bool = False
    seed: int = 1
    force: bool = False
    cpus: int = 1
    resegment: bool = False
    checkSegments: bool = True

    # Per-action namespaces; factories give each instance its own object.
    pats: PatternsConfig = field(default_factory=PatternsConfig)
    lattice: LatticeConfig = field(default_factory=LatticeConfig)
    heatmap: HeatmapConfig = field(default_factory=HeatmapConfig)
    pred: PredictabilityConfig = field(default_factory=PredictabilityConfig)

    # TODO remove in 4.0.0.
    # Legacy namespace kept as an interpolation that points at ``pred``.
    entropy: PredictabilityConfig = SI("${oc.deprecated:pred}")
def register_config():
    """
    Register the QuminConfig class in Hydra's ConfigStore
    under the name 'qumin_core'.
    """
    cs = ConfigStore.instance()
    cs.store(name="qumin_core", node=QuminConfig)