# Source code for qumin.utils.config

from typing import Optional, List, Any
from enum import Enum
from omegaconf.omegaconf import SI
from hydra.core.config_store import ConfigStore
from dataclasses import dataclass, field



[docs]
@dataclass(kw_only=True)
class HeatmapDisplayConfig:
    """
    Toggles for the detailed information shown on heatmaps.

    Set an option to True/False to show or hide the corresponding
    information. Both options default to True.

    Arguments:
        n_pairs: Whether to display the number of pairs.
        freq_margins: Whether to display frequency margins on heatmaps.
    """
    n_pairs: bool = True
    freq_margins: bool = True




[docs]
@dataclass(kw_only=True)
class HeatmapConfig:
    """
    Configuration for heatmap drawing.

    Arguments:
        label: Lexeme column to use as label
            (for microclass heatmap, e.g. inflection_class).
        cmap: Colormap name.
        exhaustive_labels: By default, seaborn shows only some labels on
            the heatmap for readability.
            This forces seaborn to print all labels.
        dense: Use initials instead of full labels (only for entropy heatmap).
        annotate: Display values on the heatmap (only for entropy heatmap).
        order: Priority list for sorting features (for entropy heatmap),
            e.g. ``[number, case]``. If no features-values file is available,
            you can use the key `cells` to provide an ordered list of cells
            to display. The special value "autosort" sorts by cell similarity.
        cols: List of features to show in columns (for zones heatmap),
            e.g. ``[Mode, Tense]``. All other features will constitute rows.
        display: Options to switch on/off additional heatmaps.
    """
    label: Optional[str] = None
    cmap: Optional[str] = None                       # Colormap name
    exhaustive_labels: bool = False
    dense: bool = False
    annotate: bool = False
    order: Optional[Any] = None
    cols: Optional[Any] = None

    # default_factory gives each HeatmapConfig its own display sub-config.
    display: HeatmapDisplayConfig = field(default_factory=HeatmapDisplayConfig)




[docs]
@dataclass(kw_only=True)
class TokenFreqConfig:
    """
    Switches selecting where token frequencies are used.

    Each option states whether token frequencies are used for the
    quantity described below. All options default to False.

    Arguments:
        patterns: The probability of the patterns.
        predictors: The probability of the predictor classes and of the predictor forms.
        overabundant: The weighting of overabundant cellmates.
        cells: The weighting of the measures across different cells.
    """
    patterns: bool = False
    predictors: bool = False
    overabundant: bool = False
    cells: bool = False




[docs]
@dataclass(kw_only=True)
class PredictabilityConfig:
    """
    Configuration for entropy calculations.

    Arguments:
        vis: Whether to create a heatmap of the metrics and of interpredictability zones.
        n: Compute entropy for prediction from n predictors
            (given as a list of integers; defaults to ``[1]``).
        features: Feature column in the Lexeme table.
            Features will be considered known in conditional probabilities: P(X~Y|X,f1,f2...)
        importResults: Import previous entropy computation results.
            With any file, used to compute the entropy heatmap.
            With n-1 predictors, allows for acceleration of the nPreds entropy computation.
        token_freq: Whether to use token frequencies in the different
            parts of the computation (one switch per use).
    """
    vis: bool = True
    # A factory is required because a list default is mutable.
    n: List[int] = field(default_factory=lambda: [1])
    features: Optional[Any] = None
    importResults: Optional[str] = None
    token_freq: TokenFreqConfig = field(default_factory=TokenFreqConfig)




[docs]
@dataclass(kw_only=True)
class OverabundantPatternsConfig:
    """
    Configuration for the processing of overabundant forms.

    Arguments:
        keep: Whether to keep overabundant entries.
        freq: Whether to prioritize overabundant forms by frequency
            (fallback on file order).
        tags: Tags to prefer when dropping overabundance (fallback on freq).
    """
    keep: bool = False
    freq: bool = True
    tags: Optional[Any] = None




[docs]
class Kind(str, Enum):
    """
    Kind of algorithm for the patterns.

    Each member is also a ``str`` equal to its own name.

    Attributes:
        phon: phonological distance
        edits: simple edit distance
    """
    phon = "phon"
    edits = "edits"




[docs]
@dataclass(kw_only=True)
class PatternsConfig:
    """
    Settings for the ``patterns`` action.

    Arguments:
        kind: Pattern algorithm; options are (see docs): phon, edits.
        defective: Whether to keep defective entries.
        gap_proportion: Proportion of the median score used to set the gap score.
        optim_mem: Attempt to use a little bit less memory.
        overabundant: Configuration for overabundance.
    """
    kind: Kind = Kind.phon
    defective: bool = False
    gap_proportion: float = 0.4
    optim_mem: bool = False

    # Nested section: built through a factory so instances never share it.
    overabundant: OverabundantPatternsConfig = field(default_factory=OverabundantPatternsConfig)




[docs]
@dataclass(kw_only=True)
class LatticeConfig():
    """
    Configuration for the ``lattice`` action.

    Arguments:
        shorten: Drop redundant columns altogether.
            Useful for big contexts, but loses information.
            The lattice shape and stats will be the same.
            Avoid using with --html
        aoc: Only attribute and object concepts
        html: Export to html
        ctxt: Export as a context
        stat: Output stats about the lattice
        pdf: Export as pdf
        png: Export as png
    """
    shorten: bool = False
    aoc: bool = False
    html: bool = False
    ctxt: bool = False
    stat: bool = False
    pdf: bool = True
    png: bool = False




[docs]
class Actions(str, Enum):
    """
    Available actions. Each action triggers a different script.

    .. versionchanged:: 3.2.0
        Actions ``H`` and ``ent_heatmap`` are replaced by ``pred`` and ``pred_heatmap``.
    """
    patterns = "patterns"
    pred = "pred"
    lattice = "lattice"
    macroclasses = "macroclasses"
    heatmap = "heatmap"
    pred_heatmap = "pred_heatmap"

    # Deprecated names, replaced by ``pred`` and ``pred_heatmap`` in 3.2.0.
    # TODO remove in 4.0.0.
    H = "H"
    ent_heatmap = "ent_heatmap"




[docs]
@dataclass(kw_only=True)
class QuminConfig:
    """
    Main configuration schema for Qumin.

    Arguments:
        action: Action, one of: patterns, pred (H is deprecated), lattice,
            macroclasses, heatmap, pred_heatmap (ent_heatmap is deprecated).
        data: Path to paralex.package.json paradigms, segments.
            This is the only field without a default value.
        patterns: Path to pattern computation metadata. If null, will compute patterns.
        pos: Parts of speech to use (subset).
        lexemes: Lexemes to use (subset), path to a file with one lexeme id per row.
        cells: Cells to use (subset).
        sample_lexemes: A number of lexemes to sample, for debug purposes.
        sample_cells: A number of cells to sample, for debug purposes.
            Samples by frequency if possible, otherwise randomly.
        force_random: Whether to force random sampling.
        seed: Random seed for reproducible random effects.
        force: Whether to overpass RAM usage security (2GB).
        cpus: Number of cpus to use for big computations.
            Defaults to 1. 0 sets the number of available cpus to the maximum - 2.
            WARNING: cpus > 1 is unavailable for now in Windows and Mac.
        resegment: Whether to resegment phonological forms, i.e. ignore
            spaces in phon forms and re-compute phonemic segmentation.
        checkSegments: Whether to control if all forms contain licit segments.
        pats: Configuration for the ``patterns`` action.
        lattice: Configuration for the ``lattice`` action.
        heatmap: Configuration for the ``pred_heatmap`` action.
        pred: Configuration for the ``pred`` action.
        entropy: Deprecated name of ``pred``, kept until 4.0.0.

    .. versionchanged:: 3.2.0
        Namespace ``entropy`` is replaced by ``pred``.

    """
    action: Actions = Actions.patterns
    # No default: allowed after a defaulted field because the dataclass is kw_only.
    data: str
    patterns: Optional[str] = None
    pos: Optional[Any] = None
    lexemes: Optional[str] = None
    cells: Optional[Any] = None
    sample_lexemes: Optional[int] = None
    sample_cells: Optional[int] = None
    force_random: bool = False
    seed: int = 1
    force: bool = False
    cpus: int = 1

    resegment: bool = False
    checkSegments: bool = True

    # Per-action sub-configurations, each built through its own factory.
    pats: PatternsConfig = field(default_factory=PatternsConfig)
    lattice: LatticeConfig = field(default_factory=LatticeConfig)
    heatmap: HeatmapConfig = field(default_factory=HeatmapConfig)
    pred: PredictabilityConfig = field(default_factory=PredictabilityConfig)

    # Deprecated namespace: an OmegaConf interpolation that points to ``pred``
    # through the ``oc.deprecated`` resolver.
    # TODO remove in 4.0.0.
    entropy: PredictabilityConfig = SI("${oc.deprecated:pred}")




[docs]
def register_config():
    """
    Store the ``QuminConfig`` schema in the ``ConfigStore`` singleton
    under the name 'qumin_core'.
    """
    ConfigStore.instance().store(name="qumin_core", node=QuminConfig)