Source code for qumin.utils.metadata

# -*- coding: utf-8 -*-
# !/usr/bin/python3
import datetime
import logging
from pathlib import Path

import hydra
from frictionless import Package, Resource
from omegaconf import OmegaConf
from omegaconf.dictconfig import DictConfig

from .. import __version__
from ..representations.paradigms import Paradigms
from ..representations.patternstore import PatternStore

log = logging.getLogger()


def diff(msg, old, new):
    """Append a two-line old/current comparison to a message.

    Arguments:
        msg (str): the message to extend.
        old: value used by the previous run.
        new: value passed to the current run.

    Returns:
        str: ``msg`` followed by one indented bullet for each value.
    """
    bullets = (
        f"\n\t- Old parameter: {old}",
        f"\n\t- Current parameter: {new}",
    )
    return msg + "".join(bullets)
def get_recursively(cfg, key):
    """Look up a dotted key in a nested mapping-like configuration.

    Each component of ``key`` is resolved with ``.get()`` on the value
    obtained for the previous component.

    Arguments:
        cfg: a nested object exposing ``.get(key)`` at each level.
        key (str): dot-separated path, e.g. ``"pats.defective"``.

    Returns:
        The value found at the end of the path, or ``None`` if any level of
        the path is missing.
    """
    for part in key.split('.'):
        # A missing intermediate level yields None; stop here rather than
        # failing with AttributeError on ``None.get``.
        if cfg is None:
            return None
        cfg = cfg.get(part)
    return cfg
class Metadata:
    """Metadata manager for Qumin scripts. Wrapper around the Frictionless Package class.

    Basic usage :

    1. Register Metadata manager;
    2. Get an absolute path to the metadata folder;
    3. Write to that path;
    4. After writing a file, register it and set metadata (description, custom dict);
    5. Export the JSON descriptor.

    The Metadata class can easily be used in scripts that reuse Qumin results.
    In that case, one *must* pass a value to `rundir_path`, if hydra is not set
    (which is very likely if you write a simple script).

    Examples:

        .. code-block:: python

            import omegaconf
            cfg = omegaconf.dictconfig.DictConfig({"data": "myparalex/package.json"})
            md = Metadata(cfg=cfg, rundir_path="my_results/")
            name = 'path/myfile.txt'
            filename = md.get_path(name)
            # Open an IO stream and write to ``filename``.
            md.register_file(name, description="My nice file", custom={"property": "value"})
            md.save_metadata()

    Attributes:
        start (datetime) : timestamp at the beginning of the run
            (only set when a new run is created, not when importing).
        prefix (Path) : normalized prefix for the output files
        cfg (OmegaConf): all arguments passed to the python script
        package (frictionless.Package): descriptor of the results of the run.
        paralex (frictionless.Package): a frictionless Package representing a dataset
            (only set when a new run is created, not when importing).
    """

    def __init__(self, path=None, cfg=None, rundir_path=None):
        """
        Arguments:
            cfg (OmegaConf.dictconfig.DictConfig): arguments passed to the script.
            path (str): Path to a Frictionless descriptor of a previous run to be imported.
            rundir_path (str): Directory that should be used to export the results.
                Useful only if path is None and hydra is not used.

        Raises:
            ValueError: if ``path`` is a directory or is not a ``.json`` file.
        """
        if path and (Path(path).is_dir() or Path(path).suffix != ".json"):
            raise ValueError("Since v3.0.0, to import previous computation results, "
                             "Qumin expects a path to the metadata.json descriptor shipped with the results. "
                             "Please refer to the documentation for further details.")

        # A single flag decides between "import a previous run" and "new run",
        # so that an empty path is consistently treated as a new run.
        importing = bool(path)

        self.package = Package(path) if importing else Package()
        self.cfg = cfg

        if importing:
            prefix_path = self.package.basepath
        elif rundir_path:
            prefix_path = rundir_path
        else:
            prefix_path = hydra.core.hydra_config.HydraConfig.get().runtime.output_dir
        self.prefix = Path(prefix_path)

        if importing:
            # Restore the configuration saved with the previous run.
            self.cfg = DictConfig(self.package.custom['omega_conf'])
            return

        self.start = datetime.datetime.now()
        # The dataset can only be loaded when a configuration was provided.
        self.paralex = Package(cfg.data) if cfg else None
        self.package.name = self.start.strftime("qumin_results_%Hh%M_%Y%m%d")
        self.package.title = "Qumin Computation Results"
        self.package.homepage = "https://qumin.readthedocs.io/"
        self.package.description = "This package contains the output of a Qumin run. " \
                                   "It can be imported by other Qumin scripts."
        self.package.created = datetime.datetime.now().isoformat()
        self.package.custom['qumin_version'] = __version__
        if cfg:
            self.package.custom['omega_conf'] = OmegaConf.to_container(cfg)
            self.package.custom['paralex_dataset'] = self.paralex.to_dict()

    def get_table_path(self, table_name):
        """ Return the path to a dataset table

        Arguments:
            table_name (str): name of a resource of the dataset package.

        Returns:
            pathlib.Path: the resource path, relative to the dataset basepath
            (or to ``./`` if the dataset has no basepath).
        """
        dataset = self.paralex
        basepath = Path(dataset.basepath or "./")
        return basepath / dataset.get_resource(table_name).path

    def get_resource_path(self, resource):
        """ Return the full path to a resource

        Arguments:
            resource (str): A resource name

        Returns:
            pathlib.Path: a path to the resource.
        """
        return self.prefix / self.package.get_resource(resource).path

    def save_metadata(self):
        """ Save the metadata as a JSON file.

        The run duration (start, end, delta) is recorded in the custom
        properties, and the descriptor is written to ``metadata.json``
        under the prefix.
        """
        end = datetime.datetime.now()
        self.package.custom['duration'] = {"start": str(self.start),
                                           "end": str(end),
                                           "delta": str(end - self.start)
                                           }
        self.package._basepath = str(self.prefix)
        self.package.infer()
        self.package.to_json(self.prefix / 'metadata.json')

    def get_path(self, rel_path):
        """ Return an absolute path to a file and create parent directories.

        A ``rel_path`` ending with ``/`` is treated as a folder and is created
        itself; otherwise only its parent directories are created.

        Arguments:
            rel_path (str or pathlib.Path): relative path to the file or folder.

        Returns:
            pathlib.Path: absolute path to the file or folder.
        """
        path = Path(self.prefix) / rel_path
        # ``endswith`` on the string form also supports pathlib.Path objects
        # and empty strings, which cannot be indexed with ``[-1]``.
        if str(rel_path).endswith("/"):
            path.mkdir(parents=True, exist_ok=True)
        else:
            path.parent.mkdir(parents=True, exist_ok=True)
        return path

    def register_file(self, rel_path, name=None, custom=None, **kwargs):
        """ Add a file as a frictionless resource.

        Arguments:
            rel_path (str or pathlib.Path): the relative path to the file.
            name (str): name of the resource. By default, this will be
                the name of the file without the extension.
            custom (dict): Custom properties to save.
            **kwargs (dict): Optional keyword arguments passed to Resource, e.g. `description`.
        """
        if isinstance(name, str):
            kwargs['name'] = name
        res = Resource(path=str(rel_path), **kwargs)
        if custom is not None:
            res.custom = custom
        self.package.add_resource(res)

    def get_paradigm_conf(self, cfg):
        """ Load paradigm creation keywords from previous run.

        A few security checks are performed to ensure the user didn't pass
        contradictory arguments. If this is the case, a warning is thrown
        and old arguments are kept. Under some conditions, arguments
        can be overwritten (e.g. cells list).

        Arguments:
            cfg (OmegaConf.dictconfig.DictConfig): Arguments passed to the current run.
                These arguments might override arguments from the previous run,
                under specific conditions.

        Returns:
            dict: keyword arguments to build the paradigms.
        """

        # Default, restore previous run cfg:
        paradigm_conf = dict(
            defective=self.cfg.pats.defective,
            overabundant=self.cfg.pats.overabundant,
            cells=self.cfg.cells,
            pos=self.cfg.pos,
            sample_lexemes=self.cfg.sample_lexemes,
            sample_cells=self.cfg.sample_cells,
            sample_kws=dict(force_random=self.cfg.force_random,
                            seed=self.cfg.seed),
            resegment=self.cfg.resegment,
            # Args to overwrite
            lexemes_list=cfg.lexemes,
            force=cfg.force,
        )

        # Test mandatory lists that can be overwritten
        for arg in ['cells', 'pos']:
            old = self.cfg.get(arg)
            new = cfg.get(arg)
            if new is None:
                # Nothing passed to the current run: keep the restored value.
                continue

            keep = False
            if old is not None and set(new) - set(old):
                log.warning(diff(
                    f"Adding unseen {arg} is not allowed. "
                    "Using the setting from the previous run.",
                    old, new))
                keep = True

            # Test we are not sampling on a different list
            if arg == "cells" and self.cfg.sample_cells is not None and new != old:
                log.warning(diff(
                    "The previous run used cell sampling, "
                    "thus you can't change the list of cells. "
                    'Using the setting from the previous run.',
                    old, new))
                keep = True

            # If all tests passed, update the configuration
            if not keep:
                paradigm_conf[arg] = new

        # Test values that should be exactly the same
        for arg in ['pats.defective', 'pats.overabundant', 'resegment']:
            old = get_recursively(self.cfg, arg)
            new = get_recursively(cfg, arg)
            if new and new != old:
                log.warning(diff(
                    f"You passed new values for {arg} which are different in the previous run."
                    " Using the setting from the previous run.",
                    old, new))

        # Test lexemes are a subset of the previous sample
        old = self.cfg.get("lexemes")
        new = cfg.get("lexemes")
        if new and new != old:
            log.warning(diff(
                "You passed a new lexemes list. Make sure the new one is "
                "a subset of the previous one (if any) or Qumin may unexpectedly fail.",
                old, new))

        # Test sampling strategy didn't change
        defaults = {
            "seed": 1,
            "force_random": False,
            "sample_lexemes": None,
            "sample_cells": None}

        # Only relevant if the previous run actually sampled something.
        if self.cfg.sample_lexemes is not None or self.cfg.sample_cells is not None:
            # A value is acceptable if it is the default or the previous one.
            different = [k for k, v in defaults.items()
                         if cfg[k] not in [v, self.cfg[k]]]
            if different:
                log.warning(diff(
                    "You passed new values for sampling which are different in the previous run."
                    " Using the setting from the previous run.",
                    {k: self.cfg[k] for k in different},
                    {k: cfg[k] for k in different}))

        return paradigm_conf

    def get_pattern_conf(self):
        """ Load pattern creation keywords from previous run.

        No security checks: all relevant arguments have already
        been tested when loading the paradigms.

        Returns:
            dict: the ``defective`` and ``overabundant`` keyword arguments.
        """
        pattern_conf = dict(
            defective=self.cfg.pats.defective,
            overabundant=self.cfg.pats.overabundant.keep
        )
        return pattern_conf

    def get_paradigms(self, md, **kwargs):
        """ Creates paradigms with a stable config strategy.

        Arguments:
            md (qumin.utils.metadata.Metadata): Metadata handler of the current run.
            kwargs (dict): Additional keyword arguments are passed to
                `qumin.representations.paradigms.Paradigms`.

        Returns:
            qumin.representations.paradigms.Paradigms: paradigms built from
            the dataset of the current run.
        """
        paradigm_conf = self.get_paradigm_conf(md.cfg)
        paradigm_conf.update(kwargs)
        return Paradigms(md.paralex, **paradigm_conf)

    def get_patterns(self, paradigms, **kwargs):
        """ Creates patterns with a stable config strategy.

        Arguments:
            paradigms (qumin.representations.paradigms.Paradigms): Paradigms representation.
            kwargs (dict): Additional keyword arguments are passed `patterns.from_file()`.

        Returns:
            qumin.representations.patternstore.PatternStore: the loaded patterns.
        """
        pattern_conf = self.get_pattern_conf()
        pattern_conf.update(kwargs)
        patterns = PatternStore()
        patterns.from_file(self, paradigms, **pattern_conf)
        return patterns