Source code for qumin.utils.metadata
# -*- coding: utf-8 -*-
# !/usr/bin/python3
import datetime
import logging
from pathlib import Path
import hydra
from frictionless import Package, Resource
from omegaconf import OmegaConf
from omegaconf.dictconfig import DictConfig
from .. import __version__
from ..representations.paradigms import Paradigms
from ..representations.patternstore import PatternStore
# Module-wide logger used for the warnings emitted below.
# Called without a name, ``logging.getLogger()`` returns the root logger.
log = logging.getLogger()
[docs]
def diff(msg, old, new):
    """Append the old and the current value of a parameter to a message.

    Arguments:
        msg (str): the message to complete.
        old: value of the parameter in the previous run.
        new: value of the parameter in the current run.

    Returns:
        str: ``msg`` followed by two tab-indented lines showing both values.
    """
    details = "\n\t- Old parameter: {}\n\t- Current parameter: {}".format(old, new)
    return msg + details
[docs]
class Metadata():
    """Metadata manager for Qumin scripts. Wrapper around the Frictionless Package class.

    Basic usage :

    1. Register Metadata manager;
    2. Get an absolute path to the metadata folder;
    3. Write to that path;
    4. After writing a file, register it and set metadata (description, custom dict);
    5. Export the JSON descriptor.

    The Metadata class can easily be used in scripts that reuse Qumin results.
    In that case, one *must* pass a value to `rundir_path`, if hydra is not set
    (which is very likely if you write a simple script).

    Examples:

        .. code-block:: python

            import omegaconf
            cfg = omegaconf.dictconfig.DictConfig({"data": "myparalex/package.json"})
            md = Metadata(cfg=cfg, rundir_path="my_results")
            name = 'path/myfile.txt'
            filename = md.get_path(name)
            # Open an IO stream and write to ``filename``.
            md.register_file(name, description="My nice file", custom={"property": "value"})
            md.save_metadata()

    Attributes:
        package (frictionless.Package): the package describing the results.
        start (datetime) : timestamp at the beginning of the run.
            Only set for a new run (no ``path`` given).
        prefix (Path) : normalized prefix for the output files
        cfg (OmegaConf): all arguments passed to the python script. When a
            previous run is imported, this is the configuration stored in
            its descriptor.
        paralex (frictionless.Package): a frictionless Package representing a dataset.
            Only set for a new run (no ``path`` given).
    """

    def __init__(self, path=None, cfg=None, rundir_path=None):
        """
        Arguments:
            cfg (OmegaConf.dictconfig.DictConfig):
                arguments passed to the script. Required when ``path`` is not
                given; ignored (replaced by the stored configuration) otherwise.
            path (str): Path to a Frictionless descriptor of a previous run to be imported.
            rundir_path (str): Directory that should be used to export the results.
                Useful only if path is None and hydra is not used.

        Raises:
            ValueError: if ``path`` is a directory or is not a ``.json`` file,
                or if neither ``path`` nor ``cfg`` is provided.
        """
        if path and (Path(path).is_dir() or Path(path).suffix != ".json"):
            raise ValueError("Since v3.0.0, to import previous computation results, "
                             "Qumin expects a path to the metadata.json descriptor shipped with the results. "
                             "Please refer to the documentation for further details.")

        # Use a single flag for every "import vs. new run" decision, so that
        # an empty path is consistently handled as "no path".
        importing = bool(path)

        self.package = Package(path) if importing else Package()
        self.cfg = cfg

        if importing:
            prefix_path = self.package.basepath
        elif rundir_path:
            prefix_path = rundir_path
        else:
            prefix_path = hydra.core.hydra_config.HydraConfig.get().runtime.output_dir
        self.prefix = Path(prefix_path)

        if importing:
            # Restore the configuration saved by the previous run.
            self.cfg = DictConfig(
                self.package.custom['omega_conf'])
            return

        if cfg is None:
            # Fail with an explicit message rather than an AttributeError on `cfg.data`.
            raise ValueError("A configuration providing the dataset path (`cfg.data`) "
                             "is required when no previous run is imported.")

        self.start = datetime.datetime.now()
        self.paralex = Package(cfg.data)
        self.package.name = self.start.strftime("qumin_results_%Hh%M_%Y%m%d")
        self.package.title = "Qumin Computation Results"
        self.package.homepage = "https://qumin.readthedocs.io/"
        self.package.description = "This package contains the output of a Qumin run. " \
                                   "It can be imported by other Qumin scripts."
        self.package.created = datetime.datetime.now().isoformat()
        self.package.custom['qumin_version'] = __version__
        if cfg:
            self.package.custom['omega_conf'] = OmegaConf.to_container(cfg)
            self.package.custom['paralex_dataset'] = self.paralex.to_dict()

    def get_table_path(self, table_name):
        """ Return the path to a dataset table

        Arguments:
            table_name (str): name of a resource of the dataset (``self.paralex``).

        Returns:
            pathlib.Path: the resource path, relative to the dataset basepath
            (or to ``./`` if the dataset has no basepath).
        """
        dataset = self.paralex
        basepath = Path(dataset.basepath or "./")
        return basepath / dataset.get_resource(table_name).path

    def get_resource_path(self, resource):
        """ Return the full path to a resource

        Arguments:
            resource (str): A resource name

        Return:
            pathlib.Path: a path to the resource.
        """
        return self.prefix / self.package.get_resource(resource).path

    def save_metadata(self):
        """ Save the metadata as a JSON file.

        The descriptor is written to ``metadata.json`` under ``self.prefix``.
        The start time, end time and duration of the run are stored in the
        custom ``duration`` property. Relies on ``self.start``, which is only
        set for a new run.
        """
        end = datetime.datetime.now()
        self.package.custom['duration'] = {"start": str(self.start),
                                           "end": str(end),
                                           "delta": str(end - self.start)
                                           }
        # NOTE(review): presumably set so that relative resource paths are
        # resolved against the output directory by infer() -- confirm.
        self.package._basepath = str(self.prefix)
        self.package.infer()
        self.package.to_json(self.prefix / 'metadata.json')

    def get_path(self, rel_path):
        """ Return an absolute path to a file and create parent directories.

        A string ending with ``/`` denotes a folder, which is itself created.
        Anything else denotes a file: only its parent directory is created.
        ``pathlib.Path`` values carry no trailing slash and are therefore
        handled as files.

        Arguments:
            rel_path (str or pathlib.Path): relative path to the file or folder.

        Returns:
            pathlib.Path: absolute path to the file or folder.
        """
        path = Path(self.prefix) / rel_path
        # str() + endswith() accept Path objects and empty strings,
        # which indexing with [-1] does not.
        if str(rel_path).endswith("/"):
            path.mkdir(parents=True, exist_ok=True)
        else:
            path.parent.mkdir(parents=True, exist_ok=True)
        return path

    def register_file(self, rel_path, name=None, custom=None, **kwargs):
        """ Add a file as a frictionless resource.

        Arguments:
            rel_path (str or pathlib.Path): the relative path to the file.
            name (str): name of the resource. By default, this will be the name
                of the file without the extension.
            custom (dict): Custom properties to save.
            **kwargs (dict): Optional keyword arguments passed to Resource,
                e.g. `description`.
        """
        rel_path = str(rel_path)
        if isinstance(name, str):
            kwargs['name'] = name
        res = Resource(path=rel_path, **kwargs)
        if custom is not None:
            res.custom = custom
        self.package.add_resource(res)

    def get_paradigm_conf(self, cfg):
        """
        Load paradigm creation keywords from previous run.

        A few security checks are performed to ensure the user
        didn't pass contradictory arguments. If this is the case,
        a warning is thrown and old arguments are kept.
        Under some conditions, arguments can be overwritten (e.g. cells list).

        Arguments:
            cfg (OmegaConf.dictconfig.DictConfig): Arguments passed to the current run.
                These arguments might override arguments from the previous run, under
                specific conditions.

        Returns:
            dict: keyword arguments for the creation of the paradigms.
        """
        # Default, restore previous run cfg:
        paradigm_conf = dict(
            defective=self.cfg.pats.defective,
            overabundant=self.cfg.pats.overabundant,
            cells=self.cfg.cells,
            pos=self.cfg.pos,
            sample_lexemes=self.cfg.sample_lexemes,
            sample_cells=self.cfg.sample_cells,
            sample_kws=dict(force_random=self.cfg.force_random,
                            seed=self.cfg.seed),
            resegment=self.cfg.resegment,
            # Args to overwrite
            lexemes_list=cfg.lexemes,
            force=cfg.force,
        )

        # Test mandatory lists that can be overwritten
        for arg in ['cells', 'pos']:
            old = self.cfg.get(arg)
            new = cfg.get(arg)
            if new is None:
                # Nothing was passed: keep the setting from the previous run.
                continue

            keep = False
            if old is not None and set(new) - set(old):
                log.warning(diff(
                    f"Adding unseen {arg} is not allowed. "
                    "Using the setting from the previous run.",
                    old, new))
                keep = True

            # Test we are not sampling on a different list
            if arg == "cells" and self.cfg.sample_cells is not None:
                if new != old:
                    log.warning(diff(
                        "The previous run used cell sampling, "
                        "thus you can't change the list of cells. "
                        'Using the setting from the previous run.',
                        old, new))
                    keep = True

            # If all tests passed, update the configuration
            if not keep:
                paradigm_conf[arg] = new

        # Test values that should be exactly the same
        # NOTE(review): `get_recursively` is neither defined nor imported in
        # the visible part of this module -- confirm where it comes from.
        for arg in ['pats.defective', 'pats.overabundant',
                    'resegment']:
            old = get_recursively(self.cfg, arg)
            new = get_recursively(cfg, arg)
            if new and new != old:
                log.warning(diff(
                    f"You passed new values for {arg} which are different in the previous run."
                    " Using the setting from the previous run.",
                    old, new))

        # Test lexemes are a subset of the previous sample
        old = self.cfg.get("lexemes")
        new = cfg.get("lexemes")
        if new and new != old:
            log.warning(diff(
                "You passed a new lexemes list. Make sure the new one is "
                "a subset of the previous one (if any) or Qumin may unexpectedly fail.",
                old, new))

        # Test sampling strategy didn't change
        defaults = {
            "seed": 1,
            "force_random": False,
            "sample_lexemes": None,
            "sample_cells": None}
        different = []
        if (self.cfg.sample_lexemes is not None or
                self.cfg.sample_cells is not None):
            # A value is accepted if it is either the default listed above
            # or the value used by the previous run.
            for k, v in defaults.items():
                if cfg[k] not in [v, self.cfg[k]]:
                    different.append(k)
        if different:
            log.warning(diff(
                "You passed new values for sampling which are different in the previous run."
                " Using the setting from the previous run.",
                {k: self.cfg[k] for k in different},
                {k: cfg[k] for k in different}))
        return paradigm_conf

    def get_pattern_conf(self):
        """ Load pattern creation keywords from previous run.

        No security checks: all relevant arguments have already
        been tested when loading the paradigms.

        Returns:
            dict: the ``defective`` and ``overabundant`` settings read from
            the configuration of the previous run.
        """
        pattern_conf = dict(
            defective=self.cfg.pats.defective,
            overabundant=self.cfg.pats.overabundant.keep
        )
        return pattern_conf

    def get_paradigms(self, md, **kwargs):
        """
        Creates paradigms with a stable config strategy.

        Arguments:
            md (qumin.utils.metadata.Metadata): Metadata handler of the current run.
            kwargs (dict): Additional keyword arguments are passed to
                `qumin.representations.paradigms.Paradigms`.

        Returns:
            qumin.representations.paradigms.Paradigms: paradigms built from
            the dataset of the current run (``md.paralex``).
        """
        paradigm_conf = self.get_paradigm_conf(md.cfg)
        paradigm_conf.update(kwargs)
        return Paradigms(md.paralex,
                         **paradigm_conf,
                         )

    def get_patterns(self, paradigms, **kwargs):
        """
        Creates patterns with a stable config strategy.

        Arguments:
            paradigms (qumin.representations.paradigms.Paradigms): Paradigms representation.
            kwargs (dict): Additional keyword arguments are passed `patterns.from_file()`.

        Returns:
            qumin.representations.patternstore.PatternStore: the pattern store,
            after ``from_file()`` was called on it.
        """
        pattern_conf = self.get_pattern_conf()
        pattern_conf.update(kwargs)
        patterns = PatternStore()
        patterns.from_file(self,
                           paradigms,
                           **pattern_conf,
                           )
        return patterns