Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Training loggers #379

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ repos:
# ruff
- repo: https://github.com/charliermarsh/ruff-pre-commit
# Ruff version.
rev: 'v0.6.4'
rev: 'v0.9.6'
hooks:
- id: ruff
args: ['--config', 'pyproject.toml', '--fix', '--show-fixes']
Expand Down
4 changes: 4 additions & 0 deletions changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,15 @@
- Added a new unit test suite to validate the tuning script.
- `docs/tutorials/tuning.md`: New tutorial for hyperparameter tuning.
- Provided a [detailed tutorial](./docs/tutorials/tuning.md) on hyperparameter tuning, covering usage scenarios and configuration options.
- Added grad spike detection to the `edsnlp.train` script, and per weight layer gradient logging.
- Added support for multiple loggers (`tensorboard`, `wandb`, `comet_ml`, `aim`, `mlflow`, `clearml`, `dvclive`, `csv`, `json`, `rich`) in `edsnlp.train` via the `logger` parameter. Default is [`json` and `rich`] for backward compatibility.
- Added clickable snippets in the documentation for more registered functions

### Fixed

- Support packaging with poetry 2.0
- Solve pickling issues with multiprocessing when pytorch is installed
- Fixed mini-batch accumulation for multi-task training

# v0.15.0 (2024-12-13)

Expand Down
72 changes: 55 additions & 17 deletions docs/scripts/clickable_snippets.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# Based on https://github.com/darwindarak/mdx_bib
import os
import re
from bisect import bisect_right
from collections import defaultdict
from typing import Tuple

import jedi
Expand All @@ -22,11 +22,7 @@

from bs4 import BeautifulSoup

BRACKET_RE = re.compile(r"\[([^\[]+)\]")
CITE_RE = re.compile(r"@([\w_:-]+)")
DEF_RE = re.compile(r"\A {0,3}\[@([\w_:-]+)\]:\s*(.*)")
INDENT_RE = re.compile(r"\A\t| {4}(.*)")

# Used to match href in HTML to replace with a relative path
HREF_REGEX = (
r"(?<=<\s*(?:a[^>]*href|img[^>]*src)=)"
r'(?:"([^"]*)"|\'([^\']*)|[ ]*([^ =>]*)(?![a-z]+=))'
Expand All @@ -42,6 +38,15 @@
(?![a-zA-Z0-9._-])
"""

REGISTRY_REGEX = r"""(?x)
(?<![a-zA-Z0-9._-])
<span[^>]*>(?:"|&\#39;|&quot;)@([a-zA-Z0-9._-]*)(?:"|&\#39;|&quot;)<\/span>\s*
<span[^>]*>:<\/span>\s*
<span[^>]*>\s*<\/span>\s*
<span[^>]*>(?:"|&\#39;|&quot;)?([a-zA-Z0-9._-]*)(?:"|&\#39;|&quot;)?<\/span>
(?![a-zA-Z0-9._-])
"""

CITATION_RE = r"(\[@(?:[\w_:-]+)(?: *, *@(?:[\w_:-]+))*\])"


Expand All @@ -62,11 +67,15 @@ def on_config(self, config: MkDocsConfig):
plugin.load_config(plugin_config)

@classmethod
def get_ep_namespace(cls, ep, namespace=None):
    """Return entry points, for a single group or for every group.

    Parameters
    ----------
    ep:
        The object returned by ``importlib.metadata.entry_points()``:
        either a selectable ``EntryPoints`` collection (newer API, has
        a ``select`` method) or a dict mapping group names to lists of
        entry points (legacy API).
    namespace:
        The entry-point group to select. When ``None``, entry points of
        every group are returned.
    """
    if hasattr(ep, "select"):
        # Modern importlib.metadata API.
        # NOTE(review): ``_all`` is a private attribute of EntryPoints —
        # confirm it is stable across the supported Python versions.
        return ep.select(group=namespace) if namespace else list(ep._all)
    else:  # legacy dict-of-groups API
        return (
            ep.get(namespace, [])
            if namespace
            # Flatten all groups into a single iterable of entry points
            else (x for g in ep.values() for x in g)
        )

@mkdocs.plugins.event_priority(-1000)
def on_post_page(
Expand Down Expand Up @@ -94,18 +103,26 @@ def on_post_page(
autorefs: AutorefsPlugin = config["plugins"]["autorefs"]
ep = entry_points()
page_url = os.path.join("/", page.file.url)
spacy_factories_entry_points = {
factories_entry_points = {
ep.name: ep.value
for ep in (
*self.get_ep_namespace(ep, "spacy_factories"),
*self.get_ep_namespace(ep, "edsnlp_factories"),
)
}

def replace_component(match):
full_group = match.group(0)
all_entry_points = defaultdict(dict)
for ep in self.get_ep_namespace(ep):
if ep.group.startswith("edsnlp_") or ep.group.startswith("spacy_"):
group = ep.group.split("_", 1)[1]
all_entry_points[group][ep.name] = ep.value

# This method is meant for replacing any component that
# appears in a "eds.component" format, no matter if it is
# preceded by a "@factory" or not.
def replace_factory_component(match):
full_match = match.group(0)
name = "eds." + match.group(1)
ep = spacy_factories_entry_points.get(name)
ep = factories_entry_points.get(name)
preceding = output[match.start(0) - 50 : match.start(0)]
if ep is not None and "DEFAULT:" not in preceding:
try:
Expand All @@ -114,16 +131,37 @@ def replace_component(match):
pass
else:
return f"<a href={url}>{name}</a>"
return full_group
return full_match

# This method is meant for replacing any component that
# appears in a "@registry": "component" format
def replace_any_registry_component(match):
    """Turn a highlighted ``"@group": "name"`` pair into a doc link.

    Looks up the entry point registered under ``group``/``name`` and,
    when a documentation URL exists for it, wraps the component name in
    an anchor while keeping the surrounding highlighting markup intact.
    Returns the match unchanged when no link can be produced.
    """
    full_match = match.group(0)
    group = match.group(1)
    name = match.group(2)
    ep = all_entry_points[group].get(name)
    # Skip replacement when the match is preceded by a "DEFAULT:"
    # marker (e.g. default values rendered in the docs).
    preceding = output[match.start(0) - 50 : match.start(0)]
    if ep is not None and "DEFAULT:" not in preceding:
        try:
            url = autorefs.get_item_url(ep.replace(":", "."))
        except KeyError:
            # No documented target for this entry point: leave as-is.
            pass
        else:
            repl = f'<a href={url} class="discrete-link">{name}</a>'
            # Splice the link in place of the component name only,
            # preserving the rest of the matched HTML (group offsets
            # are absolute, so rebase them onto the match start).
            before = full_match[: match.start(2) - match.start(0)]
            after = full_match[match.end(2) - match.start(0) :]
            return before + repl + after
    return full_match

def replace_link(match):
    """Rewrite absolute hrefs (starting with "/") relative to the page.

    The match comes from HREF_REGEX, whose three alternatives capture a
    double-quoted, single-quoted, or bare URL; exactly one group is set.
    """
    relative_url = url = match.group(1) or match.group(2) or match.group(3)
    if url.startswith("/"):
        # page_url is the enclosing scope's path of the current page.
        relative_url = os.path.relpath(url, page_url)
    return f'"{relative_url}"'

output = regex.sub(PIPE_REGEX, replace_component, output)
output = regex.sub(HTML_PIPE_REGEX, replace_component, output)
output = regex.sub(PIPE_REGEX, replace_factory_component, output)
output = regex.sub(HTML_PIPE_REGEX, replace_factory_component, output)
output = regex.sub(REGISTRY_REGEX, replace_any_registry_component, output)

all_snippets = ""
all_offsets = []
Expand Down
Empty file added docs/training/index.md
Empty file.
154 changes: 154 additions & 0 deletions docs/training/loggers.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
# Loggers

When training a model, it is important to keep track of the training process, model performance at different stages, and statistics about the training data over time. This is where loggers come in. Loggers are used to store such information to be able to analyze and visualize it later.

The EDS-NLP training API (`edsnlp.train`) relies on the `accelerate` integrations of popular loggers, as well as a few custom loggers.
You can configure loggers in `edsnlp.train` via the `logger` parameter of the `train` function by specifying:

- a string or a class instance or partially initialized class instance of a logger, e.g.

=== "Via the Python API"
```{ .python .no-check }
from edsnlp.training.loggers import CSVLogger
from edsnlp.training import train

logger = CSVLogger.draft()
train(..., logger=logger)
# or train(..., logger="csv")
```

=== "Via a config file"
```yaml
train:
...
logger:
"@loggers": csv
...
```


- or a list of string / logger instances, e.g.

=== "Via the Python API"
```{ .python .no-check }
from edsnlp.training.loggers import CSVLogger
from edsnlp.training import train

loggers = ["tensorboard", CSVLogger.draft(...)]
train(..., logger=loggers)
```

=== "Via a config file"
```yaml
train:
...
logger:
- tensorboard # as a string
      - "@loggers": csv # as a (partially) instantiated logger
...
```

!!! note "Draft objects"

`edsnlp.train` will provide a default project name and logging dir for loggers that require these parameters, but it is
recommended to set the project name explicitly in the logger configuration. For these loggers, if you don't want to set
the project name yourself, you can either:

    - call `CSVLogger.draft(...)` with the normal init parameters minus the `project_name` or `logging_dir` parameters,
which will cause a `Draft[CSVLogger]` object to be returned if some required parameters are missing
- or use `"@loggers": csv` in the config file, which will also cause a `Draft[CSVLogger]` object to be returned if some required
parameters are missing

If you do not want a `Draft` object to be returned, call `CSVLogger` directly.

The supported loggers are listed below.

### RichLogger {: #edsnlp.training.loggers.RichLogger }

::: edsnlp.training.loggers.RichLogger.__init__
options:
sections: ["text", "parameters"]
heading_level: 4
show_bases: false
show_source: false
only_class_level: true

### CSVLogger {: #edsnlp.training.loggers.CSVLogger }

::: edsnlp.training.loggers.CSVLogger.__init__
options:
sections: ["text", "parameters"]
heading_level: 4
show_bases: false
show_source: false
only_class_level: true

### JSONLogger {: #edsnlp.training.loggers.JSONLogger }

::: edsnlp.training.loggers.JSONLogger.__init__
options:
sections: ["text", "parameters"]
heading_level: 4
show_bases: false
show_source: false
only_class_level: true

### TensorBoardLogger {: #edsnlp.training.loggers.TensorBoardLogger }

::: edsnlp.training.loggers.TensorBoardLogger
options:
sections: ["text", "parameters"]
heading_level: 4
show_bases: false
show_source: false
only_class_level: true

### AimLogger {: #edsnlp.training.loggers.AimLogger }

::: edsnlp.training.loggers.AimLogger
options:
sections: ["text", "parameters"]
heading_level: 4
show_bases: false
show_source: false
only_class_level: true

### WandBLogger {: #edsnlp.training.loggers.WandBLogger }

::: edsnlp.training.loggers.WandBLogger
options:
sections: ["text", "parameters"]
heading_level: 4
show_bases: false
show_source: false
only_class_level: true

### MLflowLogger {: #edsnlp.training.loggers.MLflowLogger }

::: edsnlp.training.loggers.MLflowLogger
options:
sections: ["text", "parameters"]
heading_level: 4
show_bases: false
show_source: false
only_class_level: true

### CometMLLogger {: #edsnlp.training.loggers.CometMLLogger }

::: edsnlp.training.loggers.CometMLLogger
options:
sections: ["text", "parameters"]
heading_level: 4
show_bases: false
show_source: false
only_class_level: true

### DVCLiveLogger {: #edsnlp.training.loggers.DVCLiveLogger }

::: edsnlp.training.loggers.DVCLiveLogger
options:
sections: ["text", "parameters"]
heading_level: 4
show_bases: false
show_source: false
only_class_level: true
Loading
Loading