Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Trainable relation extractor #364

Draft
wants to merge 4 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions changelog.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
# Changelog

## Unreleased

### Added

- New trainable `eds.relation_detector_ffn` component to detect relations between entities. These relations are stored in each entity: `head._.rel[relation_label] = [tail1, tail2, ...]`.

# v0.15.0 (2024-12-13)

### Added
Expand Down
6 changes: 3 additions & 3 deletions edsnlp/core/stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -796,7 +796,6 @@ def shuffle(
# same program twice, the shuffling should be the same in both cases.
# This is not garanteed by just creating random.Random() which does not
# account
seed = seed if seed is not None else random.getrandbits(32)
if shuffle_reader:
if shuffle_reader not in self.reader.emitted_sentinels:
raise ValueError(f"Cannot shuffle by {shuffle_reader}")
Expand All @@ -807,13 +806,14 @@ def shuffle(
config=stream.config,
)
stream.reader.shuffle = shuffle_reader
stream.reader.rng = random.Random(seed)
if seed is not None:
stream.reader.rng = random.Random(seed)
if any(not op.elementwise for op in self.ops) or not shuffle_reader:
stream = stream.map_batches(
pipe=shuffle,
batch_size=batch_size,
batch_by=batch_by,
kwargs={"rng": random.Random(seed)},
kwargs={"rng": random.Random(seed)} if seed is not None else {},
)
stream.validate_ops(ops=stream.ops, update=False)
return stream
Expand Down
11 changes: 11 additions & 0 deletions edsnlp/data/converters.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,7 @@ def __call__(self, obj, tokenizer=None):
doc = tok(obj["text"] or "")
doc._.note_id = obj.get("doc_id", obj.get(FILENAME))

entities = {}
spans = []

for dst in (
Expand Down Expand Up @@ -303,6 +304,7 @@ def __call__(self, obj, tokenizer=None):
)
span._.set(new_name, value)

entities.setdefault(ent["entity_id"], []).append(span)
spans.append(span)

set_spans(doc, spans, span_setter=self.span_setter)
Expand All @@ -311,6 +313,15 @@ def __call__(self, obj, tokenizer=None):
if span._.get(attr) is None:
span._.set(attr, value)

for relation in obj.get("relations", []):
relation_label = relation["relation_label"]
from_entity_id = relation["from_entity_id"]
to_entity_id = relation["to_entity_id"]

for head in entities[from_entity_id]:
for tail in entities[to_entity_id]:
head._.rel.setdefault(relation_label, set()).add(tail)

return doc


Expand Down
22 changes: 11 additions & 11 deletions edsnlp/data/standoff.py
Original file line number Diff line number Diff line change
Expand Up @@ -261,17 +261,17 @@ def dump_standoff_file(
attribute_idx += 1

# fmt: off
# if "relations" in doc:
# for i, relation in enumerate(doc["relations"]):
# entity_from = entities_ids[relation["from_entity_id"]]
# entity_to = entities_ids[relation["to_entity_id"]]
# print(
# "R{}\t{} Arg1:{} Arg2:{}\t".format(
# i + 1, str(relation["label"]), entity_from,
# entity_to
# ),
# file=f,
# )
if "relations" in doc:
for i, relation in enumerate(doc["relations"]):
entity_from = entities_ids[relation["from_entity_id"]]
entity_to = entities_ids[relation["to_entity_id"]]
print(
"R{}\t{} Arg1:{} Arg2:{}\t".format(
i + 1, str(relation["label"]), entity_from,
entity_to
),
file=f,
)
# fmt: on


Expand Down
5 changes: 4 additions & 1 deletion edsnlp/extensions.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
from datetime import date, datetime

from dateutil.parser import parse as parse_date
from spacy.tokens import Doc
from spacy.tokens import Doc, Span

if not Doc.has_extension("note_id"):
Doc.set_extension("note_id", default=None)
Expand Down Expand Up @@ -43,3 +43,6 @@ def get_note_datetime(doc):

# Patient birth date/time — presumably set by data converters; defaults to None.
if not Doc.has_extension("birth_datetime"):
    Doc.set_extension("birth_datetime", default=None)

# Relations between entities: `head._.rel` maps a relation label to the set of
# tail spans for that head.
# NOTE(review): spaCy stores this mutable `{}` default as a single shared
# object — any span that mutates `span._.rel` in place (e.g. via
# `setdefault(label, set()).add(tail)`) without first assigning a fresh dict
# would mutate the dict seen by every other span. Verify that all writers
# assign a new dict per span, or switch to a per-span lazy getter.
if not Span.has_extension("rel"):
    Span.set_extension("rel", default={})
122 changes: 122 additions & 0 deletions edsnlp/metrics/relations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
from collections import defaultdict
from itertools import product
from typing import Any, Optional

from edsnlp import registry
from edsnlp.metrics import Examples, make_examples, prf
from edsnlp.utils.span_getters import RelationCandidateGetter, get_spans
from edsnlp.utils.typing import AsList


def relations_scorer(
    examples: Examples,
    candidate_getter: AsList[RelationCandidateGetter],
    micro_key: str = "micro",
    filter_expr: Optional[str] = None,
):
    """
    Scores the relation predictions between a list of gold and predicted spans.

    Parameters
    ----------
    examples : Examples
        The examples to score, either a tuple of (golds, preds) or a list of
        spacy.training.Example objects
    candidate_getter : AsList[RelationCandidateGetter]
        The candidate getters to use to extract the possible relations from the
        documents. Each candidate getter should be a dictionary with the keys
        "head", "tail", and "labels". The "head" and "tail" keys should be
        SpanGetterArg objects, and the "labels" key should be a list of strings
        for these head-tail pairs.
    micro_key : str
        The key to use to store the micro-averaged results for spans of all types
    filter_expr : Optional[str]
        The filter expression to use to filter the documents

    Returns
    -------
    Dict[str, float]
    """
    examples = make_examples(examples)
    if filter_expr is not None:
        # NOTE: `filter_expr` is evaluated as Python — only pass trusted
        # expressions (this mirrors the other edsnlp metrics).
        filter_fn = eval(f"lambda doc: {filter_expr}")
        examples = [eg for eg in examples if filter_fn(eg.reference)]

    # annotations: {label -> (pred pairs, gold pairs, pred pair probabilities)}
    # The third slot is reserved for probability-aware metrics (e.g. AP) and is
    # currently unused.
    annotations = defaultdict(lambda: (set(), set(), dict()))
    annotations[micro_key] = (set(), set(), dict())

    def _collect(doc, eg_idx, head_getter, tail_getter, labels, slot):
        # Enumerate every head x tail candidate pair of `doc`, record the
        # (doc, head, tail, label) tuples that actually hold a relation into
        # `annotations[...][slot]` (0 = predictions, 1 = gold), and return the
        # number of candidate pairs seen.
        heads = [
            ((h.start, h.end, h.label_), h) for h in get_spans(doc, head_getter)
        ]
        tails = [
            ((t.start, t.end, t.label_), t) for t in get_spans(doc, tail_getter)
        ]
        n_pairs = 0
        for (h_key, head), (t_key, tail) in product(heads, tails):
            n_pairs += 1
            for label in labels:
                if tail in head._.rel.get(label, ()):
                    annotations[label][slot].add((eg_idx, h_key, t_key, label))
                    annotations[micro_key][slot].add((eg_idx, h_key, t_key, label))
        return n_pairs

    total_pred_count = 0
    total_gold_count = 0
    for candidate in candidate_getter:
        head_getter = candidate["head"]
        tail_getter = candidate["tail"]
        labels = candidate["labels"]
        for eg_idx, eg in enumerate(examples):
            total_pred_count += _collect(
                eg.predicted, eg_idx, head_getter, tail_getter, labels, 0
            )
            total_gold_count += _collect(
                eg.reference, eg_idx, head_getter, tail_getter, labels, 1
            )

    # A mismatch means the predicted docs do not carry the same candidate
    # entities as the gold docs, so precision/recall would be meaningless.
    if total_pred_count != total_gold_count:
        raise ValueError(
            f"Number of predicted and gold candidate pairs differ: "
            f"{total_pred_count} != {total_gold_count}. Make sure that you are "
            "running your relation detection pipe on the gold entities, and "
            "not on spans predicted by another NER pipe in your model."
        )

    return {
        name: {
            **prf(pred, gold),
            # "ap": average_precision(pred_with_prob, gold),
        }
        for name, (pred, gold, pred_with_prob) in annotations.items()
    }


@registry.metrics.register("eds.relations")
class RelationsMetric:
    """Callable metric object wrapping `relations_scorer` with fixed settings."""

    def __init__(
        self,
        candidate_getter: AsList[RelationCandidateGetter],
        micro_key: str = "micro",
        filter_expr: Optional[str] = None,
    ):
        # Remember the scorer arguments; they are forwarded on every call.
        self.candidate_getter = candidate_getter
        self.micro_key = micro_key
        self.filter_expr = filter_expr

    # Reuse the functional scorer's docstring so both entry points stay in sync.
    __init__.__doc__ = relations_scorer.__doc__

    def __call__(self, *examples: Any):
        # Delegate to the functional scorer with the stored configuration.
        scorer_kwargs = {
            "candidate_getter": self.candidate_getter,
            "micro_key": self.micro_key,
            "filter_expr": self.filter_expr,
        }
        return relations_scorer(examples, **scorer_kwargs)
1 change: 1 addition & 0 deletions edsnlp/pipes/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@
from .qualifiers.reported_speech.factory import create_component as reported_speech
from .qualifiers.reported_speech.factory import create_component as rspeech
from .trainable.ner_crf.factory import create_component as ner_crf
from .trainable.relation_detector_ffn.factory import create_component as relation_detector_ffn
from .trainable.biaffine_dep_parser.factory import create_component as biaffine_dep_parser
from .trainable.extractive_qa.factory import create_component as extractive_qa
from .trainable.span_classifier.factory import create_component as span_classifier
Expand Down
33 changes: 33 additions & 0 deletions edsnlp/pipes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from edsnlp.core import PipelineProtocol
from edsnlp.core.registries import CurriedFactory
from edsnlp.utils.span_getters import (
RelationCandidateGetter,
SpanGetter, # noqa: F401
SpanGetterArg, # noqa: F401
SpanSetter,
Expand All @@ -23,6 +24,7 @@
validate_span_getter, # noqa: F401
validate_span_setter,
)
from edsnlp.utils.typing import AsList


def value_getter(span: Span):
Expand Down Expand Up @@ -203,3 +205,34 @@ def qualifiers(self): # pragma: no cover
@qualifiers.setter
def qualifiers(self, value): # pragma: no cover
self.attributes = value


class BaseRelationDetectorComponent(BaseComponent, abc.ABC):
    """
    Abstract base class for trainable relation-detection components.

    Subclasses receive a list of "candidate getters", each describing which
    head spans and tail spans to pair up and which relation labels apply to
    those pairs.
    """

    # NOTE(review): these two attributes are declared but never assigned in
    # __init__ (the getters live inside `self.candidate_getter` entries) —
    # confirm whether subclasses set them or whether they can be removed.
    head_getter: SpanGetter
    tail_getter: SpanGetter
    # Sorted union of all relation labels across candidate getters.
    labels: List[str]

    def __init__(
        self,
        nlp: PipelineProtocol = None,
        name: str = None,
        *args,
        candidate_getter: AsList[RelationCandidateGetter],
        **kwargs,
    ):
        super().__init__(nlp, name, *args, **kwargs)
        # Normalize each candidate entry: validate the head/tail span getters
        # and keep the labels untouched.
        self.candidate_getter = [
            {
                "head": validate_span_getter(candidate["head"]),
                "tail": validate_span_getter(candidate["tail"]),
                "labels": candidate["labels"],
            }
            for candidate in candidate_getter
        ]
        # Deduplicate labels across all candidate getters and sort them so the
        # label ordering is deterministic (e.g. for output layer indexing).
        self.labels = sorted(
            {
                label
                for candidate in self.candidate_getter
                for label in candidate["labels"]
            }
        )
17 changes: 12 additions & 5 deletions edsnlp/pipes/trainable/embeddings/span_pooler/span_pooler.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,15 +123,18 @@ def preprocess(
begins = []
ends = []

contexts_to_idx = {span: i for i, span in enumerate(contexts)}
contexts_to_idx = {}
for ctx in contexts:
contexts_to_idx[ctx] = len(contexts_to_idx)
dedup_contexts = sorted(contexts_to_idx, key=contexts_to_idx.get)
assert not pre_aligned or len(spans) == len(contexts), (
"When `pre_aligned` is True, the number of spans and contexts must be the "
"same."
)
aligned_contexts = (
[[c] for c in contexts]
[[c] for c in dedup_contexts]
if pre_aligned
else align_spans(contexts, spans, sort_by_overlap=True)
else align_spans(dedup_contexts, spans, sort_by_overlap=True)
)
for i, (span, ctx) in enumerate(zip(spans, aligned_contexts)):
if len(ctx) == 0 or ctx[0].start > span.start or ctx[0].end < span.end:
Expand All @@ -143,12 +146,16 @@ def preprocess(
sequence_idx.append(contexts_to_idx[ctx[0]])
begins.append(span.start - start)
ends.append(span.end - start)
assert begins[-1] >= 0, f"Begin offset is negative: {span.text}"
assert ends[-1] <= len(ctx[0]), f"End offset is out of bounds: {span.text}"
return {
"begins": begins,
"ends": ends,
"sequence_idx": sequence_idx,
"num_sequences": len(contexts),
"embedding": self.embedding.preprocess(doc, contexts=contexts, **kwargs),
"num_sequences": len(dedup_contexts),
"embedding": self.embedding.preprocess(
doc, contexts=dedup_contexts, **kwargs
),
"stats": {"spans": len(begins)},
}

Expand Down
1 change: 1 addition & 0 deletions edsnlp/pipes/trainable/relation_detector_ffn/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .factory import create_component
9 changes: 9 additions & 0 deletions edsnlp/pipes/trainable/relation_detector_ffn/factory.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
from edsnlp import registry

from .relation_detector_ffn import RelationDetectorFFN

# Register the trainable relation detector under the "eds.relation_detector_ffn"
# factory name so it can be added via `nlp.add_pipe(...)` or loaded from a
# config file.
# NOTE(review): `assigns` is empty although the component presumably writes to
# `span._.rel` — confirm whether that should be declared here.
create_component = registry.factory.register(
    "eds.relation_detector_ffn",
    assigns=[],
    deprecated=[],
)(RelationDetectorFFN)
Loading