NNPDF · achiefa · Jun 5, 2026
diff --git a/n3fit/src/n3fit/backends/keras_backend/MetaLayer.py b/n3fit/src/n3fit/backends/keras_backend/MetaLayer.py
@@ -1,21 +1,26 @@
 """
-    The class MetaLayer is an extension of the backend Layer class
-    with a number of methods and helpers to facilitate writing new custom layers
-    in such a way that the new custom layer don't need to rely in anything backend-dependent
+The class MetaLayer is an extension of the backend Layer class
+with a number of methods and helpers to facilitate writing new custom layers
+in such a way that new custom layers don't need to rely on anything backend-dependent
 
-    In other words, if you want to implement a new layer and need functions not included here
-    it is better to add a new method which is just a call to the relevant backend-dependent function
-    For instance: np_to_tensor is just a call to K.constant
+In other words, if you want to implement a new layer and need functions not included here
+it is better to add a new method which is just a call to the relevant backend-dependent function
+For instance: np_to_tensor is just a call to K.constant
 """
 
-from keras.initializers import Constant, RandomUniform, glorot_normal, glorot_uniform
+from keras.initializers import Constant, RandomUniform, VarianceScaling, glorot_uniform
 from keras.layers import Layer
 
 # Define in this dictionary new initializers as well as the arguments they accept (with default values if needed be)
 initializers = {
     "random_uniform": (RandomUniform, {"minval": -0.5, "maxval": 0.5}),
     "glorot_uniform": (glorot_uniform, {}),
-    "glorot_normal": (glorot_normal, {}),
+    # glorot_normal expressed via VarianceScaling so its width is tunable through `scale`:
+    # scale=1.0 reproduces keras' glorot_normal exactly; weight std scales as sqrt(scale).
+    "glorot_normal": (
+        VarianceScaling,
+        {"scale": 1.0, "mode": "fan_avg", "distribution": "truncated_normal"},
+    ),
 }
 
 
@@ -91,10 +96,11 @@ def select_initializer(ini_name, seed=None, **kwargs):
             ) from e
 
         ini_class = ini_tuple[0]
-        ini_args = ini_tuple[1]
+        # Copy so per-call overrides (seed, scale, ...) don't leak into the shared defaults
+        ini_args = dict(ini_tuple[1])
         ini_args["seed"] = seed
 
         for key, value in kwargs.items():
-            if key in ini_args.keys():
+            if key in ini_args:
                 ini_args[key] = value
         return ini_class(**ini_args)
diff --git a/n3fit/src/n3fit/backends/keras_backend/callbacks.py b/n3fit/src/n3fit/backends/keras_backend/callbacks.py
@@ -13,6 +13,7 @@
 """
 
 import logging
+from pathlib import Path
 from time import time
 
 from keras import backend as K
@@ -196,6 +197,64 @@ def on_step_end(self, epoch, logs=None):
             self._update_weights()
 
 
+class StoreCallback(CallbackStep):
+    """
+    Store the trainable parameters of every replica every ``check_freq`` epochs and,
+    when training ends, the best parameters recorded by the stopping object.
+
+    Parameters
+    ----------
+        pdf_model: MetaModel
+            The multi-replica PDF model
+        replica_paths: list[Path]
+            One path (Path or str) per replica. Files are saved under <path>/parameters/.
+        stopping_object: object
+            Stopping object of the fit; provides ``_best_weights`` and ``_best_epochs``
+        check_freq: int
+            Save every this many epochs (default: 100)
+    """
+
+    def __init__(self, pdf_model, replica_paths, stopping_object, check_freq=100):
+        super().__init__()
+        self.check_freq = check_freq
+        self.pdf_model = pdf_model
+        self.stopping_object = stopping_object
+        self.weight_dirs = [Path(path) / "parameters" for path in replica_paths]
+        for weight_dir in self.weight_dirs:
+            weight_dir.mkdir(parents=True, exist_ok=True)
+
+    def _save_weights(self, epoch, tr_weights, weight_dir):
+        """Save the flattened ``tr_weights`` in ``weight_dir`` as ``params_<epoch>.npz``"""
+        filepath = weight_dir / f"params_{epoch}.npz"
+        # save parameters as expected by colibri
+        trainable_weights_flat = np.concatenate([np.asarray(w).flatten() for w in tr_weights])
+        np.savez(filepath, params=trainable_weights_flat)
+        log.info(f"Saved parameters at epoch {epoch} in {filepath}")
+
+    def on_step_end(self, epoch, logs=None):
+        """Function to be called at the end of every epoch
+        Every ``check_freq`` number of epochs, the trainable parameters of each
+        replica are stored in its ``parameters`` directory, labelled ``epoch + 1``.
+        """
+        if ((epoch + 1) % self.check_freq) == 0:
+            pdf_replicas = self.pdf_model.split_replicas()
+            for replica_model, weight_dir in zip(pdf_replicas, self.weight_dirs):
+                weights = replica_model.trainable_weights
+                self._save_weights(epoch + 1, weights, weight_dir)
+
+    def on_train_end(self, logs=None):
+        """Store the best parameters of each replica as ``params_<best_epoch>.npz``"""
+        for idx, weight_dir in enumerate(self.weight_dirs):
+            weights = self.stopping_object._best_weights[idx]
+            if weights is not None:
+                best_epoch = self.stopping_object._best_epochs[idx]
+                self._save_weights(best_epoch, weights['all_NNs'], weight_dir)
+            else:
+                log.warning(
+                    f"No best weights found for replica {idx+1}, skipping saving best parameters."
+                )
+
+
 def gen_tensorboard_callback(log_dir, profiling=False, histogram_freq=0):
     """
     Generate tensorboard logging details at ``log_dir``.

diff --git a/n3fit/src/n3fit/io/writer.py b/n3fit/src/n3fit/io/writer.py
@@ -220,7 +220,9 @@
 
 
 class WriterWrapper:
-    def __init__(self, replica_numbers, pdf_objects, stopping_object, all_chi2s, theory, timings, trials):
+    def __init__(
+        self, replica_numbers, pdf_objects, stopping_object, all_chi2s, theory, timings, trials
+    ):
         """
         Initializes the writer for all replicas.
 
@@ -298,18 +300,18 @@ def _hyperparam_settings(self, replica_number):
             trials_number = self.trials["number_of_trials"]
             idx_trial = replica_number % trials_number
             hyperparam_info = {}
-            hyperparam_info["optimizer"]=self.trials["optimizer"][idx_trial]
-            hyperparam_info["learning_rate"]=self.trials["learning_rate"][idx_trial]
-            hyperparam_info["clipnorm"]=self.trials["clipnorm"][idx_trial]
-            hyperparam_info["epochs"]=self.trials["epochs"][idx_trial]
-            hyperparam_info["stopping_patience"]=self.trials["stopping_patience"][idx_trial]
-            hyperparam_info["initial"]=self.trials["initial"][idx_trial]
-            hyperparam_info["nodes_per_layer"]=self.trials["nodes_per_layer"][idx_trial]
-            hyperparam_info["number_of_layers"]=self.trials["number_of_layers"][idx_trial]
-            hyperparam_info["activation"]=self.trials["activation_per_layer"][idx_trial]
-            hyperparam_info["layer_type"]=self.trials["layer_type"][idx_trial]
-            hyperparam_info["initializer"]=self.trials["initializer"][idx_trial]
-            hyperparam_info["dropout"]=self.trials["dropout"][idx_trial]
+            hyperparam_info["optimizer"] = self.trials["optimizer"][idx_trial]
+            hyperparam_info["learning_rate"] = self.trials["learning_rate"][idx_trial]
+            hyperparam_info["clipnorm"] = self.trials["clipnorm"][idx_trial]
+            hyperparam_info["epochs"] = self.trials["epochs"][idx_trial]
+            hyperparam_info["stopping_patience"] = self.trials["stopping_patience"][idx_trial]
+            hyperparam_info["initial"] = self.trials["initial"][idx_trial]
+            hyperparam_info["nodes_per_layer"] = self.trials["nodes_per_layer"][idx_trial]
+            hyperparam_info["number_of_layers"] = self.trials["number_of_layers"][idx_trial]
+            hyperparam_info["activation"] = self.trials["activation_per_layer"][idx_trial]
+            hyperparam_info["layer_type"] = self.trials["layer_type"][idx_trial]
+            hyperparam_info["initializer"] = self.trials["initializer"][idx_trial]
+            hyperparam_info["dropout"] = self.trials["dropout"][idx_trial]
             return hyperparam_info
         else:
             hyperparam_info = "from runcard"
@@ -329,6 +331,11 @@ def _write_metadata_json(self, i, replica_number, out_path):
             # Note: the 2 arguments below are the same for all replicas, unless run separately
             timing=self.timings,
             stop_epoch=self.stopping_object.stop_epoch,
+            would_stop_epoch=(
+                self.stopping_object.would_stop_epoch
+                if self.stopping_object._dont_stop
+                else self.stopping_object.stop_epoch
+            ),
         )
 
         with open(out_path, "w", encoding="utf-8") as fs:
@@ -373,6 +380,7 @@ def jsonfit(
     true_chi2,
     stop_epoch,
     timing,
+    would_stop_epoch,
     hyperparam_info,
 ):
     """Generates a dictionary containing all relevant metadata for the fit
@@ -399,7 +407,9 @@ def jsonfit(
             epoch at which the stopping stopped (not the one for the best fit!)
         timing: dict
             dictionary of the timing of the different events that happened
-        hyperparam_info: dict 
+        would_stop_epoch: int
+            epoch at which the stopping would have stopped if it were not set to "dont_stop"
+        hyperparam_info: dict
             dictionary of hyperparameter settings
     """
     all_info = {}
@@ -415,6 +425,7 @@ def jsonfit(
     all_info["arc_lengths"] = arc_lengths
     all_info["integrability"] = integrability_numbers
     all_info["timing"] = timing
+    all_info["would_stop_epoch"] = would_stop_epoch
     all_info["hyperparameters"] = hyperparam_info
     # Versioning info
     all_info["version"] = version()

diff --git a/n3fit/src/n3fit/model_gen.py b/n3fit/src/n3fit/model_gen.py
@@ -347,6 +347,9 @@ class ReplicaSettings:
             e.g. ``dense`` or ``dense_per_flavour``
         initializer: str
             initializer to be used for this replica
+        initializer_scale: float
+            width multiplier for the initializer distribution. Only affects ``glorot_normal``
+            (weight std scales as sqrt(scale)); 1.0 reproduces standard glorot_normal
         dropout: float
             rate of dropout for each layer
         regularizer: str
@@ -360,6 +363,7 @@ class ReplicaSettings:
     activations: list[str]
     architecture: str = "dense"
     initializer: str = "glorot_normal"
+    initializer_scale: float = 1.0
     dropout_rate: float = 0.0
     regularizer: str = None
     regularizer_args: dict = field(default_factory=dict)
@@ -806,6 +810,7 @@ def _generate_nn(
     activations: list[str] = None,
     architecture: str = "dense",
     initializer: str = None,
+    initializer_scale: float = 1.0,
     dropout_rate: float = 0.0,
     regularizer: str = None,
     regularizer_args: dict = field(default_factory=dict),
@@ -848,7 +853,7 @@ def layer_generator(i_layer, nodes_out, activation):
             """Generate the ``i_layer``-th dense_per_flavour layer for all replicas."""
             l_seed = int(seed + i_layer * n_flavours)
             initializers = [
-                MetaLayer.select_initializer(initializer, seed=l_seed + b)
+                MetaLayer.select_initializer(initializer, seed=l_seed + b, scale=initializer_scale)
                 for b in range(n_flavours)
             ]
             layer = base_layer_selector(
@@ -863,7 +868,9 @@ def layer_generator(i_layer, nodes_out, activation):
     elif architecture == "dense":
 
         def layer_generator(i_layer, nodes_out, activation):
-            kini = MetaLayer.select_initializer(initializer, seed=int(seed + i_layer))
+            kini = MetaLayer.select_initializer(
+                initializer, seed=int(seed + i_layer), scale=initializer_scale
+            )
             return base_layer_selector(
                 architecture,
                 kernel_initializer=kini,

diff --git a/n3fit/src/n3fit/model_trainer.py b/n3fit/src/n3fit/model_trainer.py
@@ -13,6 +13,7 @@
 from itertools import zip_longest
 import json
 import logging
+import pickle
 
 import numpy as np
 
@@ -113,6 +114,10 @@ def __init__(
         theoryid=None,
         lux_params=None,
         replicas=None,
+        save_checkpoints=False,
+        replica_path=None,
+        checkpoint_freq=100,
+        dont_stop=False,
         trials=None,
         load_weights_dict=None,
     ):
@@ -155,6 +160,15 @@ def __init__(
                 if not give, the photon is not generated
             replicas: list
                 list with the replicas ids to be fitted
+            save_checkpoints: bool
+                whether to save checkpoints (i.e. model parameters) during the fit. This requires
+                `replica_path` to be set as well; otherwise a ValueError is raised.
+            replica_path: Path
+                root path for all replicas; checkpoints go to `replica_path.parent / "fit_replicas"`.
+            checkpoint_freq: int
+                frequency (in epochs) at which to save checkpoints. Only relevant if `save_checkpoints` is True.
+            dont_stop: bool
+                whether to disable the stopping mechanism, i.e. to run for all epochs regardless of the validation chi2
             trials: str
                 name of the file containing the trials defining the methodology
         """
@@ -173,6 +187,14 @@ def __init__(
         self.lux_params = lux_params
         self.replicas = replicas
         self.experiments_data = experiments_data
+        self.dont_stop = dont_stop
+
+        # Checkpointing options
+        self.save_checkpoints = save_checkpoints
+        self.replica_path = replica_path
+        self.checkpoint_freq = checkpoint_freq
+        if self.save_checkpoints and self.replica_path is None:
+            raise ValueError("To save checkpoints, the 'replica_path' key must be set as well.")
         self.trials = trials
 
         # Initialise internal variables which define behaviour
@@ -728,11 +750,24 @@ def _train_and_fit(self, training_model, stopping_object, epochs=100) -> bool:
             self.training["integmultipliers"],
             update_freq=PUSH_INTEGRABILITY_EACH,
         )
+        callback_list = [callback_st, callback_pos, callback_integ]
+
+        if self.save_checkpoints:
+            pdf_model = training_model.get_layer("PDFs")
+            # Save parameters where colibri will look for checkpoints
+            replica_paths = [
+                self.replica_path.parent / f"fit_replicas/replica_{r}" for r in self.replicas
+            ]
+            checpoint_callback = callbacks.StoreCallback(
+                pdf_model=pdf_model,
+                replica_paths=replica_paths,
+                check_freq=self.checkpoint_freq,
+                stopping_object=stopping_object,
+            )
+            callback_list.append(checpoint_callback)
 
         training_model.perform_fit(
-            epochs=epochs,
-            verbose=False,
-            callbacks=self.callbacks + [callback_st, callback_pos, callback_integ],
+            epochs=epochs, verbose=False, callbacks=self.callbacks + callback_list
         )
 
     def _hyperopt_override(self, params):
@@ -928,6 +963,7 @@ def hyperparametrizable(self, params):
                     nodes=params["nodes_per_layer"],
                     activations=params["activation_per_layer"],
                     initializer=params["initializer"],
+                    initializer_scale=params.get("initializer_scale", 1.0),
                     architecture=params["layer_type"],
                     dropout_rate=params["dropout"],
                     regularizer=params.get("regularizer"),
@@ -949,13 +985,34 @@ def hyperparametrizable(self, params):
                     nodes=self.trials["nodes_per_layer"][idx_hyperparamters],
                     activations=activations,
                     initializer=self.trials["initializer"][idx_hyperparamters],
+                    initializer_scale=params.get("initializer_scale", 1.0),
                     architecture=self.trials["layer_type"][idx_hyperparamters],
                     dropout_rate=self.trials["dropout"][idx_hyperparamters],
                     regularizer=params.get("regularizer"),
                     regularizer_args=params.get("regularizer_args"),
                 )
                 replicas_settings.append(tmp)
 
+        # TODO: temporary fix to use NTK utilities in colibri
+        # Create model pkl for colibri n3fit module
+        _init_args = {
+            "flav_info": self.flavinfo,
+            "replica_range_settings": {
+                "min_replica": np.sort(self.replicas)[0],
+                "max_replica": np.sort(self.replicas)[0],
+            },
+            "impose_sumrule": self.impose_sumrule,
+            "fitbasis": self.fitbasis,
+            "nodes": params["nodes_per_layer"],
+            "activations": params["activation_per_layer"],
+            "initializer_name": params["initializer"],
+            "layer_type": params["layer_type"],
+        }
+        state = {"_init_args": _init_args}
+
+        with open(self.replica_path.parent / "pdf_model.pkl", "wb") as file:
+            pickle.dump(state, file)
+
         ### Training loop
         for k, partition in enumerate(self.kpartitions):
 
@@ -1030,6 +1087,7 @@ def hyperparametrizable(self, params):
                 stopping_patience=stopping_epochs,
                 threshold_positivity=threshold_pos,
                 threshold_chi2=threshold_chi2,
+                dont_stop=self.dont_stop,
             )
 
             if self.mode_hyperopt or (not self.trials):