|
118 | 118 | # * [Attention? |
119 | 119 | # Attention! |
120 | 120 | # (Lilian Weng, 2018)](https://lilianweng.github.io/lil-log/2018/06/24/attention-attention.html) - A nice blog post summarizing attention mechanisms in many domains including vision. |
121 | | -# * [Illustrated: Self-Attention (Raimi Karim, 2019)](https://towardsdatascience.com/illustrated-self-attention-2d627e33b20a) - A nice visualization of the steps of self-attention. |
| 121 | +# * [Illustrated: Self-Attention (Raimi Karim, 2019)](https://medium.com/data-science/illustrated-self-attention-2d627e33b20a) - A nice visualization of the steps of self-attention. |
122 | 122 | # Recommended going through if the explanation below is too abstract for you. |
123 | 123 | # * [The Transformer family (Lilian Weng, 2020)](https://lilianweng.github.io/lil-log/2020/04/07/the-transformer-family.html) - A very detailed blog post reviewing more variants of Transformers besides the original one. |
124 | 124 |
|
@@ -633,8 +633,8 @@ def forward(self, x): |
633 | 633 | fig, ax = plt.subplots(2, 2, figsize=(12, 4)) |
634 | 634 | ax = [a for a_list in ax for a in a_list] |
635 | 635 | for i in range(len(ax)): |
636 | | - ax[i].plot(np.arange(1, 17), pe[i, :16], color="C%i" % i, marker="o", markersize=6, markeredgecolor="black") |
637 | | - ax[i].set_title("Encoding in hidden dimension %i" % (i + 1)) |
| 636 | + ax[i].plot(np.arange(1, 17), pe[i, :16], color=f"C{i}", marker="o", markersize=6, markeredgecolor="black") |
| 637 | + ax[i].set_title(f"Encoding in hidden dimension {i + 1}") |
638 | 638 | ax[i].set_xlabel("Position in sequence", fontsize=10) |
639 | 639 | ax[i].set_ylabel("Positional encoding", fontsize=10) |
640 | 640 | ax[i].set_xticks(np.arange(1, 17)) |
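
For readers skimming this hunk: the `pe` array being plotted presumably holds the sinusoidal positional encodings from the original Transformer paper. A minimal, self-contained sketch of how such an array could be built; the `d_model` and `max_len` values are assumptions for illustration, not taken from this file:

```python
import numpy as np

# Assumed sizes, not from this file
d_model, max_len = 48, 96

position = np.arange(max_len)[:, None]                               # (max_len, 1)
div_term = np.exp(np.arange(0, d_model, 2) * (-np.log(10000.0) / d_model))
pe = np.zeros((max_len, d_model))
pe[:, 0::2] = np.sin(position * div_term)                            # even dims: sine
pe[:, 1::2] = np.cos(position * div_term)                            # odd dims: cosine
pe = pe.T                                                            # (d_model, max_len)
```

After the transpose, `pe[i, :16]` gives hidden dimension `i` over the first 16 positions, matching the indexing in the plotting loop above.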
@@ -1088,7 +1088,7 @@ def plot_attention_maps(input_data, attn_maps, idx=0): |
1088 | 1088 | ax[row][column].set_xticklabels(input_data.tolist()) |
1089 | 1089 | ax[row][column].set_yticks(list(range(seq_len))) |
1090 | 1090 | ax[row][column].set_yticklabels(input_data.tolist()) |
1091 | | - ax[row][column].set_title("Layer %i, Head %i" % (row + 1, column + 1)) |
| 1091 | + ax[row][column].set_title(f"Layer {row + 1}, Head {column + 1}") |
1092 | 1092 | fig.subplots_adjust(hspace=0.5) |
1093 | 1093 | plt.show() |
1094 | 1094 |
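The per-layer, per-head maps that `plot_attention_maps` lays out are presumably the softmax weights of scaled dot-product attention. A minimal sketch of that computation with hypothetical tensor shapes; none of the names below are taken from this file:

```python
import torch
import torch.nn.functional as F

def scaled_dot_product(q, k, v):
    d_k = q.size(-1)
    attn_logits = torch.matmul(q, k.transpose(-2, -1)) / d_k**0.5
    attention = F.softmax(attn_logits, dim=-1)  # (..., seq_len, seq_len) attention map
    values = torch.matmul(attention, v)
    return values, attention

# Hypothetical sizes: (batch, heads, seq_len, head_dim)
q = k = v = torch.randn(1, 4, 16, 8)
values, attn_map = scaled_dot_product(q, k, v)
print(attn_map.shape)  # torch.Size([1, 4, 16, 16]): one 16x16 map per head
```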
|
@@ -1590,7 +1590,7 @@ def visualize_prediction(idx): |
1590 | 1590 | visualize_prediction(mistakes[-1]) |
1591 | 1591 | print("Probabilities:") |
1592 | 1592 | for i, p in enumerate(preds[mistakes[-1]].cpu().numpy()): |
1593 | | - print("Image %i: %4.2f%%" % (i, 100.0 * p)) |
| 1593 | + print(f"Image {i}: {100.0 * p:4.2f}%") |
1594 | 1594 |
|
1595 | 1595 | # %% [markdown] |
1596 | 1596 | # In this example, the model confuses a palm tree with a building, giving a probability of ~90% to image 2, and 8% to the actual anomaly. |
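
For context, the probabilities printed in the hunk above could plausibly come from a softmax over one logit per image in the set. A hypothetical sketch of that step; the logit values are illustrative only, not the model's actual outputs:

```python
import torch
import torch.nn.functional as F

# Ten illustrative logits, one per image in the set (made-up values)
logits = torch.tensor([[-1.2, 0.4, 3.1, 0.7, -0.5, 0.6, 0.9, -2.0, 0.1, 0.8]])
preds = F.softmax(logits, dim=-1)  # probabilities over the images in the set
for i, p in enumerate(preds[0].tolist()):
    print(f"Image {i}: {100.0 * p:4.2f}%")
```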
|