Merge branch 'main' into tutorial_nits

sarahyurick · web-flow · commit e97c36265cfc · 2026-02-12T08:22:26.000-08:00
diff --git a/.github/workflows/config/.secrets.baseline b/.github/workflows/config/.secrets.baseline
@@ -129,15 +129,6 @@
     }
   ],
   "results": {
-    ".github/workflows/cicd-main.yml": [
-      {
-        "type": "Hex High Entropy String",
-        "filename": ".github/workflows/cicd-main.yml",
-        "hashed_secret": "9b048088dfb38c9b23ea87ab0226e4b80c3fce4f",
-        "is_verified": false,
-        "line_number": 95
-      }
-    ],
     "docs/_extensions/ai_assistant/README.md": [
       {
         "type": "Secret Keyword",
@@ -172,6 +163,54 @@
         "line_number": 15
       }
     ],
+    "docs/curate-text/synthetic/index.md": [
+      {
+        "type": "Secret Keyword",
+        "filename": "docs/curate-text/synthetic/index.md",
+        "hashed_secret": "6d9c68c603e465077bdd49c62347fe54717f83a3",
+        "is_verified": false,
+        "line_number": 70
+      }
+    ],
+    "docs/curate-text/synthetic/llm-client.md": [
+      {
+        "type": "Secret Keyword",
+        "filename": "docs/curate-text/synthetic/llm-client.md",
+        "hashed_secret": "e6bdb3f031eea3001ca83dd43d7d49d65a7a6ce5",
+        "is_verified": false,
+        "line_number": 33
+      },
+      {
+        "type": "Secret Keyword",
+        "filename": "docs/curate-text/synthetic/llm-client.md",
+        "hashed_secret": "2083c49ad8d63838a4d18f1de0c419f06eb464db",
+        "is_verified": false,
+        "line_number": 44
+      },
+      {
+        "type": "Secret Keyword",
+        "filename": "docs/curate-text/synthetic/llm-client.md",
+        "hashed_secret": "ec3810e10fb78db55ce38b9c18d1c3eb1db739e0",
+        "is_verified": false,
+        "line_number": 158
+      },
+      {
+        "type": "Secret Keyword",
+        "filename": "docs/curate-text/synthetic/llm-client.md",
+        "hashed_secret": "11fa7c37d697f30e6aee828b4426a10f83ab2380",
+        "is_verified": false,
+        "line_number": 165
+      }
+    ],
+    "docs/curate-text/synthetic/multilingual-qa.md": [
+      {
+        "type": "Secret Keyword",
+        "filename": "docs/curate-text/synthetic/multilingual-qa.md",
+        "hashed_secret": "2083c49ad8d63838a4d18f1de0c419f06eb464db",
+        "is_verified": false,
+        "line_number": 30
+      }
+    ],
     "tests/models/client/test_openai_client.py": [
       {
         "type": "Secret Keyword",
@@ -187,7 +226,7 @@
         "filename": "tutorials/synthetic/README.md",
         "hashed_secret": "aecdccc1cf64595b34e0cc152d238daabb32183a",
         "is_verified": false,
-        "line_number": 18
+        "line_number": 19
       }
     ],
     "tutorials/text/deduplication/semantic/semantic_e2e.ipynb": [
@@ -196,7 +235,7 @@
         "filename": "tutorials/text/deduplication/semantic/semantic_e2e.ipynb",
         "hashed_secret": "344b84ad013e4ab6518c0dc457416855b7b86904",
         "is_verified": false,
-        "line_number": 736
+        "line_number": 744
       }
     ],
     "tutorials/text/deduplication/semantic/semantic_step_by_step.ipynb": [
@@ -205,9 +244,9 @@
         "filename": "tutorials/text/deduplication/semantic/semantic_step_by_step.ipynb",
         "hashed_secret": "344b84ad013e4ab6518c0dc457416855b7b86904",
         "is_verified": false,
-        "line_number": 728
+        "line_number": 736
       }
     ]
   },
-  "generated_at": "2026-01-30T23:03:31Z"
+  "generated_at": "2026-02-11T21:26:53Z"
 }
diff --git a/tutorials/text/deduplication/fuzzy/fuzzy_e2e.ipynb b/tutorials/text/deduplication/fuzzy/fuzzy_e2e.ipynb
@@ -267,10 +267,18 @@
    "source": [
     "import time\n",
     "\n",
+    "import torch\n",
+    "\n",
     "from nemo_curator.backends.experimental.ray_data import RayDataExecutor\n",
     "from nemo_curator.core.client import RayClient\n",
     "\n",
-    "client = RayClient(num_cpus=64, num_gpus=2)  # change as needed\n",
+    "NUM_GPUS = 2\n",
+    "\n",
+    "if torch.cuda.device_count() < NUM_GPUS:\n",
+    "    error_msg = \"The number of GPUs on this machine is less than the default this tutorial was tested with; please update `NUM_GPUS`, which is passed to `RayClient` as `num_gpus`.\"\n",
+    "    raise ValueError(error_msg)\n",
+    "\n",
+    "client = RayClient(num_cpus=64, num_gpus=NUM_GPUS)  # change as needed\n",
     "client.start()"
    ]
   },
@@ -1501,7 +1509,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.12.3"
+   "version": "3.12.11"
   }
  },
  "nbformat": 4,
diff --git a/tutorials/text/deduplication/semantic/semantic_e2e.ipynb b/tutorials/text/deduplication/semantic/semantic_e2e.ipynb
@@ -253,10 +253,18 @@
     }
    ],
    "source": [
+    "import torch\n",
+    "\n",
     "from nemo_curator.core.client import RayClient\n",
     "\n",
     "# Number of GPUs should be roughly 2x the memory of the embeddings\n",
-    "client = RayClient(num_cpus=64, num_gpus=4)\n",
+    "NUM_GPUS = 4\n",
+    "\n",
+    "if torch.cuda.device_count() < NUM_GPUS:\n",
+    "    error_msg = \"The number of GPUs on this machine is less than the default this tutorial was tested with; please update `NUM_GPUS`, which is passed to `RayClient` as `num_gpus`.\"\n",
+    "    raise ValueError(error_msg)\n",
+    "\n",
+    "client = RayClient(num_cpus=64, num_gpus=NUM_GPUS)\n",
     "client.start()\n",
     "try:\n",
     "    workflow.run()\n",
@@ -994,7 +1002,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },
diff --git a/tutorials/text/deduplication/semantic/semantic_step_by_step.ipynb b/tutorials/text/deduplication/semantic/semantic_step_by_step.ipynb
@@ -122,10 +122,18 @@
    "metadata": {},
    "outputs": [],
    "source": [
+    "import torch\n",
+    "\n",
     "from nemo_curator.core.client import RayClient\n",
     "\n",
     "# Number of GPUs should be roughly 2x the memory of the embeddings\n",
-    "client = RayClient(num_cpus=64, num_gpus=4)\n",
+    "NUM_GPUS = 4\n",
+    "\n",
+    "if torch.cuda.device_count() < NUM_GPUS:\n",
+    "    error_msg = \"The number of GPUs on this machine is less than the default this tutorial was tested with; please update `NUM_GPUS`, which is passed to `RayClient` as `num_gpus`.\"\n",
+    "    raise ValueError(error_msg)\n",
+    "\n",
+    "client = RayClient(num_cpus=64, num_gpus=NUM_GPUS)\n",
     "client.start()"
    ]
   },
@@ -1142,7 +1150,7 @@
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3",
+   "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
   },