Skip to content

Commit e97c362

Browse files
authored
Merge branch 'main' into tutorial_nits
2 parents f8b7170 + 731a24f commit e97c362

File tree

4 files changed

+82
-19
lines changed

4 files changed

+82
-19
lines changed

.github/workflows/config/.secrets.baseline

Lines changed: 52 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -129,15 +129,6 @@
129129
}
130130
],
131131
"results": {
132-
".github/workflows/cicd-main.yml": [
133-
{
134-
"type": "Hex High Entropy String",
135-
"filename": ".github/workflows/cicd-main.yml",
136-
"hashed_secret": "9b048088dfb38c9b23ea87ab0226e4b80c3fce4f",
137-
"is_verified": false,
138-
"line_number": 95
139-
}
140-
],
141132
"docs/_extensions/ai_assistant/README.md": [
142133
{
143134
"type": "Secret Keyword",
@@ -172,6 +163,54 @@
172163
"line_number": 15
173164
}
174165
],
166+
"docs/curate-text/synthetic/index.md": [
167+
{
168+
"type": "Secret Keyword",
169+
"filename": "docs/curate-text/synthetic/index.md",
170+
"hashed_secret": "6d9c68c603e465077bdd49c62347fe54717f83a3",
171+
"is_verified": false,
172+
"line_number": 70
173+
}
174+
],
175+
"docs/curate-text/synthetic/llm-client.md": [
176+
{
177+
"type": "Secret Keyword",
178+
"filename": "docs/curate-text/synthetic/llm-client.md",
179+
"hashed_secret": "e6bdb3f031eea3001ca83dd43d7d49d65a7a6ce5",
180+
"is_verified": false,
181+
"line_number": 33
182+
},
183+
{
184+
"type": "Secret Keyword",
185+
"filename": "docs/curate-text/synthetic/llm-client.md",
186+
"hashed_secret": "2083c49ad8d63838a4d18f1de0c419f06eb464db",
187+
"is_verified": false,
188+
"line_number": 44
189+
},
190+
{
191+
"type": "Secret Keyword",
192+
"filename": "docs/curate-text/synthetic/llm-client.md",
193+
"hashed_secret": "ec3810e10fb78db55ce38b9c18d1c3eb1db739e0",
194+
"is_verified": false,
195+
"line_number": 158
196+
},
197+
{
198+
"type": "Secret Keyword",
199+
"filename": "docs/curate-text/synthetic/llm-client.md",
200+
"hashed_secret": "11fa7c37d697f30e6aee828b4426a10f83ab2380",
201+
"is_verified": false,
202+
"line_number": 165
203+
}
204+
],
205+
"docs/curate-text/synthetic/multilingual-qa.md": [
206+
{
207+
"type": "Secret Keyword",
208+
"filename": "docs/curate-text/synthetic/multilingual-qa.md",
209+
"hashed_secret": "2083c49ad8d63838a4d18f1de0c419f06eb464db",
210+
"is_verified": false,
211+
"line_number": 30
212+
}
213+
],
175214
"tests/models/client/test_openai_client.py": [
176215
{
177216
"type": "Secret Keyword",
@@ -187,7 +226,7 @@
187226
"filename": "tutorials/synthetic/README.md",
188227
"hashed_secret": "aecdccc1cf64595b34e0cc152d238daabb32183a",
189228
"is_verified": false,
190-
"line_number": 18
229+
"line_number": 19
191230
}
192231
],
193232
"tutorials/text/deduplication/semantic/semantic_e2e.ipynb": [
@@ -196,7 +235,7 @@
196235
"filename": "tutorials/text/deduplication/semantic/semantic_e2e.ipynb",
197236
"hashed_secret": "344b84ad013e4ab6518c0dc457416855b7b86904",
198237
"is_verified": false,
199-
"line_number": 736
238+
"line_number": 744
200239
}
201240
],
202241
"tutorials/text/deduplication/semantic/semantic_step_by_step.ipynb": [
@@ -205,9 +244,9 @@
205244
"filename": "tutorials/text/deduplication/semantic/semantic_step_by_step.ipynb",
206245
"hashed_secret": "344b84ad013e4ab6518c0dc457416855b7b86904",
207246
"is_verified": false,
208-
"line_number": 728
247+
"line_number": 736
209248
}
210249
]
211250
},
212-
"generated_at": "2026-01-30T23:03:31Z"
251+
"generated_at": "2026-02-11T21:26:53Z"
213252
}

tutorials/text/deduplication/fuzzy/fuzzy_e2e.ipynb

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -267,10 +267,18 @@
267267
"source": [
268268
"import time\n",
269269
"\n",
270+
"import torch\n",
271+
"\n",
270272
"from nemo_curator.backends.experimental.ray_data import RayDataExecutor\n",
271273
"from nemo_curator.core.client import RayClient\n",
272274
"\n",
273-
"client = RayClient(num_cpus=64, num_gpus=2) # change as needed\n",
275+
"NUM_GPUS = 2\n",
276+
"\n",
277+
"if torch.cuda.device_count() < NUM_GPUS:\n",
278+
" error_msg = \"The number of GPUs on this machine is less than the default this tutorial was tested with; please update `NUM_GPUS`, which is passed to `RayClient` as `num_gpus`\"\n",
279+
" raise ValueError(error_msg)\n",
280+
"\n",
281+
"client = RayClient(num_cpus=64, num_gpus=NUM_GPUS) # change as needed\n",
274282
"client.start()"
275283
]
276284
},
@@ -1501,7 +1509,7 @@
15011509
"name": "python",
15021510
"nbconvert_exporter": "python",
15031511
"pygments_lexer": "ipython3",
1504-
"version": "3.12.3"
1512+
"version": "3.12.11"
15051513
}
15061514
},
15071515
"nbformat": 4,

tutorials/text/deduplication/semantic/semantic_e2e.ipynb

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -253,10 +253,18 @@
253253
}
254254
],
255255
"source": [
256+
"import torch\n",
257+
"\n",
256258
"from nemo_curator.core.client import RayClient\n",
257259
"\n",
258260
"# Combined memory of the GPUs should be roughly 2x the memory of the embeddings\n",
259-
"client = RayClient(num_cpus=64, num_gpus=4)\n",
261+
"NUM_GPUS = 4\n",
262+
"\n",
263+
"if torch.cuda.device_count() < NUM_GPUS:\n",
264+
" error_msg = \"The number of GPUs on this machine is less than the default this tutorial was tested with; please update `NUM_GPUS`, which is passed to `RayClient` as `num_gpus`\"\n",
265+
" raise ValueError(error_msg)\n",
266+
"\n",
267+
"client = RayClient(num_cpus=64, num_gpus=NUM_GPUS)\n",
260268
"client.start()\n",
261269
"try:\n",
262270
" workflow.run()\n",
@@ -994,7 +1002,7 @@
9941002
],
9951003
"metadata": {
9961004
"kernelspec": {
997-
"display_name": "Python 3",
1005+
"display_name": "Python 3 (ipykernel)",
9981006
"language": "python",
9991007
"name": "python3"
10001008
},

tutorials/text/deduplication/semantic/semantic_step_by_step.ipynb

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -122,10 +122,18 @@
122122
"metadata": {},
123123
"outputs": [],
124124
"source": [
125+
"import torch\n",
126+
"\n",
125127
"from nemo_curator.core.client import RayClient\n",
126128
"\n",
127129
"# Combined memory of the GPUs should be roughly 2x the memory of the embeddings\n",
128-
"client = RayClient(num_cpus=64, num_gpus=4)\n",
130+
"NUM_GPUS = 4\n",
131+
"\n",
132+
"if torch.cuda.device_count() < NUM_GPUS:\n",
133+
" error_msg = \"The number of GPUs on this machine is less than the default this tutorial was tested with; please update `NUM_GPUS`, which is passed to `RayClient` as `num_gpus`\"\n",
134+
" raise ValueError(error_msg)\n",
135+
"\n",
136+
"client = RayClient(num_cpus=64, num_gpus=NUM_GPUS)\n",
129137
"client.start()"
130138
]
131139
},
@@ -1142,7 +1150,7 @@
11421150
],
11431151
"metadata": {
11441152
"kernelspec": {
1145-
"display_name": "Python 3",
1153+
"display_name": "Python 3 (ipykernel)",
11461154
"language": "python",
11471155
"name": "python3"
11481156
},

0 commit comments

Comments
 (0)