Skip to content

Commit 6db5f80

Browse files
authored
Remove unused data files optims (#7985)
* remove unused data files optims
* style
1 parent 224b4e6 commit 6db5f80

File tree

3 files changed: 5 additions and 10 deletions.

src/datasets/config.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -257,9 +257,7 @@
 STREAMING_OPEN_RETRY_INTERVAL = 5
 
 # Datasets repositories exploration
-DATA_FILES_MAX_NUMBER_FOR_MODULE_INFERENCE = 200
-GLOBBED_DATA_FILES_MAX_NUMBER_FOR_MODULE_INFERENCE = 10
-ARCHIVED_DATA_FILES_MAX_NUMBER_FOR_MODULE_INFERENCE = 200
+ARCHIVES_MAX_NUMBER_FOR_MODULE_INFERENCE = 10
 
 # Async map functions
 MAX_NUM_RUNNING_ASYNC_MAP_FUNCTIONS_IN_PARALLEL = 1000

src/datasets/load.py

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -218,7 +218,7 @@ def infer_module_for_data_files_list(
 """
 extensions_counter = Counter(
 ("." + suffix.lower(), xbasename(filepath) in FolderBasedBuilder.METADATA_FILENAMES)
-for filepath in data_files_list[: config.DATA_FILES_MAX_NUMBER_FOR_MODULE_INFERENCE]
+for filepath in data_files_list
 for suffix in xbasename(filepath).split(".")[1:]
 )
 if extensions_counter:
@@ -255,14 +255,11 @@ def infer_module_for_data_files_list_in_archives(
 for filepath in data_files_list:
 if str(filepath).endswith(".zip"):
 archive_files_counter += 1
-if archive_files_counter > config.GLOBBED_DATA_FILES_MAX_NUMBER_FOR_MODULE_INFERENCE:
+if archive_files_counter > config.ARCHIVES_MAX_NUMBER_FOR_MODULE_INFERENCE:
 break
 extracted = xjoin(StreamingDownloadManager().extract(filepath), "**")
 archived_files += [
-f.split("::")[0]
-for f in xglob(extracted, recursive=True, download_config=download_config)[
-: config.ARCHIVED_DATA_FILES_MAX_NUMBER_FOR_MODULE_INFERENCE
-]
+f.split("::")[0] for f in xglob(extracted, recursive=True, download_config=download_config)
 ]
 extensions_counter = Counter(
 "." + suffix.lower() for filepath in archived_files for suffix in xbasename(filepath).split(".")[1:]

tests/test_arrow_dataset.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1926,7 +1926,7 @@ def test_filter(self, in_memory):
 with self._create_dummy_dataset(in_memory, tmp_dir) as dset:
 dset.set_format("numpy")
 fingerprint = dset._fingerprint
-with dset.filter(lambda x: (int(x["filename"][-1]) % 2 == 0)) as dset_filter_even_num:
+with dset.filter(lambda x: int(x["filename"][-1]) % 2 == 0) as dset_filter_even_num:
 self.assertEqual(len(dset_filter_even_num), 15)
 self.assertDictEqual(dset.features, Features({"filename": Value("string")}))
 self.assertDictEqual(dset_filter_even_num.features, Features({"filename": Value("string")}))

0 commit comments

Comments
 (0)