Skip to content

Commit 1a247f5

Browse files
authored
Write video blobs as is in parquet (#7976)
* plain encoding, uncompressed, no dict for videos in parquet
* style
1 parent 518bf32 commit 1a247f5

File tree

1 file changed

+11
-0
lines changed

1 file changed

+11
-0
lines changed

src/datasets/io/parquet.py

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,7 @@
77

88
from .. import Dataset, Features, NamedSplit, config
99
from ..arrow_writer import get_writer_batch_size_from_data_size, get_writer_batch_size_from_features
10+
from ..features.features import require_storage_embed
1011
from ..formatting import query_table
1112
from ..packaged_modules import _PACKAGED_DATASETS_MODULES
1213
from ..packaged_modules.parquet.parquet import Parquet
@@ -126,6 +127,16 @@ def _write(self, file_obj: BinaryIO, batch_size: int, **parquet_writer_kwargs) -
126127
schema=schema,
127128
use_content_defined_chunking=self.use_content_defined_chunking,
128129
write_page_index=self.write_page_index,
130+
compression={
131+
col: "none" if require_storage_embed(feature) else "snappy"
132+
for col, feature in self.dataset.features.items()
133+
},
134+
use_dictionary=[
135+
col for col, feature in self.dataset.features.items() if not require_storage_embed(feature)
136+
],
137+
column_encoding={
138+
col: "PLAIN" for col, feature in self.dataset.features.items() if require_storage_embed(feature)
139+
},
129140
**parquet_writer_kwargs,
130141
)
131142

0 commit comments

Comments
 (0)