Spaces:
Build error
Build error
meg-huggingface
committed on
Commit
•
6af9ef6
1
Parent(s):
937841c
Splitting prepare_dataset into preparing the base dataset, and the tokenized dataset. This will help us to have further control over caching and loading data, eventually removing the storage of base dataset.
Browse files
data_measurements/dataset_statistics.py
CHANGED
@@ -436,24 +436,34 @@ class DatasetStatisticsCacheClass:
|
|
436 |
with open(text_duplicate_counts_df_fid, "rb") as f:
|
437 |
self.text_dup_counts_df = feather.read_feather(f)
|
438 |
|
439 |
-
def load_or_prepare_dataset(self, use_cache=True,
|
440 |
"""
|
441 |
-
|
442 |
-
|
443 |
-
|
444 |
-
|
445 |
-
:
|
|
|
|
|
|
|
|
|
|
|
446 |
"""
|
447 |
-
|
448 |
-
|
449 |
-
|
450 |
-
|
451 |
-
|
452 |
-
and exists(self.tokenized_df_fid)
|
453 |
-
):
|
454 |
self.tokenized_df = feather.read_feather(self.tokenized_df_fid)
|
455 |
-
|
456 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
457 |
# load extracted text
|
458 |
self.text_dset = load_from_disk(self.text_dset_fid)
|
459 |
logs.warning("Loaded dataset from disk")
|
@@ -473,11 +483,6 @@ class DatasetStatisticsCacheClass:
|
|
473 |
# save extracted text instances
|
474 |
logs.warning("Saving dataset to disk")
|
475 |
self.text_dset.save_to_disk(self.text_dset_fid)
|
476 |
-
# tokenize all text instances
|
477 |
-
self.tokenized_df = self.do_tokenization()
|
478 |
-
if save:
|
479 |
-
# save tokenized text
|
480 |
-
write_df(self.tokenized_df, self.tokenized_df_fid)
|
481 |
|
482 |
def do_tokenization(self):
|
483 |
"""
|
|
|
436 |
with open(text_duplicate_counts_df_fid, "rb") as f:
|
437 |
self.text_dup_counts_df = feather.read_feather(f)
|
438 |
|
439 |
+
def load_or_prepare_dataset(self, use_cache=True, save=True):
|
440 |
"""
|
441 |
+
Prepares the HF datasets and data frames containing the untokenized and
|
442 |
+
tokenized text as well as the label values.
|
443 |
+
self.tokenized_df is used further for calculating text lengths,
|
444 |
+
word counts, etc.
|
445 |
+
Args:
|
446 |
+
use_cache: Used stored data if there; otherwise calculate afresh
|
447 |
+
save: Store the calculated data to disk.
|
448 |
+
|
449 |
+
Returns:
|
450 |
+
|
451 |
"""
|
452 |
+
self.load_or_prepare_text_dset(save, use_cache)
|
453 |
+
self.load_or_prepare_tokenized_df(save, use_cache)
|
454 |
+
|
455 |
+
def load_or_prepare_tokenized_df(self, save, use_cache):
|
456 |
+
if (use_cache and exists(self.tokenized_df_fid)):
|
|
|
|
|
457 |
self.tokenized_df = feather.read_feather(self.tokenized_df_fid)
|
458 |
+
else:
|
459 |
+
# tokenize all text instances
|
460 |
+
self.tokenized_df = self.do_tokenization()
|
461 |
+
if save:
|
462 |
+
# save tokenized text
|
463 |
+
write_df(self.tokenized_df, self.tokenized_df_fid)
|
464 |
+
|
465 |
+
def load_or_prepare_text_dset(self, save, use_cache):
|
466 |
+
if (use_cache and exists(self.text_dset_fid)):
|
467 |
# load extracted text
|
468 |
self.text_dset = load_from_disk(self.text_dset_fid)
|
469 |
logs.warning("Loaded dataset from disk")
|
|
|
483 |
# save extracted text instances
|
484 |
logs.warning("Saving dataset to disk")
|
485 |
self.text_dset.save_to_disk(self.text_dset_fid)
|
|
|
|
|
|
|
|
|
|
|
486 |
|
487 |
def do_tokenization(self):
|
488 |
"""
|