meg-huggingface committed
Commit 6af9ef6
1 Parent(s): 937841c

Splitting prepare_dataset into preparing the base dataset and the tokenized dataset. This gives us finer control over caching and loading data, and will eventually let us remove storage of the base dataset.

data_measurements/dataset_statistics.py CHANGED
@@ -436,24 +436,34 @@ class DatasetStatisticsCacheClass:
             with open(text_duplicate_counts_df_fid, "rb") as f:
                 self.text_dup_counts_df = feather.read_feather(f)
 
-    def load_or_prepare_dataset(self, use_cache=True, use_df=False, save=True):
+    def load_or_prepare_dataset(self, use_cache=True, save=True):
         """
-        Prepares the HF datasets and data frames containing the untokenized and tokenized
-        text as well as the label values. If cache is not being used (use_cache=False), writes the datasets to text.
-        :param use_cache:
-        :param use_df: Whether to used stored dataframes rather than dset files
-        :return:
+        Prepares the HF datasets and data frames containing the untokenized and
+        tokenized text as well as the label values.
+        self.tokenized_df is used further for calculating text lengths,
+        word counts, etc.
+        Args:
+            use_cache: Use stored data if there; otherwise calculate afresh.
+            save: Store the calculated data to disk.
+
+        Returns:
+
         """
-        ## Raw text first, then tokenization.
-        # Use what has been previously stored in DataFrame form or Dataset form.
-        if (
-            use_cache
-            and use_df
-            and exists(self.tokenized_df_fid)
-        ):
+        self.load_or_prepare_text_dset(save, use_cache)
+        self.load_or_prepare_tokenized_df(save, use_cache)
+
+    def load_or_prepare_tokenized_df(self, save, use_cache):
+        if (use_cache and exists(self.tokenized_df_fid)):
             self.tokenized_df = feather.read_feather(self.tokenized_df_fid)
-        elif (
-            use_cache and exists(self.text_dset_fid)):
+        else:
+            # tokenize all text instances
+            self.tokenized_df = self.do_tokenization()
+            if save:
+                # save tokenized text
+                write_df(self.tokenized_df, self.tokenized_df_fid)
+
+    def load_or_prepare_text_dset(self, save, use_cache):
+        if (use_cache and exists(self.text_dset_fid)):
             # load extracted text
             self.text_dset = load_from_disk(self.text_dset_fid)
             logs.warning("Loaded dataset from disk")
@@ -473,11 +483,6 @@ class DatasetStatisticsCacheClass:
             # save extracted text instances
             logs.warning("Saving dataset to disk")
             self.text_dset.save_to_disk(self.text_dset_fid)
-            # tokenize all text instances
-            self.tokenized_df = self.do_tokenization()
-            if save:
-                # save tokenized text
-                write_df(self.tokenized_df, self.tokenized_df_fid)
 
     def do_tokenization(self):
         """