hf-public-data-insights / tables /datasets.example.yaml
Xianbao QIAN
query tables and create example
9279ca3
raw
history blame
10.8 kB
---
# Example snapshot of the `datasets` table: the column schema followed by a
# handful of sample rows.
#
# Conventions used below:
#   - Values of VARCHAR columns that a YAML loader could otherwise re-type
#     (JSON text, timestamps, hex ids, boolean-looking text) are quoted so
#     they load as strings.
#   - BOOLEAN columns use canonical `true` / `false`.
#   - A missing value is written as `null`; an empty string as ''.
datasets:
  # One entry per column: the column name and its SQL-style type.
  table_structure:
    - column: _id
      type: VARCHAR
    - column: id
      type: VARCHAR
    - column: author
      type: VARCHAR
    - column: cardData
      type: VARCHAR
    - column: disabled
      type: BOOLEAN
    - column: gated
      type: VARCHAR
    - column: lastModified
      type: VARCHAR
    - column: likes
      type: BIGINT
    - column: trendingScore
      type: DOUBLE
    - column: private
      type: BOOLEAN
    - column: sha
      type: VARCHAR
    - column: description
      type: VARCHAR
    - column: downloads
      type: BIGINT
    - column: tags
      # Quoted because the trailing `[]` are YAML flow indicators.
      type: 'VARCHAR[]'
    - column: createdAt
      type: VARCHAR
    - column: key
      type: VARCHAR
    - column: paperswithcode_id
      type: VARCHAR
    - column: citation
      type: VARCHAR

  # Sample rows. Each item has one key per column listed in table_structure.
  # `cardData` holds JSON text (the column is VARCHAR), so it is single-quoted
  # to stop YAML from parsing it as a flow mapping. `gated` is also VARCHAR,
  # hence the quoted 'False' rather than a boolean.
  random_items:
    - _id: '64c38c73a8de22f7a1a6c7e1'
      id: C-MTEB/EcomRetrieval-qrels
      author: C-MTEB
      cardData: '{"configs": [{"config_name": "default", "data_files": [{"split": "dev", "path": "data/dev-*"}]}], "dataset_info": {"features": [{"name": "qid", "dtype": "string"}, {"name": "pid", "dtype": "string"}, {"name": "score", "dtype": "int64"}], "splits": [{"name": "dev", "num_bytes": 27890, "num_examples": 1000}], "download_size": 14540, "dataset_size": 27890}}'
      disabled: false
      gated: 'False'
      lastModified: '2023-07-28T09:37:58.000Z'
      likes: 0
      trendingScore: 0.0
      private: false
      sha: '39c90699b034ec22ac45b3abf5b0bbb5ffd421f9'
      description: |-
        Dataset Card for "EcomRetrieval-qrels"
        More Information needed
      downloads: 2893
      tags: ['size_categories:1K<n<10K', 'format:parquet', 'modality:text', 'library:datasets', 'library:pandas', 'library:mlcroissant', 'library:polars', 'region:us']
      createdAt: '2023-07-28T09:37:55.000Z'
      key: ''
      paperswithcode_id: null
      citation: null

    - _id: '6571a64493ea01156d9c5f2b'
      id: topeomole/fin
      author: topeomole
      cardData: null
      disabled: false
      gated: 'False'
      lastModified: '2023-12-29T08:49:31.000Z'
      likes: 0
      trendingScore: 0.0
      private: false
      sha: '6e68c3a7cce23e0e650997d672fbef009b14af32'
      description: null
      downloads: 0
      tags: ['size_categories:10K<n<100K', 'format:csv', 'modality:text', 'library:datasets', 'library:dask', 'library:mlcroissant', 'library:polars', 'region:us']
      createdAt: '2023-12-07T11:02:28.000Z'
      key: ''
      paperswithcode_id: null
      citation: null

    - _id: '659193db35c41262d6a53f71'
      id: greathero/evenmorex11-newthreeclass-newercontrailsvalidationdataset
      author: greathero
      cardData: '{"dataset_info": {"features": [{"name": "pixel_values", "dtype": "image"}, {"name": "label", "dtype": "image"}], "splits": [{"name": "train", "num_bytes": 162552, "num_examples": 9}], "download_size": 38146, "dataset_size": 162552}, "configs": [{"config_name": "default", "data_files": [{"split": "train", "path": "data/train-*"}]}]}'
      disabled: false
      gated: 'False'
      lastModified: '2023-12-31T19:12:42.000Z'
      likes: 0
      trendingScore: 0.0
      private: false
      sha: 'b7b5c1282518329689aba655335e218e79831e97'
      description: null
      downloads: 0
      tags: ['size_categories:n<1K', 'format:parquet', 'modality:image', 'library:datasets', 'library:pandas', 'library:mlcroissant', 'library:polars', 'region:us']
      createdAt: '2023-12-31T16:16:27.000Z'
      key: ''
      paperswithcode_id: null
      citation: null

    - _id: '62d3dd1ac85b0fcf7fd98c9d'
      id: Maxmioti/GDRP-fines
      author: Maxmioti
      cardData: '{"license": "other"}'
      disabled: false
      gated: 'False'
      lastModified: '2022-07-17T10:03:34.000Z'
      likes: 0
      trendingScore: 0.0
      private: false
      sha: '85b9612b440ac0158d5722d0d45b849a012468ec'
      description: |-
        Opensource DataSet form a Kaggle competition https://www.kaggle.com/datasets/andreibuliga1/gdpr-fines-20182020-updated-23012021
        GDPR-fines is a dataset with summary of GDPR cases from companies that were find between 2018 and 2021. You will find the summary plus the Articles violated in the cases (3 most importants + "Others" regrouping the rest of articles).
        Raw text and lemmatized text available plus multi-labels.
      downloads: 0
      tags: ['license:other', 'size_categories:n<1K', 'format:csv', 'modality:tabular', 'modality:text', 'library:datasets', 'library:pandas', 'library:mlcroissant', 'library:polars', 'region:us']
      createdAt: '2022-07-17T09:57:46.000Z'
      key: ''
      paperswithcode_id: null
      citation: null

    - _id: '65f4980b0a4480a3d52dbbf3'
      id: reach-vb/mls-eng-10k-repunct-test-v4
      author: reach-vb
      cardData: '{"dataset_info": {"features": [{"name": "original_path", "dtype": "string"}, {"name": "begin_time", "dtype": "float64"}, {"name": "end_time", "dtype": "float64"}, {"name": "transcript", "dtype": "string"}, {"name": "audio_duration", "dtype": "float64"}, {"name": "speaker_id", "dtype": "string"}, {"name": "book_id", "dtype": "string"}, {"name": "repunct_text", "dtype": "string"}], "splits": [{"name": "dev", "num_bytes": 2182587, "num_examples": 3807}], "download_size": 1221776, "dataset_size": 2182587}, "configs": [{"config_name": "default", "data_files": [{"split": "dev", "path": "data/dev-*"}]}]}'
      disabled: false
      gated: 'False'
      lastModified: '2024-03-15T18:48:45.000Z'
      likes: 0
      trendingScore: 0.0
      private: false
      sha: '0f7c184d7e9f0f69babcca664bac0fcda107ef17'
      description: null
      downloads: 0
      tags: ['size_categories:1K<n<10K', 'format:parquet', 'modality:tabular', 'modality:text', 'library:datasets', 'library:pandas', 'library:mlcroissant', 'library:polars', 'region:us']
      createdAt: '2024-03-15T18:48:43.000Z'
      key: ''
      paperswithcode_id: null
      citation: null

    - _id: '661fa0bcb0e9b7e049a9202b'
      id: atluzz/train_doc_en_jsonl
      author: atluzz
      cardData: '{"license": "apache-2.0"}'
      disabled: false
      gated: 'False'
      lastModified: '2024-04-24T10:38:19.000Z'
      likes: 0
      trendingScore: 0.0
      private: false
      sha: '5aba071b4fb548841207b134d5f22f981a3ba89b'
      description: null
      downloads: 0
      tags: ['license:apache-2.0', 'size_categories:n<1K', 'format:json', 'modality:text', 'library:datasets', 'library:pandas', 'library:mlcroissant', 'library:polars', 'region:us']
      createdAt: '2024-04-17T10:13:16.000Z'
      key: ''
      paperswithcode_id: null
      citation: null

    - _id: '662ed2502e1ed663bc3570c9'
      id: leharris3/airline_reviews_servq
      author: leharris3
      cardData: '{"license": "mit"}'
      disabled: false
      gated: 'False'
      lastModified: '2024-04-30T00:16:58.000Z'
      likes: 0
      trendingScore: 0.0
      private: false
      sha: 'cf7deac5762779e544ed94d8445d074725d42cd3'
      description: null
      downloads: 0
      tags: ['license:mit', 'size_categories:10K<n<100K', 'format:csv', 'modality:text', 'library:datasets', 'library:pandas', 'library:mlcroissant', 'library:polars', 'region:us']
      createdAt: '2024-04-28T22:48:48.000Z'
      key: ''
      paperswithcode_id: null
      citation: null

    - _id: '6670d2f2b0cc541843371fb4'
      id: rahulsikder223/SentEval-CR
      author: rahulsikder223
      cardData: '{"dataset_info": {"features": [{"name": "sentence", "dtype": "string"}, {"name": "label", "dtype": "int64"}], "splits": [{"name": "train", "num_bytes": 288477.7054304636, "num_examples": 2642}, {"name": "test", "num_bytes": 123711.29456953642, "num_examples": 1133}], "download_size": 246421, "dataset_size": 412189}, "configs": [{"config_name": "default", "data_files": [{"split": "train", "path": "data/train-*"}, {"split": "test", "path": "data/test-*"}]}]}'
      disabled: false
      gated: 'False'
      lastModified: '2024-06-18T00:40:19.000Z'
      likes: 0
      trendingScore: 0.0
      private: false
      sha: 'b092bc7763a0514f2c454d001abdfbf138a6be74'
      description: |-
        This is the SentEval Customer Reviews dataset which has been divided into train and test splits. Here, the labels are binary where 1 corresponds to 'Positive Reviews' and 0 corresponds to 'Negative Reviews'.
      downloads: 18
      tags: ['size_categories:1K<n<10K', 'format:parquet', 'modality:text', 'library:datasets', 'library:pandas', 'library:mlcroissant', 'library:polars', 'region:us']
      createdAt: '2024-06-18T00:21:06.000Z'
      key: ''
      paperswithcode_id: null
      citation: null

    - _id: '6412397cbf081e031e9391a7'
      id: fathyshalab/reklamation24_wasser-strom-gas-intent
      author: fathyshalab
      cardData: '{"dataset_info": {"features": [{"name": "text", "dtype": "string"}, {"name": "label", "dtype": "int64"}, {"name": "__index_level_0__", "dtype": "int64"}], "splits": [{"name": "train", "num_bytes": 203230, "num_examples": 383}, {"name": "test", "num_bytes": 52516, "num_examples": 96}], "download_size": 142247, "dataset_size": 255746}}'
      disabled: false
      gated: 'False'
      lastModified: '2023-03-15T22:09:40.000Z'
      likes: 0
      trendingScore: 0.0
      private: false
      sha: 'c8d8322e9e75b8c81eb9b31eb4f4ad9a420f6dbb'
      description: |-
        Dataset Card for "reklamation24_wasser-strom-gas-intent"
        More Information needed
      downloads: 0
      tags: ['size_categories:n<1K', 'format:parquet', 'modality:tabular', 'modality:text', 'library:datasets', 'library:pandas', 'library:mlcroissant', 'library:polars', 'region:us']
      createdAt: '2023-03-15T21:32:44.000Z'
      key: ''
      paperswithcode_id: null
      citation: null

    - _id: '66071d3e8ea886bc670da278'
      id: BangumiBase/saijakutamerwagomihiroinotabiwohajimemashita
      author: BangumiBase
      cardData: '{"license": "mit", "tags": ["art"], "size_categories": ["1K<n<10K"]}'
      disabled: false
      gated: 'False'
      lastModified: '2024-03-30T01:28:15.000Z'
      likes: 0
      trendingScore: 0.0
      private: false
      sha: 'ecf08621c8d5dc8d1051abdad6568f99e20b9bd0'
      # Literal block: the text contains ": ", which is not allowed inside a
      # plain (unquoted) scalar.
      description: |-
        Bangumi Image Base of Saijaku Tamer Wa Gomi Hiroi No Tabi Wo Hajimemashita
        This is the image base of bangumi Saijaku Tamer wa Gomi Hiroi no Tabi wo Hajimemashita, we detected 81 characters, 6058 images in total. The full dataset is here.
        Please note that these image bases are not guaranteed to be 100% cleaned, they may be noisy actual. If you intend to manually train models using this dataset, we recommend performing necessary preprocessing on the downloaded dataset to eliminate… See the full description on the dataset page: https://huggingface.co/datasets/BangumiBase/saijakutamerwagomihiroinotabiwohajimemashita.
      downloads: 0
      tags: ['license:mit', 'size_categories:1K<n<10K', 'modality:image', 'region:us', 'art']
      createdAt: '2024-03-29T19:57:50.000Z'
      key: ''
      paperswithcode_id: null
      citation: null