File size: 3,786 Bytes
84d8ab2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#!/usr/bin/env python3
"""
Serve a Hugging Face dataset.
"""

import dataclasses
import os
from typing import Optional

import datasets
import huggingface_hub
from renumics import spotlight  # type: ignore


def login() -> None:
    """
    Authenticate against the Hugging Face Hub.

    The access token is taken from the `HF_TOKEN` environment variable. When
    the variable is unset or empty, no login is attempted.
    """
    hub_token = os.environ.get("HF_TOKEN")
    if not hub_token:
        return
    huggingface_hub.login(hub_token)


@dataclasses.dataclass
class HFSettings:
    """
    Settings describing which Hugging Face dataset to load.

    `dataset`, `subset`, `split` and `revision` identify the main dataset;
    `enrichment` and `enrichment_revision` identify an optional second dataset.
    """

    dataset: str
    subset: Optional[str] = None
    split: Optional[str] = None
    revision: Optional[str] = None

    enrichment: Optional[str] = None
    enrichment_revision: Optional[str] = None

    @classmethod
    def from_environ(cls) -> "HFSettings":
        """
        Build the settings from `HF_*` environment variables.

        Raises `RuntimeError` if `HF_DATASET` is unset or empty.
        """

        def read(name: str) -> Optional[str]:
            # Unset and empty variables are both treated as "not provided".
            return os.environ.get(name) or None

        dataset_name = read("HF_DATASET")
        if dataset_name is None:
            raise RuntimeError(
                "Desired Hugging Face dataset must be set as `HF_DATASET` "
                "environment variable."
            )
        return cls(
            dataset=dataset_name,
            subset=read("HF_SUBSET"),
            split=read("HF_SPLIT"),
            revision=read("HF_REVISION"),
            enrichment=read("HF_ENRICHMENT"),
            enrichment_revision=read("HF_ENRICHMENT_REVISION"),
        )

    def __str__(self) -> str:
        # Only the fields identifying the main dataset are shown.
        template = "{dataset}[subset={subset},split={split},revision={revision}]"
        return template.format_map(vars(self))


if __name__ == "__main__":
    # Load the Hugging Face dataset configured through the `HF_*` environment
    # variables, optionally merge an enrichment dataset into it column-wise,
    # and serve the result with Renumics Spotlight.
    login()

    hf_settings = HFSettings.from_environ()
    print(f"Loading Hugging Face dataset {hf_settings}.")
    ds = datasets.load_dataset(
        hf_settings.dataset,
        hf_settings.subset,
        split=hf_settings.split,
        revision=hf_settings.revision,
    )
    # Validate the loaded type before anything else touches `ds`: the
    # enrichment merge and the column inspection below use `len(ds)`,
    # `ds.split` and `ds.features`, which assume a single `datasets.Dataset`.
    # Checking first ensures the user sees the hint about `HF_SUBSET` and
    # `HF_SPLIT` instead of an unrelated error raised further down.
    if not isinstance(ds, datasets.Dataset):
        raise TypeError(
            f"Loaded Hugging Face dataset is of type {type(ds)} instead of "
            "`datasets.Dataset`. Did you forget to specify subset and/or split "
            "(use environment variables `HF_SUBSET` and `HF_SPLIT` respective)?"
        )

    if hf_settings.enrichment is not None:
        # The enrichment is loaded with the same subset/split as the main
        # dataset but with its own revision.
        ds_enrichment = datasets.load_dataset(
            hf_settings.enrichment,
            hf_settings.subset,
            split=hf_settings.split,
            revision=hf_settings.enrichment_revision,
        )
        # Column-wise concatenation requires both datasets to have the same
        # number of rows.
        if len(ds_enrichment) != len(ds):
            raise RuntimeError(
                f"Length of the enrichment dataset ({len(ds_enrichment)}) "
                f"mismatches length of the original dataset ({len(ds)})"
            )
        ds = datasets.concatenate_datasets([ds, ds_enrichment], split=ds.split, axis=1)

    # Mark sequence columns whose name contains "embedding" as embeddings for
    # Spotlight; all other columns get no explicit dtype here.
    dtypes = {
        col: spotlight.dtypes.embedding_dtype
        for col in ds.column_names
        if "embedding" in col and isinstance(ds.features[col], datasets.Sequence)
    }

    # Nested split: table, similarity map/scatter plot and histogram/metric
    # tabs in the inner split, with the inspector tab next to it.
    layout = spotlight.layout.split(
        spotlight.layout.split(
            spotlight.layout.tab(spotlight.layout.table(), weight=4),
            spotlight.layout.tab(
                spotlight.layout.similaritymap(),
                spotlight.layout.scatterplot(),
                weight=3,
            ),
            spotlight.layout.tab(
                spotlight.layout.histogram(), spotlight.layout.metric(), weight=3
            ),
            weight=5,
        ),
        spotlight.layout.tab(spotlight.layout.inspector(), weight=3),
        orientation="vertical",
    )

    print(f"Serving Hugging Face dataset {hf_settings}.")
    # Listen on all interfaces at port 7860 and block forever without opening
    # a browser.
    spotlight.show(
        ds,
        host="0.0.0.0",
        port=7860,
        wait="forever",
        dtype=dtypes,
        layout=layout,
        analyze=True,
        no_browser=True,
    )