Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
HugoLaurencon
committed on
Commit
•
53ffb10
1
Parent(s):
2c2f259
web docs viz
Browse files
- .gitattributes +1 -0
- .gitignore +1 -0
- app.py +66 -0
- web_docs_final/data-00000-of-00002.arrow +3 -0
- web_docs_final/data-00001-of-00002.arrow +3 -0
- web_docs_final/dataset_info.json +3 -0
- web_docs_final/state.json +3 -0
.gitattributes
CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
32 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
32 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
33 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
34 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
35 |
+
web_docs_final/** filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
*DS_Store
|
app.py
ADDED
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import random
|
3 |
+
|
4 |
+
import streamlit as st
|
5 |
+
from datasets import load_from_disk
|
6 |
+
|
7 |
+
|
8 |
+
class Visualization:
    """Streamlit page for browsing the documents of an on-disk dataset.

    The page is built top to bottom by :meth:`visualization`: a title, a
    selector limiting how many leading rows of the dataset are kept, a
    document picker (random button or explicit index), and finally the
    rendering of the chosen document.
    """

    def __init__(self, path_web_documents_dataset):
        """Store the path of the dataset directory to visualize.

        Args:
            path_web_documents_dataset: Path passed as-is to
                ``load_from_disk`` when the page is rendered.
        """
        self.path_web_documents_dataset = path_web_documents_dataset

    def visualization(self):
        """Render the whole page, one section after the other."""
        self.set_title()
        self.load_dataset()
        self.choose_document()
        self.display_document()

    def set_title(self):
        """Write the page title."""
        st.title("Visualization of web documents")

    @staticmethod
    def _clamp_size(requested_size, available_rows):
        """Return how many rows can actually be kept.

        Args:
            requested_size: Size chosen by the user (int or numeric string).
            available_rows: Number of rows present in the loaded dataset.

        Returns:
            ``requested_size`` capped to ``available_rows`` and never negative.
        """
        return max(0, min(int(requested_size), int(available_rows)))

    def load_dataset(self):
        """Load the dataset from disk and keep only its first N rows.

        N is picked by the user among a fixed list of sizes. The result is
        stored in ``self.dataset``.
        """
        st.header("Select the size of the dataset")

        self.dataset = load_from_disk(self.path_web_documents_dataset)

        opt_sizes = ["100", "300", "1000", "3000"]
        size_dataset = st.selectbox(
            "Select the size of the dataset",
            options=opt_sizes,
        )

        # Selecting more indices than the dataset holds raises an
        # out-of-range error, so cap the request to the real row count.
        num_kept = self._clamp_size(size_dataset, self.dataset.num_rows)
        self.dataset = self.dataset.select(range(num_kept))

    def choose_document(self):
        """Let the user pick a document and store it in ``self.current_doc``.

        The index input defaults to 0, or to a random valid index on the run
        where the "random document" button has just been pressed. When the
        dataset is empty, a warning is shown and ``self.current_doc`` is set
        to ``None``.
        """
        st.header("Choose a document")

        num_rows = self.dataset.num_rows
        if num_rows == 0:
            # With no rows, both random.randint(0, -1) and a number input
            # whose max_value is below its min_value would fail.
            st.warning("The dataset is empty, there is no document to display.")
            self.current_doc = None
            return

        last_idx = num_rows - 1
        if st.button("Select a random document"):
            dct_idx = random.randint(a=0, b=last_idx)
        else:
            dct_idx = 0

        idx = st.number_input(
            f"Select a document among the first {num_rows} ones",
            min_value=0,
            max_value=last_idx,
            value=dct_idx,
            step=1,
            help=f"Index between 0 and {last_idx}",
        )
        self.current_doc = self.dataset[int(idx)]

    def display_document(self):
        """Render the current document, item by item.

        ``texts``, ``images`` and the JSON-decoded ``metadata`` of the
        document are walked in parallel: a truthy text is printed verbatim,
        otherwise a truthy image is embedded through the ``src`` entry of its
        metadata. Nothing is rendered when no document is selected.
        """
        if getattr(self, "current_doc", None) is None:
            return

        st.header("Document")
        texts = self.current_doc["texts"]
        images = self.current_doc["images"]
        # The metadata column holds a JSON string that decodes to one entry
        # per item of the document.
        metadata = json.loads(self.current_doc["metadata"])
        for text, image, meta in zip(texts, images, metadata):
            if text:
                st.text(f"{text}\n\n")
            elif image:
                # NOTE(review): assumes image metadata always carries a
                # "src" URL; confirm against the dataset schema.
                st.markdown(f"![img]({meta['src']})\n\n")
|
60 |
+
|
61 |
+
|
62 |
+
if __name__ == "__main__":
    # Page-wide layout; done first, before any other Streamlit call below.
    st.set_page_config(layout="wide")

    # Local copy of the dataset.
    # Find at s3://m4-datasets/trash/web_docs_final/
    path_web_documents_dataset = "./web_docs_final"

    visualization = Visualization(
        path_web_documents_dataset=path_web_documents_dataset,
    )
    visualization.visualization()
|
web_docs_final/data-00000-of-00002.arrow
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0e5527454919de59e213c2f3ea7e7eab79a0ec9e32e6cd60987c89edc6768028
|
3 |
+
size 315163440
|
web_docs_final/data-00001-of-00002.arrow
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b6a5b9f8f8155a2fb407afd071f2c4e72e93a030ae60261add15840583bcf5a1
|
3 |
+
size 264390800
|
web_docs_final/dataset_info.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d6e43ea6cfda9aa9b5c77a68bee33d75e1ed4836545d9a5e398211de392266a8
|
3 |
+
size 556
|
web_docs_final/state.json
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9bf0df57986410f1c06732db30d5cd49675992821a69824661ae1cafc223114e
|
3 |
+
size 393
|