HugoLaurencon committed on
Commit
53ffb10
•
1 Parent(s): 2c2f259

web docs viz

Browse files
.gitattributes CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
32
  *.zip filter=lfs diff=lfs merge=lfs -text
33
  *.zst filter=lfs diff=lfs merge=lfs -text
34
  *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ web_docs_final/** filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ *DS_Store
app.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import random
3
+
4
+ import streamlit as st
5
+ from datasets import load_from_disk
6
+
7
+
8
class Visualization:
    """Streamlit page for browsing documents of a web-documents dataset on disk.

    Each document is read as a mapping with a ``"texts"`` sequence, an
    ``"images"`` sequence and a JSON-encoded ``"metadata"`` string; the three
    are walked in lockstep when a document is rendered.
    """

    def __init__(self, path_web_documents_dataset: str) -> None:
        """Store the dataset location; nothing is loaded at construction time.

        Args:
            path_web_documents_dataset: Path handed to ``load_from_disk``.
        """
        self.path_web_documents_dataset = path_web_documents_dataset

    def visualization(self) -> None:
        """Render the full page: title, size selector, document picker, document."""
        self.set_title()
        self.load_dataset()
        self.choose_document()
        self.display_document()

    def set_title(self) -> None:
        """Write the page title."""
        st.title("Visualization of web documents")

    def load_dataset(self) -> None:
        """Load the dataset from disk and keep only its first N rows.

        N is picked by the user from a fixed list of sizes and is clamped to
        the number of rows actually available, so a dataset smaller than the
        selected size does not make ``select`` fail on out-of-range indices.
        """
        st.header("Select the size of the dataset")

        self.dataset = load_from_disk(self.path_web_documents_dataset)

        opt_sizes = ["100", "300", "1000", "3000"]
        size_dataset = st.selectbox(
            "Select the size of the dataset",
            options=opt_sizes,
        )

        # Never request more rows than the dataset holds.
        num_kept = min(int(size_dataset), self.dataset.num_rows)
        self.dataset = self.dataset.select(range(num_kept))

    def choose_document(self) -> None:
        """Let the user pick a document index and store it in ``current_doc``.

        The number input defaults to 0, or to a random valid index on the run
        in which the "random document" button was clicked. When the dataset is
        empty there is no valid index, so a warning is shown and
        ``current_doc`` is set to ``None`` instead.
        """
        st.header("Choose a document")

        num_rows = self.dataset.num_rows
        if num_rows == 0:
            # randint(0, -1) and a number input with max < min would both fail.
            st.warning("The dataset is empty, there is no document to display.")
            self.current_doc = None
            return

        last_idx = num_rows - 1
        if st.button("Select a random document"):
            dct_idx = random.randint(a=0, b=last_idx)
        else:
            dct_idx = 0

        idx = st.number_input(
            f"Select a document among the first {num_rows} ones",
            min_value=0,
            max_value=last_idx,
            value=dct_idx,
            step=1,
            help=f"Index between 0 and {last_idx}",
        )
        self.current_doc = self.dataset[idx]

    def display_document(self) -> None:
        """Render the selected document, item by item.

        For each aligned (text, image, metadata) triple, a truthy text is
        printed verbatim; otherwise a truthy image is rendered as a markdown
        image pointing at the ``src`` field of its metadata entry. Does
        nothing when no document is selected (``current_doc`` is ``None``).
        """
        if self.current_doc is None:
            return

        st.header("Document")
        texts = self.current_doc["texts"]
        images = self.current_doc["images"]
        metadata = json.loads(self.current_doc["metadata"])
        # NOTE(review): presumably each position holds either a text or an
        # image, with metadata entries aligned to them -- confirm against the
        # dataset schema.
        for text, image, meta in zip(texts, images, metadata):
            if text:
                st.text(f"{text}\n\n")
            elif image:
                st.markdown(f"![img]({meta['src']})\n\n")
60
+
61
+
62
if __name__ == "__main__":
    # Local dataset directory; the data can be found at
    # s3://m4-datasets/trash/web_docs_final/
    path_web_documents_dataset = "./web_docs_final"

    # Page configuration is issued before any other Streamlit call.
    st.set_page_config(layout="wide")

    visualization = Visualization(path_web_documents_dataset)
    visualization.visualization()
web_docs_final/data-00000-of-00002.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e5527454919de59e213c2f3ea7e7eab79a0ec9e32e6cd60987c89edc6768028
3
+ size 315163440
web_docs_final/data-00001-of-00002.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b6a5b9f8f8155a2fb407afd071f2c4e72e93a030ae60261add15840583bcf5a1
3
+ size 264390800
web_docs_final/dataset_info.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d6e43ea6cfda9aa9b5c77a68bee33d75e1ed4836545d9a5e398211de392266a8
3
+ size 556
web_docs_final/state.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9bf0df57986410f1c06732db30d5cd49675992821a69824661ae1cafc223114e
3
+ size 393