HugoLaurencon committed on
Commit
963c572
•
1 Parent(s): 53ffb10
app.py CHANGED
@@ -1,5 +1,4 @@
1
  import json
2
- import random
3
 
4
  import streamlit as st
5
  from datasets import load_from_disk
@@ -19,29 +18,15 @@ class Visualization:
19
  st.title("Visualization of web documents")
20
 
21
  def load_dataset(self):
22
- st.header("Select the size of the dataset")
23
-
24
  self.dataset = load_from_disk(self.path_web_documents_dataset)
25
 
26
- opt_sizes = ["100", "300", "1000", "3000"]
27
- size_dataset = st.selectbox(
28
- "Select the size of the dataset",
29
- options=opt_sizes,
30
- )
31
-
32
- self.dataset = self.dataset.select(range(int(size_dataset)))
33
-
34
  def choose_document(self):
35
  st.header("Choose a document")
36
- if st.button("Select a random document"):
37
- dct_idx = random.randint(a=0, b=self.dataset.num_rows - 1)
38
- else:
39
- dct_idx = 0
40
  idx = st.number_input(
41
  f"Select a document among the first {self.dataset.num_rows} ones",
42
  min_value=0,
43
  max_value=self.dataset.num_rows - 1,
44
- value=dct_idx,
45
  step=1,
46
  help=f"Index between 0 and {self.dataset.num_rows-1}",
47
  )
@@ -54,13 +39,15 @@ class Visualization:
54
  metadata = json.loads(self.current_doc["metadata"])
55
  for text, image, meta in zip(texts, images, metadata):
56
  if text:
57
- st.text(f"{text}\n\n")
 
58
  elif image:
59
- st.markdown(f"![img]({meta['src']})\n\n")
60
-
 
61
 
62
  if __name__ == "__main__":
63
  st.set_page_config(layout="wide")
64
- path_web_documents_dataset = "./web_docs_final" # Find at s3://m4-datasets/trash/web_docs_final/
65
  visualization = Visualization(path_web_documents_dataset=path_web_documents_dataset)
66
  visualization.visualization()
 
1
  import json
 
2
 
3
  import streamlit as st
4
  from datasets import load_from_disk
 
18
  st.title("Visualization of web documents")
19
 
20
  def load_dataset(self):
 
 
21
  self.dataset = load_from_disk(self.path_web_documents_dataset)
22
 
 
 
 
 
 
 
 
 
23
  def choose_document(self):
24
  st.header("Choose a document")
 
 
 
 
25
  idx = st.number_input(
26
  f"Select a document among the first {self.dataset.num_rows} ones",
27
  min_value=0,
28
  max_value=self.dataset.num_rows - 1,
29
+ value=0,
30
  step=1,
31
  help=f"Index between 0 and {self.dataset.num_rows-1}",
32
  )
 
39
  metadata = json.loads(self.current_doc["metadata"])
40
  for text, image, meta in zip(texts, images, metadata):
41
  if text:
42
+ display_text = f"{text}\n".replace("\n", "<br>") # .replace(" ", "&nbsp;") Preserves white spaces, but creates text outside the width of the window
43
+ st.markdown(f"<pre>{display_text}</pre>", unsafe_allow_html=True)
44
  elif image:
45
+ st.markdown(f'<img src="{meta["src"]}" style="max-width: 1000px; height: auto;" />', unsafe_allow_html=True)
46
+ st.text("\n")
47
+
48
 
49
  if __name__ == "__main__":
50
  st.set_page_config(layout="wide")
51
+ path_web_documents_dataset = "./web_docs_final_replaceimgbyurl"
52
  visualization = Visualization(path_web_documents_dataset=path_web_documents_dataset)
53
  visualization.visualization()
web_docs_final/data-00000-of-00002.arrow DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:0e5527454919de59e213c2f3ea7e7eab79a0ec9e32e6cd60987c89edc6768028
3
- size 315163440
 
 
 
 
web_docs_final/data-00001-of-00002.arrow DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b6a5b9f8f8155a2fb407afd071f2c4e72e93a030ae60261add15840583bcf5a1
3
- size 264390800
 
 
 
 
web_docs_final/state.json DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:9bf0df57986410f1c06732db30d5cd49675992821a69824661ae1cafc223114e
3
- size 393
 
 
 
 
web_docs_final/dataset_info.json → web_docs_final_replaceimgbyurl/data-00000-of-00001.arrow RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d6e43ea6cfda9aa9b5c77a68bee33d75e1ed4836545d9a5e398211de392266a8
3
- size 556
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab183c2859d5d97a347d3035d96c988a31c460d945bf8f1df2a56488f3525b56
3
+ size 5574096
web_docs_final_replaceimgbyurl/dataset_info.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "citation": "",
3
+ "description": "",
4
+ "features": {
5
+ "images": {
6
+ "feature": {
7
+ "dtype": "string",
8
+ "_type": "Value"
9
+ },
10
+ "_type": "Sequence"
11
+ },
12
+ "metadata": {
13
+ "dtype": "string",
14
+ "_type": "Value"
15
+ },
16
+ "general_metadata": {
17
+ "dtype": "string",
18
+ "_type": "Value"
19
+ },
20
+ "texts": {
21
+ "feature": {
22
+ "dtype": "string",
23
+ "_type": "Value"
24
+ },
25
+ "_type": "Sequence"
26
+ }
27
+ },
28
+ "homepage": "",
29
+ "license": ""
30
+ }
web_docs_final_replaceimgbyurl/state.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_data_files": [
3
+ {
4
+ "filename": "data-00000-of-00001.arrow"
5
+ }
6
+ ],
7
+ "_fingerprint": "39738be005ac73bb",
8
+ "_format_columns": [
9
+ "general_metadata",
10
+ "images",
11
+ "metadata",
12
+ "texts"
13
+ ],
14
+ "_format_kwargs": {},
15
+ "_format_type": null,
16
+ "_output_all_columns": false,
17
+ "_split": null
18
+ }