Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
HugoLaurencon
committed on
Commit
•
963c572
1
Parent(s):
53ffb10
update
Browse files
- app.py +7 -20
- web_docs_final/data-00000-of-00002.arrow +0 -3
- web_docs_final/data-00001-of-00002.arrow +0 -3
- web_docs_final/state.json +0 -3
- web_docs_final/dataset_info.json → web_docs_final_replaceimgbyurl/data-00000-of-00001.arrow +2 -2
- web_docs_final_replaceimgbyurl/dataset_info.json +30 -0
- web_docs_final_replaceimgbyurl/state.json +18 -0
app.py
CHANGED
@@ -1,5 +1,4 @@
|
|
1 |
import json
|
2 |
-
import random
|
3 |
|
4 |
import streamlit as st
|
5 |
from datasets import load_from_disk
|
@@ -19,29 +18,15 @@ class Visualization:
|
|
19 |
st.title("Visualization of web documents")
|
20 |
|
21 |
def load_dataset(self):
|
22 |
-
st.header("Select the size of the dataset")
|
23 |
-
|
24 |
self.dataset = load_from_disk(self.path_web_documents_dataset)
|
25 |
|
26 |
-
opt_sizes = ["100", "300", "1000", "3000"]
|
27 |
-
size_dataset = st.selectbox(
|
28 |
-
"Select the size of the dataset",
|
29 |
-
options=opt_sizes,
|
30 |
-
)
|
31 |
-
|
32 |
-
self.dataset = self.dataset.select(range(int(size_dataset)))
|
33 |
-
|
34 |
def choose_document(self):
|
35 |
st.header("Choose a document")
|
36 |
-
if st.button("Select a random document"):
|
37 |
-
dct_idx = random.randint(a=0, b=self.dataset.num_rows - 1)
|
38 |
-
else:
|
39 |
-
dct_idx = 0
|
40 |
idx = st.number_input(
|
41 |
f"Select a document among the first {self.dataset.num_rows} ones",
|
42 |
min_value=0,
|
43 |
max_value=self.dataset.num_rows - 1,
|
44 |
-
value=
|
45 |
step=1,
|
46 |
help=f"Index between 0 and {self.dataset.num_rows-1}",
|
47 |
)
|
@@ -54,13 +39,15 @@ class Visualization:
|
|
54 |
metadata = json.loads(self.current_doc["metadata"])
|
55 |
for text, image, meta in zip(texts, images, metadata):
|
56 |
if text:
|
57 |
-
|
|
|
58 |
elif image:
|
59 |
-
st.markdown(f"
|
60 |
-
|
|
|
61 |
|
62 |
if __name__ == "__main__":
|
63 |
st.set_page_config(layout="wide")
|
64 |
-
path_web_documents_dataset = "./
|
65 |
visualization = Visualization(path_web_documents_dataset=path_web_documents_dataset)
|
66 |
visualization.visualization()
|
|
|
1 |
import json
|
|
|
2 |
|
3 |
import streamlit as st
|
4 |
from datasets import load_from_disk
|
|
|
18 |
st.title("Visualization of web documents")
|
19 |
|
20 |
def load_dataset(self):
|
|
|
|
|
21 |
self.dataset = load_from_disk(self.path_web_documents_dataset)
|
22 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
23 |
def choose_document(self):
|
24 |
st.header("Choose a document")
|
|
|
|
|
|
|
|
|
25 |
idx = st.number_input(
|
26 |
f"Select a document among the first {self.dataset.num_rows} ones",
|
27 |
min_value=0,
|
28 |
max_value=self.dataset.num_rows - 1,
|
29 |
+
value=0,
|
30 |
step=1,
|
31 |
help=f"Index between 0 and {self.dataset.num_rows-1}",
|
32 |
)
|
|
|
39 |
metadata = json.loads(self.current_doc["metadata"])
|
40 |
for text, image, meta in zip(texts, images, metadata):
|
41 |
if text:
|
42 |
+
display_text = f"{text}\n".replace("\n", "<br>") # .replace(" ", " ") Preserves white spaces, but creates text outside the width of the window
|
43 |
+
st.markdown(f"<pre>{display_text}</pre>", unsafe_allow_html=True)
|
44 |
elif image:
|
45 |
+
st.markdown(f'<img src="{meta["src"]}" style="max-width: 1000px; height: auto;" />', unsafe_allow_html=True)
|
46 |
+
st.text("\n")
|
47 |
+
|
48 |
|
49 |
if __name__ == "__main__":
|
50 |
st.set_page_config(layout="wide")
|
51 |
+
path_web_documents_dataset = "./web_docs_final_replaceimgbyurl"
|
52 |
visualization = Visualization(path_web_documents_dataset=path_web_documents_dataset)
|
53 |
visualization.visualization()
|
web_docs_final/data-00000-of-00002.arrow
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:0e5527454919de59e213c2f3ea7e7eab79a0ec9e32e6cd60987c89edc6768028
|
3 |
-
size 315163440
|
|
|
|
|
|
|
|
web_docs_final/data-00001-of-00002.arrow
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:b6a5b9f8f8155a2fb407afd071f2c4e72e93a030ae60261add15840583bcf5a1
|
3 |
-
size 264390800
|
|
|
|
|
|
|
|
web_docs_final/state.json
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:9bf0df57986410f1c06732db30d5cd49675992821a69824661ae1cafc223114e
|
3 |
-
size 393
|
|
|
|
|
|
|
|
web_docs_final/dataset_info.json → web_docs_final_replaceimgbyurl/data-00000-of-00001.arrow
RENAMED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ab183c2859d5d97a347d3035d96c988a31c460d945bf8f1df2a56488f3525b56
|
3 |
+
size 5574096
|
web_docs_final_replaceimgbyurl/dataset_info.json
ADDED
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"citation": "",
|
3 |
+
"description": "",
|
4 |
+
"features": {
|
5 |
+
"images": {
|
6 |
+
"feature": {
|
7 |
+
"dtype": "string",
|
8 |
+
"_type": "Value"
|
9 |
+
},
|
10 |
+
"_type": "Sequence"
|
11 |
+
},
|
12 |
+
"metadata": {
|
13 |
+
"dtype": "string",
|
14 |
+
"_type": "Value"
|
15 |
+
},
|
16 |
+
"general_metadata": {
|
17 |
+
"dtype": "string",
|
18 |
+
"_type": "Value"
|
19 |
+
},
|
20 |
+
"texts": {
|
21 |
+
"feature": {
|
22 |
+
"dtype": "string",
|
23 |
+
"_type": "Value"
|
24 |
+
},
|
25 |
+
"_type": "Sequence"
|
26 |
+
}
|
27 |
+
},
|
28 |
+
"homepage": "",
|
29 |
+
"license": ""
|
30 |
+
}
|
web_docs_final_replaceimgbyurl/state.json
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_data_files": [
|
3 |
+
{
|
4 |
+
"filename": "data-00000-of-00001.arrow"
|
5 |
+
}
|
6 |
+
],
|
7 |
+
"_fingerprint": "39738be005ac73bb",
|
8 |
+
"_format_columns": [
|
9 |
+
"general_metadata",
|
10 |
+
"images",
|
11 |
+
"metadata",
|
12 |
+
"texts"
|
13 |
+
],
|
14 |
+
"_format_kwargs": {},
|
15 |
+
"_format_type": null,
|
16 |
+
"_output_all_columns": false,
|
17 |
+
"_split": null
|
18 |
+
}
|