Spaces:

HuggingFaceM4
/

obelics_visualization

Running on CPU Upgrade

App Files Community

HugoLaurencon committed on May 30, 2023

Commit

963c572

•

1 Parent(s): 53ffb10

update

Browse files

Files changed (7) hide show

app.py +7 -20
web_docs_final/data-00000-of-00002.arrow +0 -3
web_docs_final/data-00001-of-00002.arrow +0 -3
web_docs_final/state.json +0 -3
web_docs_final/dataset_info.json → web_docs_final_replaceimgbyurl/data-00000-of-00001.arrow +2 -2
web_docs_final_replaceimgbyurl/dataset_info.json +30 -0
web_docs_final_replaceimgbyurl/state.json +18 -0

app.py CHANGED Viewed

@@ -1,5 +1,4 @@
 import json
-import random
 import streamlit as st
 from datasets import load_from_disk
@@ -19,29 +18,15 @@ class Visualization:
         st.title("Visualization of web documents")
     def load_dataset(self):
-        st.header("Select the size of the dataset")
         self.dataset = load_from_disk(self.path_web_documents_dataset)
-        opt_sizes = ["100", "300", "1000", "3000"]
-        size_dataset = st.selectbox(
-            "Select the size of the dataset",
-            options=opt_sizes,
-        )
-        self.dataset = self.dataset.select(range(int(size_dataset)))
     def choose_document(self):
         st.header("Choose a document")
-        if st.button("Select a random document"):
-            dct_idx = random.randint(a=0, b=self.dataset.num_rows - 1)
-        else:
-            dct_idx = 0
         idx = st.number_input(
             f"Select a document among the first {self.dataset.num_rows} ones",
             min_value=0,
             max_value=self.dataset.num_rows - 1,
-            value=dct_idx,
             step=1,
             help=f"Index between 0 and {self.dataset.num_rows-1}",
         )
@@ -54,13 +39,15 @@ class Visualization:
         metadata = json.loads(self.current_doc["metadata"])
         for text, image, meta in zip(texts, images, metadata):
             if text:
-                st.text(f"{text}\n\n")
             elif image:
-                st.markdown(f"![img]({meta['src']})\n\n")
 if __name__ == "__main__":
     st.set_page_config(layout="wide")
-    path_web_documents_dataset = "./web_docs_final"  # Find at s3://m4-datasets/trash/web_docs_final/
     visualization = Visualization(path_web_documents_dataset=path_web_documents_dataset)
     visualization.visualization()

 import json
 import streamlit as st
 from datasets import load_from_disk
         st.title("Visualization of web documents")
     def load_dataset(self):
         self.dataset = load_from_disk(self.path_web_documents_dataset)
     def choose_document(self):
         st.header("Choose a document")
         idx = st.number_input(
             f"Select a document among the first {self.dataset.num_rows} ones",
             min_value=0,
             max_value=self.dataset.num_rows - 1,
+            value=0,
             step=1,
             help=f"Index between 0 and {self.dataset.num_rows-1}",
         )
         metadata = json.loads(self.current_doc["metadata"])
         for text, image, meta in zip(texts, images, metadata):
             if text:
+                display_text = f"{text}\n".replace("\n", "<br>") # .replace(" ", "&nbsp;") Preserves white spaces, but creates text outside the width of the window
+                st.markdown(f"<pre>{display_text}</pre>", unsafe_allow_html=True)
             elif image:
+                st.markdown(f'<img src="{meta["src"]}" style="max-width: 1000px; height: auto;" />', unsafe_allow_html=True)
+                st.text("\n")
 if __name__ == "__main__":
     st.set_page_config(layout="wide")
+    path_web_documents_dataset = "./web_docs_final_replaceimgbyurl"
     visualization = Visualization(path_web_documents_dataset=path_web_documents_dataset)
     visualization.visualization()

web_docs_final/data-00000-of-00002.arrow DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:0e5527454919de59e213c2f3ea7e7eab79a0ec9e32e6cd60987c89edc6768028
-size 315163440

web_docs_final/data-00001-of-00002.arrow DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:b6a5b9f8f8155a2fb407afd071f2c4e72e93a030ae60261add15840583bcf5a1
-size 264390800

web_docs_final/state.json DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:9bf0df57986410f1c06732db30d5cd49675992821a69824661ae1cafc223114e
-size 393

web_docs_final/dataset_info.json → web_docs_final_replaceimgbyurl/data-00000-of-00001.arrow RENAMED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d6e43ea6cfda9aa9b5c77a68bee33d75e1ed4836545d9a5e398211de392266a8
-size 556

 version https://git-lfs.github.com/spec/v1
+oid sha256:ab183c2859d5d97a347d3035d96c988a31c460d945bf8f1df2a56488f3525b56
+size 5574096

web_docs_final_replaceimgbyurl/dataset_info.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "citation": "",
+  "description": "",
+  "features": {
+    "images": {
+      "feature": {
+        "dtype": "string",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    },
+    "metadata": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "general_metadata": {
+      "dtype": "string",
+      "_type": "Value"
+    },
+    "texts": {
+      "feature": {
+        "dtype": "string",
+        "_type": "Value"
+      },
+      "_type": "Sequence"
+    }
+  },
+  "homepage": "",
+  "license": ""
+}

web_docs_final_replaceimgbyurl/state.json ADDED Viewed

	@@ -0,0 +1,18 @@

+{
+  "_data_files": [
+    {
+      "filename": "data-00000-of-00001.arrow"
+    }
+  ],
+  "_fingerprint": "39738be005ac73bb",
+  "_format_columns": [
+    "general_metadata",
+    "images",
+    "metadata",
+    "texts"
+  ],
+  "_format_kwargs": {},
+  "_format_type": null,
+  "_output_all_columns": false,
+  "_split": null
+}