Spaces:

HuggingFaceM4
/

obelics_visualization

Running on CPU Upgrade

App Files Files Community

HugoLaurencon committed on May 12, 2023

Commit

53ffb10

•

1 Parent(s): 2c2f259

web docs viz

Browse files

Files changed (7) hide show

.gitattributes +1 -0
.gitignore +1 -0
app.py +66 -0
web_docs_final/data-00000-of-00002.arrow +3 -0
web_docs_final/data-00001-of-00002.arrow +3 -0
web_docs_final/dataset_info.json +3 -0
web_docs_final/state.json +3 -0

.gitattributes CHANGED Viewed

@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+web_docs_final/** filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ *DS_Store

app.py ADDED Viewed

	@@ -0,0 +1,66 @@

+import json
+import random
+import streamlit as st
+from datasets import load_from_disk
+class Visualization:
+    def __init__(self, path_web_documents_dataset):
+        self.path_web_documents_dataset = path_web_documents_dataset
+    def visualization(self):
+        self.set_title()
+        self.load_dataset()
+        self.choose_document()
+        self.display_document()
+    def set_title(self):
+        st.title("Visualization of web documents")
+    def load_dataset(self):
+        st.header("Select the size of the dataset")
+        self.dataset = load_from_disk(self.path_web_documents_dataset)
+        opt_sizes = ["100", "300", "1000", "3000"]
+        size_dataset = st.selectbox(
+            "Select the size of the dataset",
+            options=opt_sizes,
+        )
+        self.dataset = self.dataset.select(range(int(size_dataset)))
+    def choose_document(self):
+        st.header("Choose a document")
+        if st.button("Select a random document"):
+            dct_idx = random.randint(a=0, b=self.dataset.num_rows - 1)
+        else:
+            dct_idx = 0
+        idx = st.number_input(
+            f"Select a document among the first {self.dataset.num_rows} ones",
+            min_value=0,
+            max_value=self.dataset.num_rows - 1,
+            value=dct_idx,
+            step=1,
+            help=f"Index between 0 and {self.dataset.num_rows-1}",
+        )
+        self.current_doc = self.dataset[idx]
+    def display_document(self):
+        st.header("Document")
+        texts = self.current_doc["texts"]
+        images = self.current_doc["images"]
+        metadata = json.loads(self.current_doc["metadata"])
+        for text, image, meta in zip(texts, images, metadata):
+            if text:
+                st.text(f"{text}\n\n")
+            elif image:
+                st.markdown(f"![img]({meta['src']})\n\n")
+if __name__ == "__main__":
+    st.set_page_config(layout="wide")
+    path_web_documents_dataset = "./web_docs_final"  # Find at s3://m4-datasets/trash/web_docs_final/
+    visualization = Visualization(path_web_documents_dataset=path_web_documents_dataset)
+    visualization.visualization()

web_docs_final/data-00000-of-00002.arrow ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0e5527454919de59e213c2f3ea7e7eab79a0ec9e32e6cd60987c89edc6768028
+size 315163440

web_docs_final/data-00001-of-00002.arrow ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b6a5b9f8f8155a2fb407afd071f2c4e72e93a030ae60261add15840583bcf5a1
+size 264390800

web_docs_final/dataset_info.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d6e43ea6cfda9aa9b5c77a68bee33d75e1ed4836545d9a5e398211de392266a8
+size 556

web_docs_final/state.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9bf0df57986410f1c06732db30d5cd49675992821a69824661ae1cafc223114e
+size 393