Liam Dyer committed
Commit ec76910
1 Parent(s): f5f6563

kerfuffles

Files changed (1):
  1. app.py  +10 -10
app.py CHANGED
@@ -13,15 +13,6 @@ model = SentenceTransformer("Snowflake/snowflake-arctic-embed-m")
 model.to(device="cuda")


-def chunk(text, max_length=512):
-    chunks = []
-    while len(text) > max_length:
-        chunks.append(text[:max_length])
-        text = text[max_length:]
-    chunks.append(text)
-    return chunks
-
-
 @spaces.GPU
 def embed(queries, chunks) -> dict[str, list[tuple[str, float]]]:
     query_embeddings = model.encode(queries, prompt_name="query")
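
Only the signature and first line of embed are visible in this hunk; the rest of its body is unchanged and therefore not shown. As a rough, assumed sketch (not the Space's actual code), the declared return type dict[str, list[tuple[str, float]]] could be produced by also encoding the chunks and scoring them against each query with cosine similarity:

# Hypothetical sketch only -- the diff shows just the first line of embed().
# Everything past model.encode(queries, prompt_name="query") is an assumption,
# and embed_sketch is a made-up name, not the Space's function.
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("Snowflake/snowflake-arctic-embed-m")

def embed_sketch(queries: list[str], chunks: list[str]) -> dict[str, list[tuple[str, float]]]:
    query_embeddings = model.encode(queries, prompt_name="query")  # line shown in the diff
    chunk_embeddings = model.encode(chunks)                        # assumed: plain passage encoding
    scores = util.cos_sim(query_embeddings, chunk_embeddings)      # (num_queries, num_chunks) similarity matrix
    return {
        query: [(chunk, float(score)) for chunk, score in zip(chunks, row)]
        for query, row in zip(queries, scores)
    }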
@@ -118,6 +109,15 @@ def convert(input_file) -> str:
     return convert_pandoc(input_file, input_file)


+def chunk_to_length(text, max_length=512):
+    chunks = []
+    while len(text) > max_length:
+        chunks.append(text[:max_length])
+        text = text[max_length:]
+    chunks.append(text)
+    return chunks
+
+
 @spaces.GPU
 def predict(queries, documents, max_characters) -> list[list[str]]:
     queries = queries.split("\n")
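
The relocated helper (renamed from chunk to chunk_to_length) is a plain character-based splitter: it peels off consecutive max_length-character slices and keeps whatever is left over as a final, shorter chunk. A quick illustration of its behavior:

# Behavior of chunk_to_length as defined in this commit.
doc = "x" * 1100
pieces = chunk_to_length(doc, 512)
print([len(p) for p in pieces])  # [512, 512, 76]

# Text at or under max_length comes back as a single chunk.
print(len(chunk_to_length("short text", 512)))  # 1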
 
@@ -131,7 +131,7 @@ def predict(queries, documents, max_characters) -> list[list[str]]:
         return [[doc] for doc, _ in converted_docs]

     # Embed the documents in 512 character chunks
-    chunked_docs = [chunk(doc, 512) for doc in converted_docs]
+    chunked_docs = [chunk_to_length(doc, 512) for doc in converted_docs]
     embedded_docs = [embed(queries, chunks) for chunks in chunked_docs]

     # Get a structure like {query: [(doc_idx, chunk_idx, score), (doc_idx, chunk_idx, score), ...]}
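
The code that actually fills the {query: [(doc_idx, chunk_idx, score), ...]} structure lies outside this hunk. Assuming embed returns, for each document, a mapping from query to (chunk, score) pairs in chunk order, one plausible (hypothetical) way to flatten embedded_docs into that shape is:

# Hypothetical continuation, not shown in the diff: flatten the per-document
# results into {query: [(doc_idx, chunk_idx, score), ...]}, best scores first.
scores_by_query: dict[str, list[tuple[int, int, float]]] = {}
for doc_idx, doc_result in enumerate(embedded_docs):
    for query, chunk_scores in doc_result.items():
        for chunk_idx, (_chunk, score) in enumerate(chunk_scores):
            scores_by_query.setdefault(query, []).append((doc_idx, chunk_idx, score))
for ranked in scores_by_query.values():
    ranked.sort(key=lambda item: item[2], reverse=True)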