File size: 3,973 Bytes
e943a05 3acc11d e943a05 4606755 e943a05 e3af794 e943a05 4606755 e943a05 e3af794 4606755 e943a05 4606755 e943a05 e3af794 e943a05 4606755 e943a05 5071731 e943a05 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 |
import { searchWeb } from "$lib/server/websearch/searchWeb";
import type { Message } from "$lib/types/Message";
import type { WebSearch, WebSearchSource } from "$lib/types/WebSearch";
import { generateQuery } from "$lib/server/websearch/generateQuery";
import { parseWeb } from "$lib/server/websearch/parseWeb";
import { chunk } from "$lib/utils/chunk";
import {
MAX_SEQ_LEN as CHUNK_CAR_LEN,
findSimilarSentences,
} from "$lib/server/websearch/sentenceSimilarity";
import type { Conversation } from "$lib/types/Conversation";
import type { MessageUpdate } from "$lib/types/MessageUpdate";
import { getWebSearchProvider } from "./searchWeb";
const MAX_N_PAGES_SCRAPE = 10 as const;
const MAX_N_PAGES_EMBED = 5 as const;
const DOMAIN_BLOCKLIST = ["youtube.com", "twitter.com"];
export async function runWebSearch(
conv: Conversation,
prompt: string,
updatePad: (upd: MessageUpdate) => void
) {
const messages = (() => {
return [...conv.messages, { content: prompt, from: "user", id: crypto.randomUUID() }];
})() satisfies Message[];
const webSearch: WebSearch = {
prompt: prompt,
searchQuery: "",
results: [],
context: "",
contextSources: [],
createdAt: new Date(),
updatedAt: new Date(),
};
function appendUpdate(message: string, args?: string[], type?: "error" | "update") {
updatePad({ type: "webSearch", messageType: type ?? "update", message: message, args: args });
}
try {
webSearch.searchQuery = await generateQuery(messages);
const searchProvider = getWebSearchProvider();
appendUpdate(`Searching ${searchProvider}`, [webSearch.searchQuery]);
const results = await searchWeb(webSearch.searchQuery);
webSearch.results =
(results.organic_results &&
results.organic_results.map((el: { title?: string; link: string; text?: string }) => {
const { title, link, text } = el;
const { hostname } = new URL(link);
return { title, link, hostname, text };
})) ??
[];
webSearch.results = webSearch.results
.filter(({ link }) => !DOMAIN_BLOCKLIST.some((el) => link.includes(el))) // filter out blocklist links
.slice(0, MAX_N_PAGES_SCRAPE); // limit to first 10 links only
let paragraphChunks: { source: WebSearchSource; text: string }[] = [];
if (webSearch.results.length > 0) {
appendUpdate("Browsing results");
const promises = webSearch.results.map(async (result) => {
const { link } = result;
let text = result.text ?? "";
if (!text) {
try {
text = await parseWeb(link);
appendUpdate("Browsing webpage", [link]);
} catch (e) {
// ignore errors
}
}
const MAX_N_CHUNKS = 100;
const texts = chunk(text, CHUNK_CAR_LEN).slice(0, MAX_N_CHUNKS);
return texts.map((t) => ({ source: result, text: t }));
});
const nestedParagraphChunks = (await Promise.all(promises)).slice(0, MAX_N_PAGES_EMBED);
paragraphChunks = nestedParagraphChunks.flat();
if (!paragraphChunks.length) {
throw new Error("No text found on the first 5 results");
}
} else {
throw new Error("No results found for this search query");
}
appendUpdate("Extracting relevant information");
const topKClosestParagraphs = 8;
const texts = paragraphChunks.map(({ text }) => text);
const indices = await findSimilarSentences(prompt, texts, {
topK: topKClosestParagraphs,
});
webSearch.context = indices.map((idx) => texts[idx]).join("");
const usedSources = new Set<string>();
for (const idx of indices) {
const { source } = paragraphChunks[idx];
if (!usedSources.has(source.link)) {
usedSources.add(source.link);
webSearch.contextSources.push(source);
}
}
updatePad({
type: "webSearch",
messageType: "sources",
message: "sources",
sources: webSearch.contextSources,
});
} catch (searchError) {
if (searchError instanceof Error) {
appendUpdate(
"An error occurred with the web search",
[JSON.stringify(searchError.message)],
"error"
);
}
}
return webSearch;
}
|