"""Extract ranked, categorised keywords from a chatbot query using a chat LLM."""

from collections.abc import Sequence
from typing import Any

from langchain.chat_models import ChatOpenAI
from langchain.output_parsers import NumberedListOutputParser
from langchain.prompts import ChatPromptTemplate

from utils import str_to_list

# Prompt sent to the chat model. ``{query}`` is the only template variable.
# The text is runtime data: it is reproduced verbatim (including the single
# newline before the fenced query), merely split across adjacent literals so
# the source stays readable.
query_template = (
    " You are a bi-lingual (french and english) linguistic teacher working at "
    "a top-tier university. We are conducting a research project that "
    "requires the extraction of keywords from chatbot queries. Below, you "
    "will find a query. Please identify and rank the three most important "
    "keywords or phrases (n-grams) based on their relevance to the main "
    "topic of the query. For each keyword or phrase, assign it to one of the "
    "following categories: "
    '["University / Company", "Research domain", "Country", "Name", "Other"]. '
    "An 'n-gram' refers to a contiguous sequence of words, where 'n' can be 1 "
    "for a single word, 2 for a pair of words, and so on, up to two words in "
    "length. Please ensure not to list more than three n-grams in total. "
    "Your expertise in linguistic analysis is crucial for the success of "
    "this project. Thank you for your contribution. Please attach your "
    "ranked list in the following format: "
    "1. Keyword/Phrase - Category "
    "2. Keyword/Phrase - Category "
    "3. Keyword/Phrase - Category "
    "You must be concise and don't need to justify your choices. \n"
    "``` {query} ``` "
)

# Shared parser that turns the model's "1. ... 2. ..." reply into a list of
# item strings.
output_parser = NumberedListOutputParser()
# Not used in this module; kept as a module-level name for importers.
format_instructions = output_parser.get_format_instructions()

# Separator the prompt asks the model to put between keyword and category.
_ITEM_SEPARATOR = " - "


class KeywordExtractor:
    """Callable that asks a chat model for keywords and filters by category.

    The instance builds a ``prompt | model`` chain once; each call sends one
    query through it and post-processes the numbered-list reply.
    """

    def __init__(self):
        """Create the chat model, the prompt and the chain joining them."""
        super().__init__()
        self.model = ChatOpenAI()
        self.prompt = ChatPromptTemplate.from_template(
            template=query_template,
        )
        # The numbered-list parser is deliberately not piped into the chain:
        # ``__call__`` parses ``output.content`` itself.
        self.chain = self.prompt | self.model

    def __call__(
        self,
        inputs: str,
        filter_categories: Sequence[str] = ("Research domain",),
    ) -> list[str]:
        """Return the keywords of *inputs* whose category is selected.

        Args:
            inputs: The chatbot query to analyse; substituted for ``{query}``
                in the prompt.
            filter_categories: Categories to keep. Only keywords the model
                labelled with one of these are returned.

        Returns:
            The matching keywords, in the order the model ranked them.
        """
        output = self.chain.invoke({"query": inputs})
        ranked_items = output_parser.parse(output.content)
        return self._select_keywords(ranked_items, filter_categories)

    @staticmethod
    def _select_keywords(
        ranked_items: Sequence[str],
        filter_categories: Sequence[str],
    ) -> list[str]:
        """Keep the keyword of each ``"keyword - category"`` item that matches.

        Items without the separator are ignored.
        """
        selected = []
        for item in ranked_items:
            if _ITEM_SEPARATOR not in item:
                continue
            # The category is the trailing field, so split from the right and
            # only once: a keyword that itself contains " - " must not break
            # the two-way unpacking.
            keyword, category = item.rsplit(_ITEM_SEPARATOR, 1)
            # Tolerate stray whitespace around either part of the model reply.
            if category.strip() in filter_categories:
                selected.append(keyword.strip())
        return selected