File size: 2,224 Bytes
68a9b68
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
from typing import Any

from langchain.chat_models import ChatOpenAI
from langchain.output_parsers import NumberedListOutputParser
from langchain.prompts import ChatPromptTemplate
from utils import str_to_list

# Prompt text handed to ChatPromptTemplate.from_template() by KeywordExtractor.
# `{query}` is the only placeholder; it is filled with the user query at call
# time. The prompt asks for lines shaped like "N. Keyword/Phrase - Category",
# which is the shape KeywordExtractor.__call__ relies on when it splits each
# parsed entry on " - ".
# NOTE: this string is sent to the model verbatim, so any edit to it (even
# whitespace) can change the model's output.
query_template = """
You are a bi-lingual (french and english) linguistic teacher working at a top-tier university.
We are conducting a research project that requires the extraction of keywords from chatbot queries.
Below, you will find a query. Please identify and rank the three most important keywords or phrases (n-grams) based on their relevance to the main topic of the query.
For each keyword or phrase, assign it to one of the following categories: ["University / Company", "Research domain", "Country", "Name", "Other"].
An 'n-gram' refers to a contiguous sequence of words, where 'n' can be 1 for a single word, 2 for a pair of words, and so on, up to two words in length.
Please ensure not to list more than three n-grams in total.
Your expertise in linguistic analysis is crucial for the success of this project. Thank you for your contribution.

Please attach your ranked list in the following format:
1. Keyword/Phrase - Category
2. Keyword/Phrase - Category
3. Keyword/Phrase - Category

You must be concise and don't need to justify your choices.
```
{query}
```
"""

# Shared parser instance: KeywordExtractor.__call__ passes the model reply's
# text to output_parser.parse() and iterates over the result as strings.
output_parser = NumberedListOutputParser()
# Format instructions generated by the parser. Not referenced anywhere else in
# the visible source (the prompt above hard-codes its own format section).
# NOTE(review): possibly imported by other modules; confirm before removing.
format_instructions = output_parser.get_format_instructions()


class KeywordExtractor:
    """Extract category-tagged keywords from a query using a chat model.

    The instance is callable: it renders ``query_template`` with the query,
    sends it to the chat model, parses the reply with the module-level
    ``output_parser`` and returns only the keywords whose category was
    requested.
    """

    def __init__(self):
        """Build the prompt -> model chain used by ``__call__``."""
        super().__init__()
        self.model = ChatOpenAI()
        self.prompt = ChatPromptTemplate.from_template(
            template=query_template,
        )

        # The list parser is not piped into the chain; ``__call__`` applies it
        # to the ``content`` of the model reply instead.
        self.chain = self.prompt | self.model

    @staticmethod
    def _select_keywords(
        entries: list[str], filter_categories: list[str]
    ) -> list[str]:
        """Return the keywords of *entries* whose category is wanted.

        Args:
            entries: Strings expected to look like ``"keyword - category"``.
            filter_categories: Categories to keep.

        Returns:
            The keyword part of every matching entry, in input order, with
            surrounding whitespace removed. Entries without a ``" - "``
            separator are skipped.
        """
        selected = []
        for entry in entries:
            # The category is the LAST field, so split on the last " - ".
            # A keyword that itself contains " - " then stays intact instead
            # of breaking the two-way unpacking.
            keyword, separator, category = entry.rpartition(" - ")
            if not separator:
                continue

            if category.strip() in filter_categories:
                selected.append(keyword.strip())

        return selected

    def __call__(
        self, inputs: str, filter_categories: "list[str] | None" = None
    ) -> list[str]:
        """Extract keywords from *inputs* and keep the requested categories.

        Args:
            inputs: The query text substituted into the prompt.
            filter_categories: Categories to keep. When omitted (``None``),
                only ``"Research domain"`` keywords are returned.

        Returns:
            The keywords whose category is in *filter_categories*, in the
            order the model listed them.
        """
        # Build the default per call rather than sharing one mutable list
        # object across all invocations.
        if filter_categories is None:
            filter_categories = ["Research domain"]

        output = self.chain.invoke({"query": inputs})
        entries = output_parser.parse(output.content)

        return self._select_keywords(entries, filter_categories)