jianghuyihei committed on
Commit 863d8a3
1 Parent(s): ef12410

first commit

.gitattributes copy ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,84 @@
+ FROM python:3.9
+ 
+ # Create a user and switch to that user
+ RUN useradd -m -u 1000 user
+ 
+ # Set environment variables
+ ENV PATH="/home/user/.local/bin:$PATH"
+ 
+ # /home/user/.local/bin:/usr/local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
+ RUN echo $PATH
+ 
+ # Set environment variable to avoid interactive prompts during installation
+ ENV DEBIAN_FRONTEND=noninteractive
+ 
+ # Set the working directory
+ WORKDIR /app
+ USER root
+ 
+ # # Install sudo
+ RUN apt-get update && apt-get install -y sudo supervisor
+ 
+ # Clone and install scipdf_parser
+ RUN git clone -q https://github.com/titipata/scipdf_parser.git
+ RUN pip install -q git+https://github.com/titipata/scipdf_parser
+ 
+ # Enter the scipdf_parser directory and install the spaCy model
+ WORKDIR /app/scipdf_parser
+ RUN python -m spacy download en_core_web_sm
+ 
+ # Return to the working directory
+ WORKDIR /app
+ 
+ # Copy requirements.txt and install Python dependencies
+ COPY --chown=user ./requirements.txt requirements.txt
+ RUN pip install -q python-multipart
+ RUN pip install -q --upgrade pip
+ RUN pip install -q --upgrade -r requirements.txt
+ 
+ # Download and extract OpenJDK 11
+ RUN wget -q https://download.oracle.com/java/GA/jdk11/9/GPL/openjdk-11.0.2_linux-x64_bin.tar.gz \
+     && tar -zxvf openjdk-11.0.2_linux-x64_bin.tar.gz > /dev/null \
+     && mv jdk-11.0.2 /opt/ \
+     && rm openjdk-11.0.2_linux-x64_bin.tar.gz
+ 
+ # Set Java environment variables
+ ENV JAVA_HOME=/opt/jdk-11.0.2
+ ENV PATH=$JAVA_HOME/bin:$PATH
+ 
+ # Verify the Java installation
+ RUN java -version
+ 
+ # RUN sudo apt-get install -y maven
+ RUN wget -q https://github.com/kermitt2/grobid/archive/0.7.3.zip
+ RUN unzip 0.7.3.zip > /dev/null
+ 
+ # Install dependencies
+ RUN git clone -q https://github.com/kermitt2/grobid.git
+ 
+ 
+ WORKDIR /app/grobid
+ RUN chmod +x gradlew
+ 
+ RUN ./gradlew clean install --console=plain
+ 
+ WORKDIR /app
+ # Expose ports
+ EXPOSE 8070
+ EXPOSE 7860
+ COPY --chown=user ./start.sh start.sh
+ 
+ RUN chmod +x start.sh
+ RUN chmod -R 777 grobid
+ 
+ # Copy the application code
+ COPY --chown=user . /app
+ CMD ["./start.sh"]
+ 
+ # CMD ["supervisord", "-c", "supervisord.conf"]
+ 
+ # # Set the default command
+ # CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
+ 
+ # "uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"
+ # uvicorn app:app --host "0.0.0.0" --port 7860
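Note: the image exposes two services, GROBID on port 8070 and the FastAPI demo on port 7860, both launched by start.sh (not shown in this hunk). The following smoke test is a sketch, not part of the commit; it assumes GROBID's standard /api/isalive liveness endpoint and uses httpx, which the project already imports elsewhere.

# Hypothetical smoke test for a running container; adjust host/ports if the deployment differs.
import httpx

def check_services(host: str = "localhost") -> None:
    endpoints = {
        "grobid": f"http://{host}:8070/api/isalive",  # GROBID liveness endpoint (assumed default)
        "app": f"http://{host}:7860/",                # FastAPI form page
    }
    for name, url in endpoints.items():
        try:
            r = httpx.get(url, timeout=10)
            print(f"{name}: HTTP {r.status_code}")
        except httpx.HTTPError as e:
            print(f"{name}: unreachable ({e})")

if __name__ == "__main__":
    check_services()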
LLM.py ADDED
@@ -0,0 +1,186 @@
+ from openai import AzureOpenAI, OpenAI, AsyncAzureOpenAI, AsyncOpenAI
+ 
+ from abc import abstractmethod
+ import os
+ import httpx
+ import base64
+ import logging
+ import asyncio
+ import numpy as np
+ from tenacity import (
+     retry,
+     stop_after_attempt,
+     wait_fixed,
+ )
+ 
+ 
+ def get_content_between_a_b(start_tag, end_tag, text):
+     extracted_text = ""
+     start_index = text.find(start_tag)
+     while start_index != -1:
+         end_index = text.find(end_tag, start_index + len(start_tag))
+         if end_index != -1:
+             extracted_text += text[start_index + len(start_tag) : end_index] + " "
+             start_index = text.find(start_tag, end_index + len(end_tag))
+         else:
+             break
+ 
+     return extracted_text.strip()
+ 
+ def before_retry_fn(retry_state):
+     if retry_state.attempt_number > 1:
+         logging.info(f"Retrying API call. Attempt #{retry_state.attempt_number}, {retry_state}")
+ 
+ def encode_image(image_path):
+     with open(image_path, "rb") as image_file:
+         return base64.b64encode(image_file.read()).decode('utf-8')
+ 
+ def get_openai_url(img_pth):
+     end = img_pth.split(".")[-1]
+     if end == "jpg":
+         end = "jpeg"
+     base64_image = encode_image(img_pth)
+     return f"data:image/{end};base64,{base64_image}"
+ 
+ class base_llm:
+     def __init__(self) -> None:
+         pass
+ 
+     @abstractmethod
+     def response(self, messages, **kwargs):
+         pass
+ 
+     def get_imgs(self, prompt, save_path="saves/dalle3.jpg"):
+         pass
+ 
+ 
+ class openai_llm(base_llm):
+     def __init__(self, model="gpt4o-0513") -> None:
+         super().__init__()
+         self.model = model
+         if "AZURE_OPENAI_ENDPOINT" not in os.environ or os.environ["AZURE_OPENAI_ENDPOINT"] == "":
+             raise ValueError("AZURE_OPENAI_ENDPOINT is not set")
+         if "AZURE_OPENAI_KEY" not in os.environ or os.environ["AZURE_OPENAI_KEY"] == "":
+             raise ValueError("AZURE_OPENAI_KEY is not set")
+ 
+         api_version = os.environ.get("AZURE_OPENAI_API_VERSION", None)
+         if api_version == "":
+             api_version = None
+         self.client = AzureOpenAI(
+             azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
+             api_key=os.environ["AZURE_OPENAI_KEY"],
+             api_version=api_version
+         )
+         self.async_client = AsyncAzureOpenAI(
+             azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
+             api_key=os.environ["AZURE_OPENAI_KEY"],
+             api_version=api_version
+         )
+ 
+     def cal_cosine_similarity(self, vec1, vec2):
+         if isinstance(vec1, list):
+             vec1 = np.array(vec1)
+         if isinstance(vec2, list):
+             vec2 = np.array(vec2)
+         return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
+ 
+     @retry(wait=wait_fixed(10), stop=stop_after_attempt(10), before=before_retry_fn)
+     def response(self, messages, **kwargs):
+         try:
+             response = self.client.chat.completions.create(
+                 model=kwargs.get("model", self.model),
+                 messages=messages,
+                 n=kwargs.get("n", 1),
+                 temperature=kwargs.get("temperature", 0.7),
+                 max_tokens=kwargs.get("max_tokens", 4000),
+                 timeout=kwargs.get("timeout", 180)
+             )
+         except Exception as e:
+             model = kwargs.get("model", self.model)
+             print(f"get {model} response failed: {e}")
+             logging.info(e)
+             return
+         return response.choices[0].message.content
+ 
+     @retry(wait=wait_fixed(10), stop=stop_after_attempt(10), before=before_retry_fn)
+     def get_embbeding(self, text):
+         if os.environ.get("EMBEDDING_API_ENDPOINT"):
+             client = AzureOpenAI(
+                 azure_endpoint=os.environ.get("EMBEDDING_API_ENDPOINT", None),
+                 api_key=os.environ.get("EMBEDDING_API_KEY", None),
+                 api_version=os.environ.get("AZURE_OPENAI_API_VERSION", None),
+                 azure_deployment="embedding-3-large"
+             )
+         else:
+             client = self.client
+         try:
+             embbeding = client.embeddings.create(
+                 model=os.environ.get("EMBEDDING_MODEL", "text-embedding-3-large"),
+                 input=text,
+                 timeout=180
+             )
+             return embbeding.data[0].embedding
+         except Exception as e:
+             print(f"get embbeding failed: {e}")
+             logging.info(e)
+             return
+ 
+     async def get_embbeding_async(self, text):
+         if os.environ.get("EMBEDDING_API_ENDPOINT", None):
+             async_client = AsyncAzureOpenAI(
+                 azure_endpoint=os.environ.get("EMBEDDING_API_ENDPOINT", None),
+                 api_key=os.environ.get("EMBEDDING_API_KEY", None),
+                 api_version=os.environ.get("AZURE_OPENAI_API_VERSION", None),
+                 azure_deployment="embedding-3-large"
+             )
+         else:
+             async_client = self.async_client
+ 
+         try:
+             embbeding = await async_client.embeddings.create(
+                 model=os.environ.get("EMBEDDING_MODEL", "text-embedding-3-large"),
+                 input=text,
+                 timeout=180
+             )
+             return embbeding.data[0].embedding
+         except Exception as e:
+             await asyncio.sleep(0.1)
+             print(f"get embbeding failed: {e}")
+             logging.info(e)
+             return
+ 
+     @retry(wait=wait_fixed(10), stop=stop_after_attempt(10), before=before_retry_fn)
+     async def response_async(self, messages, **kwargs):
+         try:
+             response = await self.async_client.chat.completions.create(
+                 model=kwargs.get("model", self.model),
+                 messages=messages,
+                 n=kwargs.get("n", 1),
+                 temperature=kwargs.get("temperature", 0.7),
+                 max_tokens=kwargs.get("max_tokens", 4000),
+                 timeout=kwargs.get("timeout", 180)
+             )
+         except Exception as e:
+             await asyncio.sleep(0.1)
+             model = kwargs.get("model", self.model)
+             print(f"get {model} response failed: {e}")
+             logging.info(e)
+             return
+ 
+         return response.choices[0].message.content
+ 
+ 
+ if __name__ == "__main__":
+     llm = openai_llm()
+     prompt = """
+     """
+     messages = [{"role": "user", "content": prompt}]
+     response = asyncio.run(llm.response_async(messages))
+     print(response)
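For context, a minimal usage sketch of openai_llm (an assumption, not part of the commit): it requires the AZURE_OPENAI_ENDPOINT and AZURE_OPENAI_KEY environment variables to be set, and the deployment name mirrors the default above.

# Hypothetical usage of the wrapper defined in LLM.py.
import asyncio
from LLM import openai_llm

llm = openai_llm(model="gpt4o-0513")
messages = [{"role": "user", "content": "Summarize the Chain-of-Ideas agent in one sentence."}]

# Synchronous call with a lower temperature and shorter output
print(llm.response(messages, temperature=0.2, max_tokens=200))

# Asynchronous call with the default settings
print(asyncio.run(llm.response_async(messages)))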
README.md CHANGED
@@ -1,11 +1,13 @@
  ---
  title: CoI Agent
- emoji: 🌖
- colorFrom: purple
- colorTo: yellow
+ emoji: 🐢
+ colorFrom: indigo
+ colorTo: indigo
  sdk: docker
  pinned: false
  license: apache-2.0
+ app_port: 7860
+ startup_duration_timeout: 1h
  short_description: 'Online demo of paper: Chain of Ideas: Revolutionizing Resear'
  ---
 
agents.py ADDED
@@ -0,0 +1,358 @@
+ import json
+ import time
+ import asyncio
+ import os
+ from searcher import Result, SementicSearcher
+ from LLM import openai_llm
+ from prompts import *
+ from utils import extract
+ 
+ 
+ def get_llm(model="gpt4o-0513"):
+     return openai_llm(model)
+ 
+ def get_llms():
+     main_llm = get_llm("gpt4o-0513")
+     cheap_llm = get_llm("gpt-4o-mini")
+     return main_llm, cheap_llm
+ 
+ async def judge_idea(i, j, idea0, idea1, topic, llm):
+     prompt = get_judge_idea_all_prompt(idea0, idea1, topic)
+     messages = [{"role": "user", "content": prompt}]
+     response = await llm.response_async(messages)
+     novelty = extract(response, "novelty")
+     relevance = extract(response, "relevance")
+     significance = extract(response, "significance")
+     clarity = extract(response, "clarity")
+     feasibility = extract(response, "feasibility")
+     effectiveness = extract(response, "effectiveness")
+     return i, j, novelty, relevance, significance, clarity, feasibility, effectiveness
+ 
+ class DeepResearchAgent:
+     def __init__(self, llm=None, cheap_llm=None, publicationData=None, ban_paper=[], **kwargs) -> None:
+         self.reader = SementicSearcher(ban_paper=ban_paper)
+         self.begin_time = time.time()
+         self.llm = llm
+         self.cheap_llm = cheap_llm
+         self.read_papers = set()
+         self.paper_storage = []
+         self.paper_info_for_refine_experiment = []
+         self.search_qeuries = []
+         self.deep_research_chains = []
+         self.deep_ideas = []
+         self.check_novel_results = []
+         self.score_results = []
+         self.topic = None
+ 
+         self.publicationData = publicationData
+         self.improve_cnt = kwargs.get("improve_cnt", 1)
+         self.max_chain_length = kwargs.get("max_chain_length", 5)
+         self.min_chain_length = kwargs.get("min_chain_length", 3)
+         self.max_chain_numbers = kwargs.get("max_chain_numbers", 10)
+ 
+     def wrap_messages(self, prompt):
+         return [{"role": "user", "content": prompt}]
+ 
+     async def get_openai_response_async(self, messages):
+         return await self.llm.response_async(messages)
+ 
+     async def get_cheap_openai_response_async(self, messages):
+         return await self.cheap_llm.response_async(messages, max_tokens=16000)
+ 
+     async def get_search_query(self, topic=None, query=None):
+         prompt = get_deep_search_query_prompt(topic, query)
+         messages = self.wrap_messages(prompt)
+         response = await self.get_openai_response_async(messages)
+         search_query = extract(response, "queries")
+         try:
+             search_query = json.loads(search_query)
+             self.search_qeuries.append({"query": query, "search_query": search_query})
+         except:
+             search_query = [query]
+         return search_query
+ 
+     async def generate_idea_with_chain(self, topic):
+         self.topic = topic
+         print(f"begin to generate search query for {topic}")
+         search_query = await self.get_search_query(topic=topic)
+         papers = []
+         for query in search_query:
+             failed_query = []
+             current_papers = []
+             cnt = 0
+             while len(current_papers) == 0 and cnt < 10:
+                 paper = await self.reader.search_async(query, 1, paper_list=self.read_papers, llm=self.llm, rerank_query=f"{topic}", publicationDate=self.publicationData)
+                 if paper and len(paper) > 0 and paper[0]:
+                     self.read_papers.add(paper[0].title)
+                     current_papers.append(paper[0])
+                 else:
+                     failed_query.append(query)
+                     prompt = get_deep_rewrite_query_prompt(failed_query, topic)
+                     messages = self.wrap_messages(prompt)
+                     new_query = await self.get_openai_response_async(messages)
+                     new_query = extract(new_query, "query")
+                     print(f"Failed to search papers for {query}, regenerating query {new_query} to search papers.")
+                     query = new_query
+                 cnt += 1
+             papers.extend(current_papers)
+             if len(papers) >= self.max_chain_numbers:
+                 break
+ 
+         if len(papers) == 0:
+             print(f"failed to generate idea {topic}")
+             return None, None, None, None, None, None, None, None, None
+ 
+         tasks = [self.deep_research_paper_with_chain(paper) for paper in papers]
+         results = await asyncio.gather(*tasks)
+         results = [result for result in results if result]
+         if len(results) == 0:
+             print(f"failed to generate idea {topic}")
+             return None, None, None, None, None, None, None, None, None
+ 
+         ideas, idea_chains, experiments, entities, trends, futures, humans, years = [[result[i] for result in results] for i in range(8)]
+ 
+         tasks = []
+         for i, idea_1 in enumerate(ideas):
+             for j, idea_2 in enumerate(ideas):
+                 if i != j:
+                     tasks.append(judge_idea(i, j, idea_1, idea_2, topic, self.llm))
+         results = await asyncio.gather(*tasks)
+         elo_scores = [0 for _ in range(len(ideas))]
+         elo_selected = 0
+         def change_winner_to_score(winner, score_1, score_2):
+             try:
+                 winner = int(winner)
+             except:
+                 return score_1 + 0.5, score_2 + 0.5
+             if winner == 0:
+                 return score_1 + 1, score_2
+             if winner == 2:
+                 return score_1 + 0.5, score_2 + 0.5
+             return score_1, score_2 + 1
+         for result in results:
+             i, j, novelty, relevance, significance, clarity, feasibility, effectiveness = result
+             for dimension in [novelty, relevance, significance, clarity, feasibility, effectiveness]:
+                 elo_scores[i], elo_scores[j] = change_winner_to_score(dimension, elo_scores[i], elo_scores[j])
+             print(f"i:{i},j:{j},novelty:{novelty},relevance:{relevance},significance:{significance},clarity:{clarity},feasibility:{feasibility},effectiveness:{effectiveness}")
+         print(elo_scores)
+         try:
+             elo_selected = elo_scores.index(max(elo_scores))
+         except:
+             elo_selected = 0
+ 
+         idea, experiment, entities, idea_chain, trend, future, human, year = ideas[elo_selected], experiments[elo_selected], entities[elo_selected], idea_chains[elo_selected], trends[elo_selected], futures[elo_selected], humans[elo_selected], years[elo_selected]
+         print(f"successfully generated idea")
+         return idea, experiment, entities, idea_chain, ideas, trend, future, human, year
+ 
+     async def get_paper_idea_experiment_references_info(self, paper):
+         article = paper.article
+         if not article:
+             return None
+         paper_content = self.reader.read_paper_content(article)
+         prompt = get_deep_reference_prompt(paper_content, self.topic)
+         messages = self.wrap_messages(prompt)
+         response = await self.get_cheap_openai_response_async(messages)
+         entities = extract(response, "entities")
+         idea = extract(response, "idea")
+         experiment = extract(response, "experiment")
+         references = extract(response, "references")
+         return idea, experiment, entities, references, paper.title
+ 
+     async def get_article_idea_experiment_references_info(self, article):
+         paper_content = self.reader.read_paper_content_with_ref(article)
+         prompt = get_deep_reference_prompt(paper_content, self.topic)
+         messages = self.wrap_messages(prompt)
+         response = await self.get_cheap_openai_response_async(messages)
+         entities = extract(response, "entities")
+         idea = extract(response, "idea")
+         experiment = extract(response, "experiment")
+         references = extract(response, "references")
+         return idea, experiment, entities, references
+ 
+ 
+     async def deep_research_paper_with_chain(self, paper: Result):
+         print(f"begin to deep research paper {paper.title}")
+         article = paper.article
+         if not article:
+             print(f"failed to deep research paper {paper.title}")
+             return None
+         idea_chain = []
+         idea_papers = []
+         experiments = []
+         total_entities = []
+         years = []
+         idea, experiment, entities, references = await self.get_article_idea_experiment_references_info(article)
+         try:
+             references = json.loads(references)
+         except:
+             references = []
+         total_entities.append(entities)
+         idea_chain.append(idea)
+         idea_papers.append(paper.title)
+         experiments.append(experiment)
+         years.append(paper.year)
+ 
+         current_title = paper.title
+         current_abstract = paper.abstract
+ 
+         # search before
+         while len(idea_chain) < self.max_chain_length:
+             rerank_query = f"{self.topic} {current_title} {current_abstract}"
+             citation_paper = await self.reader.search_related_paper_async(current_title, need_reference=False, rerank_query=rerank_query, llm=self.llm, paper_list=idea_papers)
+             if not citation_paper:
+                 print(f"failed to find citation paper for {current_title}")
+                 break
+             title = citation_paper.title
+             abstract = citation_paper.abstract
+             prompt = get_deep_judge_relevant_prompt(current_title, current_abstract, self.topic)
+             messages = self.wrap_messages(prompt)
+             response = await self.get_openai_response_async(messages)
+             relevant = extract(response, "relevant")
+             if relevant != "0":
+                 result = await self.get_paper_idea_experiment_references_info(citation_paper)
+                 if not result:
+                     break
+                 idea, experiment, entities, _, _ = result
+                 idea_chain.append(idea)
+                 experiments.append(experiment)
+                 total_entities.append(entities)
+                 idea_papers.append(citation_paper.title)
+                 years.append(citation_paper.year)
+                 current_title = citation_paper.title
+                 current_abstract = citation_paper.abstract
+             else:
+                 print(f"the paper {title} is not relevant")
+                 break
+ 
+         current_title = paper.title
+         current_abstract = paper.abstract
+         # search after
+         while len(idea_chain) < self.max_chain_length and len(references) > 0:
+             search_paper = []
+             article = None
+             print(f"The references find:{references}")
+             while len(references) > 0 and len(search_paper) == 0:
+                 reference = references[0]
+                 references.pop(0)
+                 if reference in self.read_papers:
+                     continue
+                 search_paper = await self.reader.search_async(reference, 3, llm=self.llm, publicationDate=self.publicationData, paper_list=idea_papers)
+                 if len(search_paper) > 0:
+                     s_p = search_paper[0]
+                     if s_p and s_p.title not in self.read_papers:
+                         prompt = get_deep_judge_relevant_prompt(current_title, current_abstract, self.topic)
+                         messages = self.wrap_messages(prompt)
+                         response = await self.get_openai_response_async(messages)
+                         relevant = extract(response, "relevant")
+                         if relevant != "0" or len(idea_chain) < self.min_chain_length:
+                             article = s_p.article
+                             if article:
+                                 cite_paper = s_p
+                                 break
+                         else:
+                             print(f"the paper {s_p.title} is not relevant")
+                     search_paper = []
+ 
+             if not article:
+                 rerank_query = f"topic: {self.topic} Title: {current_title} Abstract: {current_abstract}"
+                 search_paper = await self.reader.search_related_paper_async(current_title, need_citation=False, rerank_query=rerank_query, llm=self.llm, paper_list=idea_papers)
+                 if not search_paper:
+                     print(f"failed to find citation paper for {current_title}")
+                     continue
+                 s_p = search_paper
+                 if len(idea_chain) < self.min_chain_length:
+                     article = s_p.article
+                     if not article:
+                         continue
+                     else:
+                         cite_paper = s_p
+                         break
+                 else:
+                     if s_p and s_p.title not in self.read_papers:
+                         prompt = get_deep_judge_relevant_prompt(current_title, current_abstract, self.topic)
+                         messages = self.wrap_messages(prompt)
+                         response = await self.get_openai_response_async(messages)
+                         relevant = extract(response, "relevant")
+                         if relevant == "1" or len(idea_chain) < self.min_chain_length:
+                             article = s_p.article
+                             if not article:
+                                 continue
+                             else:
+                                 cite_paper = s_p
+                                 break
+             if not article:
+                 print(f"failed to find citation paper for {current_title}")
+                 continue
+ 
+             print("find the citation paper, begin to deep research")
+             paper_content = self.reader.read_paper_content_with_ref(article)
+             prompt = get_deep_reference_prompt(paper_content, self.topic)
+             messages = self.wrap_messages(prompt)
+             response = await self.get_cheap_openai_response_async(messages)
+             idea = extract(response, "idea")
+             references = extract(response, "references")
+             experiment = extract(response, "experiment")
+             entities = extract(response, "entities")
+             try:
+                 references = json.loads(references)
+             except:
+                 references = []
+             current_title = cite_paper.title
+             current_abstract = cite_paper.abstract
+             years = [cite_paper.year] + years
+             idea_chain = [idea] + idea_chain
+             idea_papers = [cite_paper.title] + idea_papers
+             experiments = [experiment] + experiments
+             total_entities = [entities] + total_entities
+             if len(idea_chain) >= self.min_chain_length:
+                 if cite_paper.citations_conut > 1000:
+                     break
+ 
+         print("successfully generate idea chain")
+         idea_chains = ""
+         for i, idea, title in zip(range(len(idea_chain)), idea_chain, idea_papers):
+             idea_chains += f"{i}.Paper:{title} idea:{idea}\n \n"
+ 
+         prompt = get_deep_trend_idea_chains_prompt(idea_chains, entities, self.topic)
+         messages = self.wrap_messages(prompt)
+         response = await self.get_openai_response_async(messages)
+         trend = extract(response, "trend")
+ 
+         self.deep_research_chains.append({"idea_chains": idea_chains, "trend": trend, "topic": self.topic, "ideas": idea_chain, "experiments": experiments, "entities": total_entities, "years": years})
+         prompt = f"""The current research topic is: {self.topic}. Please help me summarize and refine the following entities by merging, simplifying, or deleting them : {total_entities}
+         Please output strictly in the following format:
+         <entities> {{cleaned entities}}</entities>
+         """
+         messages = self.wrap_messages(prompt)
+         response = await self.get_openai_response_async(messages)
+         total_entities = extract(response, "entities")
+         bad_case = []
+         prompt = get_deep_generate_future_direciton_prompt(idea_chain, trend, self.topic, total_entities)
+         messages = self.wrap_messages(prompt)
+         response = await self.get_openai_response_async(messages)
+         future = extract(response, "future")
+         human = extract(response, "human")
+ 
+ 
+         prompt = get_deep_generate_idea_prompt(idea_chains, trend, self.topic, total_entities, future, bad_case)
+         messages = self.wrap_messages(prompt)
+         response = await self.get_openai_response_async(messages)
+         method = extract(response, "method")
+         novelty = extract(response, "novelty")
+         motivation = extract(response, "motivation")
+         idea = {"motivation": motivation, "novelty": novelty, "method": method}
+         prompt = get_deep_final_idea_prompt(idea_chains, trend, idea, self.topic)
+         messages = self.wrap_messages(prompt)
+         response = await self.get_openai_response_async(messages)
+         final_idea = extract(response, "final_idea")
+ 
+         idea = final_idea
+         self.deep_ideas.append(idea)
+         print(f"successfully deep research paper {paper.title}")
+         return idea, idea_chains, trend, experiments, total_entities, future, human, years
+ 
+ 
+ if __name__ == "__main__":
+     reader = SementicSearcher()
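A minimal driver sketch for DeepResearchAgent follows; it is an assumption, not part of the commit, and presumes the repo's searcher, prompts, and utils modules are importable and the Azure OpenAI environment variables are configured. It mirrors how app.py invokes the agent.

# Hypothetical driver; the first element of the returned tuple is the final idea text.
import asyncio
from agents import DeepResearchAgent, get_llms

main_llm, cheap_llm = get_llms()
agent = DeepResearchAgent(llm=main_llm, cheap_llm=cheap_llm, max_chain_numbers=1)

result = asyncio.run(agent.generate_idea_with_chain("Using diffusion to generate road layout"))
idea = result[0]
print(idea)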
app.py ADDED
@@ -0,0 +1,345 @@
+ from fastapi import FastAPI, Form
+ from fastapi.responses import HTMLResponse
+ from jinja2 import Template
+ import markdown
+ import time
+ from datetime import datetime, timedelta
+ from apscheduler.schedulers.background import BackgroundScheduler
+ import asyncio
+ from agents import DeepResearchAgent, get_llms
+ 
+ app = FastAPI()
+ 
+ # Maximum number of replies per day
+ MAX_REPLIES_PER_DAY = 100
+ # Counter for today's replies
+ reply_count = 0
+ # Record the reset time at startup
+ last_reset_time = datetime.now()
+ 
+ # HTML template
+ html_template = """
+ <!DOCTYPE html>
+ <html>
+ <head>
+     <title>CoI Agent online demo 😊</title>
+     <style>
+         body {
+             font-family: 'Arial', sans-serif;
+             background-color: #f4f4f9;
+             margin: 0;
+             padding: 0;
+             display: flex;
+             justify-content: center;
+             align-items: center;
+             min-height: 100vh;
+         }
+         .container {
+             width: 95%;
+             max-width: 1200px;
+             background-color: #fff;
+             padding: 2rem;
+             border-radius: 10px;
+             box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
+         }
+         h1 {
+             font-size: 2rem;
+             margin-bottom: 1.5rem;
+             color: #333;
+             text-align: center;
+         }
+         form {
+             margin-bottom: 1.5rem;
+         }
+         .form-group {
+             display: flex;
+             justify-content: space-between;
+             align-items: center;
+             margin-bottom: 1.5rem;
+         }
+         .form-group label {
+             flex: 0;
+             font-size: 1rem; /* larger font */
+             color: #333;
+             margin-right: 0.5rem;
+             background-color: #f0f8ff; /* bubble background color */
+             padding: 0.5rem 1rem; /* bubble padding */
+             border-radius: 10px; /* rounded bubble corners */
+             text-shadow: 1px 1px 2px rgba(0, 0, 0, 0.1); /* decorative text effect */
+             font-family: 'Times new roman', cursive, sans-serif; /* decorative font */
+             box-shadow: 0 0 5px rgba(0, 0, 0, 0.1); /* bubble shadow */
+         }
+         .form-group input {
+             flex: 4;
+             padding: 0.6rem;
+             font-size: 1rem;
+             border: 1px solid #ccc;
+             border-radius: 5px;
+             margin-left: 1rem;
+         }
+         .form-group button {
+             flex: 0;
+             padding: 0.6rem 1rem;
+             font-size: 1rem;
+             background-color: #F2A582;
+             color: #fff;
+             border: none;
+             border-radius: 5px;
+             cursor: pointer;
+             transition: background-color 0.3s ease;
+             margin-left: 1rem;
+         }
+         .form-group button:hover {
+             background-color: #0056b3;
+         }
+         .loading,
+         .time-box,
+         .counter-box,
+         .result,
+         .error {
+             margin-top: 1.5rem;
+         }
+         .loading {
+             font-size: 1.2rem;
+             color: #007bff;
+             animation: fadeIn 0.5s ease-in-out;
+             text-align: center;
+         }
+         .time-counter-container {
+             display: flex;
+             justify-content: space-between;
+         }
+         .time-box,
+         .counter-box {
+             display: inline-block;
+             padding: 0.5rem 1rem;
+             background-color: #e9ecef;
+             border-radius: 10px;
+             box-shadow: 0 0 5px rgba(0, 0, 0, 0.1);
+             font-size: 0.9rem;
+             margin: 0.5rem;
+             flex: 1;
+             text-align: center;
+         }
+         .result {
+             display: flex;
+             justify-content: space-between;
+             flex-wrap: wrap;
+         }
+         .result .box {
+             flex: 1;
+             margin: 0.5rem;
+             padding: 1rem;
+             background-color: #e9ecef;
+             border-radius: 10px;
+             box-shadow: 0 0 5px rgba(0, 0, 0, 0.1);
+             word-wrap: break-word;
+             height: 400px;
+             overflow-y: auto;
+             font-size: 1rem;
+             font-family: "Times New Roman", Times, serif;
+             line-height: 1.5;
+         }
+         .error .box {
+             width: 100%;
+             padding: 1rem;
+             background-color: #f8d7da;
+             color: #721c24;
+             border-radius: 10px;
+             box-shadow: 0 0 5px rgba(0, 0, 0, 0.1);
+             word-wrap: break-word;
+         }
+         h2 {
+             font-size: 1.3rem;
+             margin-bottom: 1rem;
+             color: #333;
+         }
+         @keyframes fadeIn {
+             from { opacity: 0; }
+             to { opacity: 1; }
+         }
+         .progress-bar-container {
+             width: 100%;
+             background-color: #e9ecef;
+             border-radius: 10px;
+             overflow: hidden;
+             margin-top: 1.5rem;
+             box-shadow: 0 0 5px rgba(0, 0, 0, 0.1);
+         }
+         .progress-bar {
+             height: 20px;
+             background-color: #727372;
+             width: 0%;
+             transition: width 0.1s ease;
+         }
+         .example-container {
+             display: flex;
+             justify-content: space-between;
+             align-items: center;
+             margin-bottom: 1.5rem;
+         }
+         .example-label {
+             flex: 0.7;
+             font-size: 1rem;
+             color: #333;
+             text-align: center;
+             margin-right: 0rem;
+             padding: 0.5rem 0.2rem;
+             background-color: #f0f8ff;
+             border-radius: 10px;
+             box-shadow: 0 0 5px rgba(0, 0, 0, 0.1);
+             font-family: 'Times new roman', cursive, sans-serif;
+             text-shadow: 1px 1px 2px rgba(0, 0, 0, 0.1);
+             box-shadow: 0 0 5px rgba(0, 0, 0, 0.1);
+         }
+         .example-topics {
+             flex: 6;
+             display: flex;
+             justify-content: space-around;
+         }
+         .example-topics button {
+             padding: 0.5rem 1rem;
+             font-size: 1rem;
+             background-color: #ffa07a; /* light orange */
+             color: #fff;
+             border: none;
+             border-radius: 5px;
+             cursor: pointer;
+             margin: 0.3rem;
+             transition: background-color 0.3s ease;
+         }
+         .example-topics button:hover {
+             background-color: #ff4500; /* dark orange */
+         }
+     </style>
+     <script>
+         let startTime = 0;
+         let intervalId = null;
+         let progressIntervalId = null;
+         let maxTime = 180; // maximum time: 180 seconds
+         function showLoading() {
+             document.getElementById("loading").style.display = "block";
+             document.getElementById("submit-btn").disabled = true;
+             startTime = Date.now();
+             intervalId = setInterval(updateTime, 100);
+             progressIntervalId = setInterval(updateProgressBar, 100);
+         }
+         function hideLoading() {
+             document.getElementById("loading").style.display = "none";
+             document.getElementById("submit-btn").disabled = false;
+             if (intervalId) {
+                 clearInterval(intervalId);
+                 intervalId = null;
+             }
+             if (progressIntervalId) {
+                 clearInterval(progressIntervalId);
+                 progressIntervalId = null;
+             }
+             updateProgressBar(100); // immediately set the progress bar to 100%
+         }
+         function updateTime() {
+             const now = Date.now();
+             const elapsed = ((now - startTime) / 1000).toFixed(2);
+             document.getElementById("time-taken").innerText = `Time Taken: ${elapsed} s`;
+         }
+         function updateProgressBar(percentage = null) {
+             const progressBar = document.getElementById("progress-bar");
+             if (percentage !== null) {
+                 progressBar.style.width = `${percentage}%`;
+             } else {
+                 const now = Date.now();
+                 const elapsed = (now - startTime) / 1000;
+                 const progress = Math.min((elapsed / maxTime) * 60, 97);
+                 progressBar.style.width = `${progress}%`;
+             }
+         }
+         function fillTopic(topic) {
+             document.getElementById("topic").value = topic;
+         }
+     </script>
+ </head>
+ <body>
+     <div class="container">
+         <h1>CoI Agent online demo 😊</h1>
+         <div class="time-counter-container">
+             <div id="time-taken" class="time-box">Time Taken: {{ time_taken }} seconds</div>
+             <div class="counter-box">Today's Replies: {{ reply_count }}</div>
+         </div>
+         <div class="example-container">
+             <div class="example-label">Example Input:</div>
+             <div class="example-topics">
+                 <button onclick="fillTopic('Realistic Image Synthesis in Medical Imaging')">Realistic Image Synthesis in Medical Imaging</button>
+                 <button onclick="fillTopic('Using diffusion to generate road layout')">Using diffusion to generate road layout</button>
+                 <button onclick="fillTopic('Using LLM-based agent to generate idea')">Using LLM-based agent to generate idea</button>
+             </div>
+         </div>
+         <form action="/" method="post" onsubmit="showLoading()">
+             <div class="form-group">
+                 <label for="topic">Topic:</label>
+                 <input type="text" id="topic" name="topic">
+                 <button type="submit" id="submit-btn">Generate</button>
+             </div>
+         </form>
+         <div id="loading" class="loading">Generating content, usually takes 3-4 minutes, please wait...</div>
+         <div class="progress-bar-container">
+             <div id="progress-bar" class="progress-bar"></div>
+         </div>
+         <div class="result">
+             <div class="box">
+                 <h2>Idea</h2>
+                 <div>{{ idea | safe }}</div>
+             </div>
+         </div>
+         {% if error %}
+         <div class="error">
+             <div class="box">
+                 <h2>Error</h2>
+                 <div>{{ error }}</div>
+             </div>
+         </div>
+         {% endif %}
+     </div>
+     <script>
+         hideLoading();
+     </script>
+ </body>
+ </html>
+ """
+ 
+ # Reset the daily counter
+ def reset_counter():
+     global reply_count
+     reply_count = 0
+ 
+ # Schedule the counter to reset at 00:00 every day
+ scheduler = BackgroundScheduler()
+ scheduler.add_job(reset_counter, 'cron', hour=0, minute=0)
+ scheduler.start()
+ 
+ @app.get("/", response_class=HTMLResponse)
+ def form_get():
+     return Template(html_template).render(idea="This is an example of the idea generation", error=None, reply_count=reply_count)
+ 
+ @app.post("/", response_class=HTMLResponse)
+ def form_post(topic: str = Form(...)):
+     global reply_count
+     start_time = time.time()
+     # Check whether the daily reply limit has been reached
+     if reply_count >= MAX_REPLIES_PER_DAY:
+         error_message = "Today's maximum number of replies has been reached. Please try again tomorrow."
+         return Template(html_template).render(idea="", error=error_message, reply_count=reply_count)
+     try:
+         main_llm, cheap_llm = get_llms()
+         deep_research_agent = DeepResearchAgent(llm=main_llm, cheap_llm=cheap_llm, improve_cnt=1, max_chain_length=5, min_chain_length=3, max_chain_numbers=1)
+         print(f"begin to generate idea of topic {topic}")
+         idea, related_experiments, entities, idea_chain, ideas, trend, future, human, year = asyncio.run(deep_research_agent.generate_idea_with_chain(topic))
+         idea_md = markdown.markdown(idea)
+         # Update the daily reply count
+         reply_count += 1
+         end_time = time.time()
+         time_taken = round(end_time - start_time, 2)
+         return Template(html_template).render(idea=idea_md, error=None, reply_count=reply_count, time_taken=time_taken)
+     except Exception as e:
+         end_time = time.time()
+         time_taken = round(end_time - start_time, 2)
+         return Template(html_template).render(idea="", error=str(e), reply_count=reply_count, time_taken=time_taken)
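For local debugging outside Docker, a minimal way to serve this app is sketched below; this is an assumption rather than part of the commit, since the Space itself starts everything through start.sh.

# Hypothetical local entry point for the FastAPI app defined in app.py.
import uvicorn
from app import app

if __name__ == "__main__":
    # Port 7860 matches the EXPOSE/app_port settings in this commit.
    uvicorn.run(app, host="0.0.0.0", port=7860)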
main.py ADDED
@@ -0,0 +1,38 @@
+ from agents import DeepResearchAgent, ReviewAgent, get_llms
+ import asyncio
+ import json
+ import argparse
+ 
+ if __name__ == '__main__':
+ 
+     argparser = argparse.ArgumentParser()
+     argparser.add_argument("--topic", type=str, help="research topic", default="Using diffusion to generate urban road layout map")
+     argparser.add_argument("--anchor_paper_path", type=str, help="PDF path of the anchor paper", default=None)
+     argparser.add_argument("--save_file", type=str, default="saves/", help="save file path")
+     argparser.add_argument("--improve_cnt", type=int, default=1, help="experiment refine count")
+     argparser.add_argument("--max_chain_length", type=int, default=5, help="max chain length")
+     argparser.add_argument("--min_chain_length", type=int, default=3, help="min chain length")
+     argparser.add_argument("--max_chain_numbers", type=int, default=1, help="max chain numbers")
+ 
+     args = argparser.parse_args()
+ 
+     main_llm, cheap_llm = get_llms()
+ 
+     topic = args.topic
+     anchor_paper_path = args.anchor_paper_path
+ 
+ 
+     review_agent = ReviewAgent(save_file=args.save_file, llm=main_llm, cheap_llm=cheap_llm)
+     deep_research_agent = DeepResearchAgent(llm=main_llm, cheap_llm=cheap_llm, **vars(args))
+ 
+     print(f"begin to generate idea and experiment of topic {topic}")
+     idea, related_experiments, entities, idea_chain, ideas, trend, future, human, year = asyncio.run(deep_research_agent.generate_idea_with_chain(topic, anchor_paper_path))
+     experiment = asyncio.run(deep_research_agent.generate_experiment(idea, related_experiments, entities))
+ 
+     for i in range(args.improve_cnt):
+         experiment = asyncio.run(deep_research_agent.improve_experiment(review_agent, idea, experiment, entities))
+ 
+     print(f"succeed to generate idea and experiment of topic {topic}")
+     res = {"idea": idea, "experiment": experiment, "related_experiments": related_experiments, "entities": entities, "idea_chain": idea_chain, "ideas": ideas, "trend": trend, "future": future, "year": year, "human": human}
+     with open("result.json", "w") as f:
+         json.dump(res, f)
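After a run, main.py writes its outputs to result.json. A small sketch of inspecting that file follows; the field names are taken from the res dict above, and the rest is an assumption.

# Reads the result.json produced by main.py and prints two of its fields.
import json

with open("result.json") as f:
    res = json.load(f)

print(res["idea"])   # final generated idea
print(res["trend"])  # summarized research trend behind the idea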
prompts/__init__.py ADDED
@@ -0,0 +1,4 @@
+ # from .idea_refiner_prompts import *
+ from .juder_prompts import *
+ from .review_agent_prompts import *
+ from .deep_research_agent_promts import *
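The prompt builders in the next file all instruct the model to wrap each field in XML-style tags, which agents.py parses with `extract` from `utils`. That module is not part of this commit; below is a minimal sketch of such a helper, assuming it behaves like `get_content_between_a_b` in LLM.py.

# Hypothetical stand-in for utils.extract; the real implementation is not included in this commit.
def extract(text: str, tag: str) -> str:
    """Return the content between <tag> and </tag>, or an empty string if absent."""
    start_tag, end_tag = f"<{tag}>", f"</{tag}>"
    start = text.find(start_tag)
    if start == -1:
        return ""
    end = text.find(end_tag, start + len(start_tag))
    if end == -1:
        return ""
    return text[start + len(start_tag):end].strip()

# Example: extract("<relevant>1</relevant>", "relevant") returns "1"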
prompts/deep_research_agent_promts.py ADDED
@@ -0,0 +1,465 @@
1
+ use_entities = True
2
+
3
+ def get_deep_search_query_prompt(topic = None,query = None) -> str:
4
+ if topic and query:
5
+ prompt = f"""
6
+ You are a master of literature searcher, tasked with finding relevant research literatures based on a specific topic and idea.
7
+
8
+ Currently, we would like to study the following topic: {topic}.
9
+ And we have the following idea: {query}.
10
+
11
+ Please provide the literature search queries you would use to search for papers related to the topic and idea.
12
+ """
13
+ elif topic:
14
+ prompt = f"""
15
+ You are a master of literature searcher, tasked with finding relevant research literatures based on a specific topic.
16
+
17
+ Currently, we would like to study the following topic: {topic}.
18
+
19
+ Please provide the literature search queries you would use to search for papers related to the topic.
20
+ """
21
+ elif query:
22
+ prompt = f"""
23
+ You are a master of literature searcher, tasked with finding relevant research literatures based on a specific idea.
24
+
25
+ Currently, we would like to search for papers related to the following idea: {query}.
26
+
27
+ Please provide the literature search querie syou would use to search for papers related to the paper idea.
28
+ """
29
+ output_format = """
30
+ Each query should be a string and should be enclosed in double quotes.It is best to output one query representing the whole and other queries representing other different aspects of the whole.(no more than 5 queries)
31
+
32
+ Output strictly in the following format:
33
+ <queries>["query1", "query2", ...]</queries>
34
+
35
+ For example:
36
+ <queries>["Reducing reliance on large-scale annotated data and closed-source models for planning in QA tasks","Automatic agent learning for QA","QA task planning with minimal human intervention", "Few-shot learning for QA"]</queries>
37
+ """
38
+ return prompt + output_format
39
+
40
+ def get_deep_check_idea_novel_search_query_prompt(idea,topic: str) -> str:
41
+ prompt = f"""
42
+ You are a scientific research expert.
43
+ Your task is to check whether the target idea is similar to existing research.
44
+
45
+ The target idea you need to check is as follows:{idea}
46
+
47
+ The topic you are studying is: {topic}
48
+
49
+ Please provide multiple search queries to find relevant papers that can help you determine whether the idea is novel(no more than 3 queries).
50
+
51
+ Output strictly in the following format:
52
+ <queries>["query1", "query2", "query3"]</queries>
53
+
54
+ For example:
55
+ <queries>["Reducing reliance on large-scale annotated data and closed-source models for planning in QA tasks","Automatic agent learning for QA","QA task planning with minimal human intervention"]</queries>
56
+ """
57
+ return prompt
58
+
59
+
60
+
61
+
62
+ def get_deep_rewrite_query_prompt(failed_query,topic):
63
+ prompt = f"""
64
+ You are a master of search engine query writing. We want to utilize the literature search engine to find relevant paper.
65
+
66
+ The queries that have been used so far are as follows: {failed_query}. Unfortunately, no satisfactory answers were found. Please rewrite a query to help us locate the literature we need (do not repeat the failed query).
67
+
68
+ The topic you are studying is: {topic}.
69
+ Please provide a new search query to find the relevant papers.
70
+
71
+ Try to make your query more concise and general so that it can be used to search for a wide range of papers.
72
+ If you failed more than 5 times, you can use a short query(no more than 5 words) to search for the paper.
73
+ Please output strictly in the following format:
74
+ <query>{{new query}}</query>
75
+
76
+ For example:
77
+ <query>Reducing reliance on large-scale annotated data and closed-source models for planning in QA tasks</query>
78
+ """
79
+ return prompt
80
+
81
+
82
+ def get_deep_reference_prompt(paper_content: str,topic) -> str:
83
+ prompt = f"""
84
+ You are a scientific research expert, tasked with extracting and summarizing information from provided paper content relevant to the topic: {topic}. Your deliverables will include pertinent references, extracted entities, a detailed summary, and the experimental design.
85
+
86
+ The topic you are studying is: {topic}. (Ensure that the references are pertinent to this topic.)
87
+
88
+ Extraction Requirements:
89
+ Entities
90
+ 1. Identify unique entities mentioned in the paper, such as model names, datasets, metrics, and specialized terminology.
91
+ 2. Format the entities with a name followed by a brief description.
92
+ 3. Ensure all entities are relevant to the specified topic ([topic]).
93
+
94
+
95
+ Summary Idea:
96
+ 1. Background: Elaborate on the task's context and previous work, outlining the starting point of this paper.
97
+ 2. Novelty: Describe the main innovations and contributions of this paper in comparison to prior work.
98
+ 3. Contribution: Explain the primary methods used, detailing the theory and functions of each core component.
99
+ 4. Detail Reason: Provide a thorough explanation of why the chosen methods are effective, including implementation details for further research.
100
+ 5. Limitation: Discuss current shortcomings of the approach.
101
+
102
+ Experimental Content:
103
+ 1. Experimental Process: Detail the entire experimental procedure, from dataset construction to specific steps, ensuring clarity and thoroughness.
104
+ 2. Technical Details: Describe any specific technologies involved, providing detailed implementation processes.
105
+ 3. Clarity of Plan: State your experimental plan concisely to facilitate understanding without unnecessary complexity.
106
+ 4. Baseline: Elaborate on the baseline used, comparative methods, and experimental design, illustrating how these support and validate the conclusions drawn.
107
+ 5. Verification: Explain how your experimental design assists in verifying the core idea and ensure it is detailed and feasible.
108
+
109
+ Relevance Criteria:
110
+ 1. Method Relevance: References must directly correlate with the paper's methodology, indicating improvements or modifications.
111
+ 2. Task Relevance: References should address the same task, even if methods differ, better have the same topic {topic}.
112
+ 3. Baseline Relevance: References should serve as baselines for the methods discussed in the paper.
113
+ 4. Output Format: Provide references without author names or publication years, formatted as titles only.
114
+ 5. Specific paper titles will be placed between <References></References>. Based on the precise citation location and the corresponding ref_id in the paper, you need to infer the specific title of your output relevant references.
115
+
116
+
117
+ The paper content is as follows:
118
+ {paper_content}
119
+
120
+
121
+ Please provide the entities, summary idea, experimental design, and the three most relevant references (Sort by relevance, with priority given to new ones with the same level of relevance, do not reference the original paper.) based on the paper's content.
122
+ Note: Ensure the references are pertinent to the topic you are studying: {topic}. If there are no relevant references, output <references>[]</references>.
123
+
124
+ Now please output strictly in the following format:
125
+ <entities>{{A list of entities you extract}}</entities>
126
+ <idea>{{Background: ... \nNovelty: ...\nContribution:...\nMethods:...\nDetail reason:...\nLimitation:...\n }}</idea>
127
+ <experiment>{{Step1:... Step2:...}}</experiment>
128
+ <references>["{{Title1}}", "{{Title2}}", ...]</references>
129
+ """
130
+ return prompt
131
+
132
+
133
+ def get_deep_trend_idea_chains_prompt(idea_chains,entities,topic) -> str:
134
+ entities = f"""
135
+ Here are the entities you need to know: {entities}
136
+ """ if use_entities else ""
137
+ prompt = f"""
138
+ You are a scientific research expert tasked with summarizing the historical progression of research related to our current topic, based on the literature we have reviewed.
139
+
140
+ {entities}
141
+
142
+ The topic you are studying is: {topic}
143
+
144
+ The literature from early to late: {idea_chains}
145
+
146
+ Your objective is to outline the historical evolution of the research in light of current trends. Please follow these requirements:
147
+ Analysis of Published Viewpoints: Examine the progression of ideas across the identified papers. Detail how each paper transitions to the next—for instance, how Paper 0 leads to Paper 1, and so forth. Focus on understanding how Paper 1 builds upon the concepts in Paper 0. Elaborate on specific advancements made, including proposed modules, their designs, and the rationale behind their effectiveness in addressing previous challenges. Apply this analytical approach to each paper in the sequence.
148
+
149
+
150
+ Please present your findings in the following format:
151
+ <trend> {{The research trend you summarized based on the past work}} </trend>
152
+
153
+ Example:
154
+ <trend>from Paper 0 to Paper 1: ... \nfrom Paper 1 to Paper 2: ... \n </trend>
155
+ """
156
+ return prompt
157
+
158
+
159
+ def get_deep_judge_relevant_prompt(target_paper_title,target_paper_abstract,topic) -> str:
160
+ prompt = f"""
161
+ You are an expert researcher tasked with evaluating whether a given paper is relevant to our research topic.
162
+
163
+ Below are the details of the paper you need to assess:
164
+ Title: {target_paper_title}
165
+ Abstract: {target_paper_abstract}
166
+
167
+ The topic is: {topic}
168
+
169
+ if the paper title and abstract are related to the topic, output <relevant>1</relevant>, otherwise output <relevant>0</relevant>. As long as you feel that this article has reference value for your question, you can use it to help you study the topic, it does not need to be completely consistent in topic.
170
+
171
+ Please output strictly in the following format(no extra content):
172
+ <think>{{your thinking steps}}</think>
173
+ <relevant>{{0/1}}</relevant>
174
+ """
175
+ return prompt
176
+
177
+
178
+ def get_deep_generate_future_direciton_prompt(idea_chains,trend,topic,entities) -> str:
179
+ entities = f"""
180
+ Here are the entities you need to know: {entities}
181
+ """ if use_entities else ""
182
+ prompt = f"""
183
+ You are a scientific research expert tasked with proposing future research directions based on the literature we have reviewed.
184
+
185
+ {entities}
186
+
187
+ The topic you are studying is: {topic}
188
+
189
+ The literature you have studied is as follows:
190
+ {idea_chains}
191
+
192
+ The following section delineates the progressive relationships among the previously summarized research papers:
193
+ <the begin of previous trend>{trend}</the end of previous trend>
194
+
195
+ Based on previous research, analyze how human experts think and transition from previous methods to subsequent approaches. Focus on their reasoning logic and the sources of their thought processes. Learn to emulate their reasoning patterns to further develop and guide your own research direction in a natural and coherent manner.
196
+
197
+ Additionally, you are encouraged to adopt the following three modes of thinking:
198
+ 1. Reflection: Reflect on scenarios where a specific method encounters significant challenges. Consider potential solutions that could effectively address these issues, make the solutions sounds reasonable, novel and amazing.
199
+ 2. Analogy: Identify a specific problem you are currently facing and research existing solutions that have successfully tackled similar challenges. Explore these solutions and adapt key principles and strategies to your situation. Think creatively about how tools and approaches from other domains can be reimagined to devise a novel strategy for your issue. Encourage you to actively explore methods in other fields to solve your current problems.
200
+ 3. Deep Dive: Some methods may present specific approaches to addressing a particular problem. Consider whether there are aspects that could be modified to enhance their rationale and effectiveness.
201
+
202
+ Note:Each article's limitations are specific to that particular piece and should not be applied to others. Carefully consider the task at hand and analyze the potential issues you might encounter if you proceed with your original approach, reflecting on the challenges previously faced. Then, think critically about how to address these issues effectively.
203
+
204
+ You are encouraged to apply human reasoning strategies to identify future research directions based on prior studies. Aim for in-depth analysis rather than mere integration of existing ideas. Please avoid introducing unfamiliar information, ensuring that the trends you present are both authentic and reasonable. Before proposing any trends, take a moment to reflect on the principles underlying the methods you're employing and assess their relevance to your research area.
205
+
206
+ The future research direction should be related to the topic: {topic}.
207
+ Please output strictly in the following format:
208
+ <human>{{The human reasoning way you analyzed based on the previous research}}</human>
209
+ <future>{{the future research direction}}</future>
210
+ """
211
+ return prompt
212
+
213
+
214
+ def get_deep_generate_idea_prompt(idea_chains,trend,topic,entities,future = None,bad_case = []) -> str:
215
+ bad_case_content = ""
216
+ if len(bad_case) > 0:
217
+ bad_case_content = "The following are examples of ideas you have proposed in the past that are similar to real papers. Please avoid this situation as much as possible. You can continue to make in-depth innovations, but avoid plagiarism:\n"
218
+ for i,(paper,summary) in enumerate(bad_case):
219
+ bad_case_content += f"<example>{i}. Your orig idea:{summary} \n Similar paper Title: {paper.title}\n Abstract: {paper.abstract}</example>\n"
220
+
221
+ trend = f"""
222
+ The following section delineates the progressive relationships among the previously summarized research papers:
223
+ <the begin of previous trend>{trend}</the end of previous trend>
224
+ """ if trend else ""
225
+
226
+ future = f"""
227
+ The following section outlines the potential future research directions based on the literature you have studied:
228
+ <the begin of future>{future}</the end of future>
229
+ """ if future else ""
230
+
231
+
232
+ entities = f"""
233
+ Here are the entities you need to know: {entities}
234
+ """ if use_entities else ""
235
+ prompt = f"""
236
+ You are a scientific expert tasked with formulating a novel and innovative research idea based on your comprehensive literature review. Your objective is to propose a feasible approach that could significantly advance the field.
237
+
238
+ {bad_case_content}
239
+
240
+ {entities}
241
+
242
+ The topic you are studying is: {topic}
243
+
244
+ The literature you have studied is as follows:
245
+ {idea_chains}
246
+
247
+ Task: Based on the current literature, propose a research idea that incorporates the following components:
248
+
249
+ Your idea is composed of the following components:
250
+ Motivation:
251
+ 1. Provide a background for your idea, summarizing relevant past work.
252
+ 2. Identify shortcomings in previous research and highlight the specific problems that remain unsolved and that you aim to address.
253
+
254
+ Novelty:
255
+ 1. Distinguish your proposed method from existing methods (preferably by naming specific approaches).
256
+ 2. Detail the improvements your method brings compared to previous work.
257
+ 3. Clearly outline at least three contributions your idea offers to the field, including the problems it resolves and the benefits it delivers.
258
+
259
+ Method:
260
+ 1. Present a detailed description of your idea, focusing on the core method, the specific problem it solves, and enhancements over earlier research (citing relevant literature with titles).
261
+ 2. Explain the step-by-step methodology, including the functions of each module and the rationale for why this approach effectively addresses previous challenges.
262
+
263
+ Please adhere to the following guidelines:
264
+ 1. Your research idea should be innovative, feasible, and contribute meaningfully to the field. Please carefully examine the idea you have proposed, avoid settling on the first thought that comes to mind, and try to differ from previous methods as much as possible.
265
+ 2. Ensure your proposal is solid, clearly defined, and practical to implement. Logic should underpin your reasoning.
266
+ 3. Write in clear, concise language aimed at an audience with limited background knowledge in the subject. Avoid complex technical jargon, but when professional terms are necessary, provide thorough explanations.
267
+ 4. Refrain from introducing concepts from uncertain fields to prevent proposing ideas that may be incorrect or impractical.
268
+ 5. When referencing other research, please include the titles of the cited papers.
269
+ 6. Please avoid introducing unfamiliar information, ensuring that the trends you present are both authentic and reasonable. Before proposing any trends, take a moment to reflect on the principles underlying the methods you're employing and assess their relevance to your research area.
270
+ 7. Each article's limitations are specific to that particular piece and should not be applied to others. Carefully consider the task at hand and analyze the potential issues you might encounter if you proceed with your original approach, reflecting on the challenges previously faced. Then, think critically about how to address these issues effectively.
271
+
272
+ {trend}
273
+
274
+ {future}
275
+
276
+ Please output strictly in the following format:
277
+ <motivation>{{the motivation of your idea}}</motivation>
278
+ <novelty> {{the novelty of your idea}} </novelty>
279
+ <method> {{the method of your idea}} </method>
280
+ """
281
+ return prompt
282
+
283
+
284
+ def get_deep_final_idea_prompt(idea_chains,trend,idea,topic):
285
+ idea = f"""
286
+ Here is your thinking steps:
287
+ {idea}
288
+ """ if idea else ""
289
+ if idea and trend:
290
+ trend = f"""The relationship between each paper are as follows: {trend}"""
291
+ elif trend:
292
+ trend = f"""
293
+ The following section outlines the progressive relationships between the previously summarized research papers:
294
+ <the begin of summarize>{trend}</the end of summarize>
295
+ """
296
+ else:
297
+ trend = ""
298
+
299
+ prompt = f"""
300
+ You are a scientific expert with the primary objective of proposing a research idea based on the literature you have studied. Your goal is to propose a novel, feasible, and innovative research idea that can advance the field.
301
+
302
+ The topic you are studying is: {topic}
303
+
304
+ Here are the literature you have studied:
305
+ {idea_chains}
306
+
307
+ Task: Based on the current literature, propose your final research idea.
308
+
309
+ Please adhere to the following guidelines:
310
+ 1. Your research idea should be innovative, feasible, and contribute meaningfully to the field. Please carefully examine the idea you have proposed, avoid settling on the first thought that comes to mind, and try to differ from previous methods as much as possible.
311
+ 2. Ensure your proposal is solid, clearly defined, and practical to implement. Logic should underpin your reasoning.
312
+ 3. Write in clear, concise language aimed at an audience with limited background knowledge in the subject. Avoid complex technical jargon, but when professional terms are necessary, provide thorough explanations.
313
+ 4. Refrain from introducing concepts from uncertain fields to prevent proposing ideas that may be incorrect or impractical.
314
+ When referencing other research, please include the titles of the cited papers.
315
+
316
+ {trend}
317
+
318
+ {idea}
319
+
320
+ The final idea should contain the title and clearly explain the origins, motivation, and challenges of your idea, detailing how you overcame these hurdles.
321
+ Please output strictly in the following format:
322
+ <final_idea> {{the final idea}} </final_idea>
323
+ """
324
+ return prompt
325
+
326
+
327
+ def get_deep_check_idea_novel_prompt(idea,papers):
328
+ papers_content = ""
329
+ for i,paper in enumerate(papers):
330
+ papers_content += f"Paper {i}: Title:{paper.title}\n Abstract:{paper.abstract}\n"
331
+ prompt = f"""
332
+ You are a scientific research expert tasked with evaluating the similarity between a specified idea and existing research. Your objective is to determine if the target idea closely resembles any findings in the provided papers.
333
+
334
+ The target idea you need to check is as follows:
335
+ {idea}
336
+
337
+ The relevant papers you need to refer to are as follows:
338
+ {papers_content}
339
+
340
+ Here are your guidelines:
341
+ 1. Comparison Process: Begin by thoroughly comparing each paper's ideas with the target idea. Consider the methodologies, conclusions, and underlying concepts in each paper in your analysis.
342
+ 2. Similarity Assessment: If the target idea shares fundamental similarities with any existing research to the extent that they can be considered identical, classify this as plagiarism.
343
+ 3. Output: Your output should provide a clear thought process, the similarity assessment, a summary of the target idea, and the ID of the most relevant similar paper.
344
+
345
+ Please output strictly in the following format:
346
+ <think>{{your thinking steps}}</think>
347
+ <similar>{{0/1}}</similar>
348
+ <summary>{{the summary of the target idea}}</summary>
349
+ <similar_paper_id>{{the id of the similar paper}}</similar_paper_id>
350
+
351
+ For example:
352
+ <think> Here are my thinking steps: ... </think>
353
+ <similar>0</similar>
354
+ <summary> It proposes ... </summary>
355
+ <similar_paper_id>0</similar_paper_id>
356
+ """
357
+ return prompt
358
+
359
+
360
+
361
+ def get_deep_generate_experiment_prompt(idea,experiments,entities) -> str:
362
+ prompt = f"""
363
+ You are a scientific expert tasked with designing rigorous, feasible, and impactful experiments based on specified scientific questions and the methodologies derived from the idea I provide, along with relevant past research. Your goal is to assist researchers in systematically testing hypotheses and validating innovative discoveries that could significantly advance their fields.
364
+
365
+ Past Related Research Experiments: {experiments}
366
+
367
+ Here are the entities you need to know: {entities}.
368
+
369
+ Here is the idea you need to design an experiment for: {idea}.
370
+
371
+ Please propose a detailed experimental plan addressing the following points:
372
+ 1. Experimental Design: Develop rigorous experiments to ensure the reliability and validity of your results. Provide a comprehensive explanation of the baseline used, comparative methods, ablation study design, and criteria for data analysis and result evaluation. Clarify how these components collectively reinforce and validate the conclusions of your research. Structure your experimental design in a clear, logical, and step-by-step manner, ensuring each step is well-defined and easy to understand.
373
+ 2. Implementation of Technologies/Methods: If your experimental design involves specific technologies or methodologies, describe the implementation process in detail, including key technical aspects. For any critical concepts utilized, provide thorough explanations. For instance, if you propose a modular approach, detail its construction, components, and functionality.
374
+ 3. Feasibility Assessment: Ensure your experimental plan is realistic, considering technological availability, timelines, resources, and personnel. Identify potential challenges and propose strategies for addressing them.
375
+ 4. References to Previous Studies: When citing related literature, include titles and pertinent details of the original papers. Strive to use as many references as necessary to support your experimental design.
376
+ 5. Visual Aids: If useful, provide pseudocode or a flowchart to illustrate the implementation process. For example, you can use pseudocode to detail the core algorithm or the model architecture, or employ a flowchart to map out the experimental procedure and data flow.
377
+ 6. Clarity of Language: Use straightforward language to describe your methods, assuming the reader may have limited knowledge of the subject matter. Avoid complex jargon and utilize accessible terminology. If professional terms are necessary, please provide clear and detailed explanations.
378
+
379
+
380
+ Please output strictly in the following format:
381
+ <experiment>{{your experimental plan}}</experiment>
382
+
383
+ For example:
384
+ <experiment> Step1: ... \n Step2: ..., ..., ... </experiment>
385
+ """
386
+ return prompt
387
+
388
+
389
+ def get_deep_refine_experiment_prompt(experiment,suggestions,paper_infos=None,entities = None) -> str:
390
+ infos = f"""
391
+ The literature information you may need to refer to is as follows: {paper_infos}
392
+ """ if paper_infos else ""
393
+
394
+ prompt = f"""
395
+ You are a research expert tasked with refining and improving an experimental plan based on the feedback received.
396
+
397
+ {infos}
398
+
399
+ The experimental plan you proposed is as follows:
400
+ {experiment}
401
+
402
+ Please propose a detailed experimental plan addressing the following points:
403
+ 1. Experimental Design: Develop rigorous experiments to ensure the reliability and validity of your results. Provide a comprehensive explanation of the baseline used, comparative methods, ablation study design, and criteria for data analysis and result evaluation. Clarify how these components collectively reinforce and validate the conclusions of your research. Structure your experimental design in a clear, logical, and step-by-step manner, ensuring each step is well-defined and easy to understand.
404
+ 2. Implementation of Technologies/Methods: If your experimental design involves specific technologies or methodologies, describe the implementation process in detail, including key technical aspects. For any critical concepts utilized, provide thorough explanations. For instance, if you propose a modular approach, detail its construction, components, and functionality.
405
+ 3. Feasibility Assessment: Ensure your experimental plan is realistic, considering technological availability, timelines, resources, and personnel. Identify potential challenges and propose strategies for addressing them.
406
+ 4. References to Previous Studies: When citing related literature, include titles and pertinent details of the original papers. Strive to use as many references as necessary to support your experimental design.
407
+ 5. Visual Aids: If useful, provide pseudocode or a flowchart to illustrate the implementation process. For example, you can use pseudocode to detail the core algorithm or the model architecture, or employ a flowchart to map out the experimental procedure and data flow.
408
+ 6. Clarity of Language: Use straightforward language to describe your methods, assuming the reader may have limited knowledge of the subject matter. Avoid complex jargon and utilize accessible terminology. If professional terms are necessary, please provide clear and detailed explanations.
409
+
410
+ You have received the following suggestions for improvement:
411
+ {suggestions}
412
+
413
+ Please refine your experimental plan based on the feedback provided. Ensure your refined plan is feasible, clearly defined, and addresses the feedback you received.
414
+
415
+ Please output strictly in the following format:
416
+ <experiment>{{your refined experimental plan}}</experiment>
417
+ """
418
+ return prompt
419
+
420
+
421
+
422
+ def get_deep_refine_experiment_search_query_prompt(experiment,suggestions):
423
+ prompt = f"""
424
+ You are a research expert tasked with refining and improving an experimental plan based on the feedback received.
425
+
426
+ The experimental plan you proposed is as follows:
427
+ {experiment}
428
+
429
+ You have received the following suggestions for improvement:
430
+ {suggestions}
431
+
432
+ Please decide whether you need to search for relevant papers to obtain relevant knowledge to improve your experiment.
433
+
434
+ If you need to search for relevant papers, please provide a search query (only a concise phrase) for the literature search; otherwise provide "".
435
+ For example: if the suggestions say that the dynamic querying of additional information and the knowledge-graph update described in the experiment are not clearly explained, you should output <query>dynamic knowledge graph update</query> (only a concise phrase).
436
+
437
+ Please output strictly in the following format:
438
+ <query>{{the search query}}</query>, or <query></query> if no search is needed.
439
+
440
+ For example:
441
+ <query>Reducing reliance on large-scale annotated data and closed-source models for planning in QA tasks</query>
442
+ """
443
+ return prompt
444
+
445
+ def get_deep_paper_info_prompt_for_refine_experiment(paper,experiment,suggestions) -> str:
446
+ prompt = f"""
447
+ You are a scientific research expert.
448
+ Your task is to research the relevant literature to refine your experiment.
449
+
450
+ The literature you need to study is:
451
+ {paper}
452
+
453
+ The experiment designed for the idea is:
454
+ {experiment}
455
+
456
+ You have received the following suggestions for improvement:
457
+ {suggestions}
458
+
459
+ Please extract useful information from the paper that can help you improve your experiment. For example, if the paper describes a method, dataset, or metric that can be used in your experiment, you should extract it.
460
+
461
+ Please output strictly in the following format:
462
+ <info>{{The information you extracted from the paper}}</info>
463
+ """
464
+ return prompt
465
+
prompts/juder_prompts.py ADDED
@@ -0,0 +1,69 @@
1
+
2
+ def get_judge_idea_all_prompt(idea0,idea1,topic):
3
+ prompt = f"""
4
+ You are a judge in a competition. You have to decide which idea is better.
5
+
6
+ The idea0 is: {idea0}
7
+
8
+ The idea1 is: {idea1}
9
+
10
+ The topic is: {topic}
11
+
12
+ Which idea do you think is better? Please write a short paragraph to explain your choice.
13
+
14
+ Here are your evaluation criteria:
15
+ 1. Novelty: Are the problems or approaches new? Is this a novel combination of familiar techniques? Is it clear how this work differs from previous contributions? Is related work adequately referenced?
16
+ 2. Significance: Is the idea important? Are other people (practitioners or researchers) likely to use these ideas or build on them? Does the idea address a difficult problem in a better way than previous research? Does it provide a unique theoretical or pragmatic approach?
17
+ 3. Feasibility: Can the idea be realized with existing technology or methods? Are there any technical difficulties or bottlenecks? Is the idea clear and logical? Is there any obvious error or unreasonable part in the idea, and can the experiment be designed properly according to this idea?
18
+ 4. Clarity: Is the paper clearly written? Is it well-organized? Does it adequately inform the reader?
19
+ 5. Effectiveness: How likely is the proposed idea to work well (e.g., better than existing baselines)?
20
+
21
+ Note:
22
+ Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. DO NOT allow the LENGTH of the responses to influence your evaluation, choose the one that is straight-to-the-point instead of unnecessarily verbose. Be as objective as possible. (very important!!!)
23
+
24
+ If you think idea0 is better than idea1, you should output 0. If you think idea1 is better than idea0, you should output 1. If you think idea0 and idea1 are equally good, you should output 2.
25
+
26
+ Your output should be strictly in the following format:
27
+ Your thinking process:
28
+ ...
29
+
30
+ Your choice:
31
+ <novelty>{{ Your choice for novelty }}</novelty>
32
+ <significance>{{ Your choice for significance }}</significance>
33
+ <feasibility>{{ Your choice for feasibility }}</feasibility>
34
+ <clarity>{{ Your choice for clarity }}</clarity>
35
+ <effectiveness>{{ Your choice for effectiveness }}</effectiveness>
36
+ """
37
+ return prompt
38
+
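For context, a hedged sketch of how the per-criterion judge output above might be collected; the judge_output string is a hypothetical model response, and the collection logic is illustrative rather than the repository's own code:

from utils import extract

judge_output = (
    "Your thinking process:\n...\n\nYour choice:\n"
    "<novelty>0</novelty><significance>1</significance><feasibility>2</feasibility>"
    "<clarity>0</clarity><effectiveness>1</effectiveness>"
)
criteria = ["novelty", "significance", "feasibility", "clarity", "effectiveness"]
# 0 -> idea0 preferred, 1 -> idea1 preferred, 2 -> tie
choices = {c: extract(judge_output, c) for c in criteria}
print(choices)  # e.g. {'novelty': '0', 'significance': '1', ...}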
39
+
40
+ def get_judge_experiment_all_prompt(idea0,experiment0,idea1,experiment1):
41
+ prompt = f"""
42
+ You are a judge in a competition. You have to decide which experiment is better.
43
+ The idea of experiment0 is: {idea0}
44
+ The experiment0 is: {experiment0}
45
+
46
+ The idea of experiment1 is: {idea1}
47
+ The experiment1 is: {experiment1}
48
+
49
+ Which experiment do you think is better? Please write a short paragraph to explain your choice.
50
+
51
+ Here are your evaluation criteria:
52
+ 1. Feasibility: Can the experiment be realized with existing technology or methods? Are there any technical difficulties or bottlenecks? Is the experimental plan detailed and feasible? Are the experimental steps clear and logical? Is there any obvious error or unreasonable part in the experiment? Consider the rationality of its steps and the likelihood that the idea can be successfully implemented.
53
+ 2. Quality: Is there a clear rationale for each step of the experimental design? Are the baseline and evaluation metrics chosen appropriately? Has the design taken into account the potential advantages and limitations of the methods used? Can this experimental design effectively support the claims made in the idea?
54
+ 3. Clarity: Is the experimental plan clearly written? Does it provide enough information for the expert reader to understand the experiment? Is it well organized? Does it adequately inform the reader?
55
+
56
+ Note: Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. DO NOT allow the LENGTH of the responses to influence your evaluation, choose the one that is straight-to-the-point instead of unnecessarily verbose. Be as objective as possible. (very important!!!)
57
+
58
+ If you think experiment0 is better than experiment1, you should output 0. If you think experiment1 is better than experiment0, you should output 1. If you think experiment0 and experiment1 are equally good, you should output 2.
59
+
60
+ Your output should be strictly in the following format:
61
+ Your thinking process:
62
+ ...
63
+
64
+ Your choice:
65
+ <feasibility>{{Your choice for feasibility}}</feasibility>
66
+ <quality>{{Your choice for quality}}</quality>
67
+ <clarity>{{Your choice for clarity}}</clarity>
68
+ """
69
+ return prompt
prompts/review_agent_prompts.py ADDED
@@ -0,0 +1,66 @@
1
+ def get_review_search_related_paper_prompt(idea,topic):
2
+ prompt = f"""
3
+ You are a paper reviewer with expertise in the field.
4
+
5
+ The paper presents the idea: {idea}. Your task is to conduct a thorough literature review in the relevant field to assess the feasibility and originality of this idea, and to determine whether it has already been explored by others.
6
+
7
+ Please provide the literature search queries (no more than 3) you would use to search for papers related to the paper's idea.
8
+ Each query should be a string and should be enclosed in double quotes.
9
+
10
+ Your output should be strictly in the following format:
11
+ <queries> ["query1", "query2", ...] </queries>
12
+
13
+ For example:
14
+ <queries>["Reducing reliance on large-scale annotated data and closed-source models for planning in QA tasks","Automatic agent learning for QA","QA task planning with minimal human intervention"]</queries>
15
+ """
16
+ return prompt
17
+
18
+ def get_review_suggestions_from_papers_prompt(idea,topic,paper):
19
+ prompt = f"""
20
+ You are a manuscript review expert.
21
+ Here is some relevant literature knowledge you have: {paper}.
22
+
23
+ Currently you are assessing a paper on the topic: {topic}.
24
+ The idea presented in the paper is: {idea}.
25
+
26
+ Please analyze the feasibility and novelty of the paper's idea and provide suggestions for improvement, if any. (If there are no suggestions, please do not include any output.) (Please include the title of the paper you are referencing in the suggestions section.)
27
+ You should also pay attention to whether the idea is related to the topic we are studying ({topic}), and analyze whether it can help us solve topic-related problems.
28
+
29
+ There are some suggestions for you to consider:
30
+ 1. Point out any confusion you had while reading the idea and suggest changes.
31
+ 2. Based on relevant knowledge, think about the feasibility of the idea, whether the design of each step is reasonable, whether the statement is clear, and put forward your relevant suggestions.
32
+ 3. Think about how the method can be improved to increase its novelty and feasibility, while trying not to increase the complexity of the method.
33
+
34
+ Your output should be strictly in the following format:
35
+ <suggestions> {{your suggestions to modify the idea}} </suggestions>
36
+
37
+ if you have no suggestions, please provide:
38
+ <suggestions></suggestions>
39
+ """
40
+
41
+ return prompt
42
+
43
+
44
+
45
+ def get_review_experiment_design_suggestions_prompt(idea, experiment,entities):
46
+ prompt = f"""
47
+ You are an expert in paper review. Your task is to analyze whether a given experiment can effectively verify a specific idea, as well as assess the detail and feasibility of the experiment.
48
+
49
+ Here are the relevant entities to consider: {entities}.
50
+
51
+ The idea presented is: {idea}.
52
+
53
+ The corresponding experiment designed for this idea is: {experiment}.
54
+
55
+ Please conduct your analysis based on the following criteria:
56
+ 1. Can the experiment validate the idea? If not, identify the issues and suggest improvements to enhance its verification capability and feasibility.
57
+ 2. Are there specific experimental procedures that are confusing or poorly designed? Discuss any methods that may not be feasible, uncertainties in constructing the dataset, or a lack of explanation regarding the implementation of certain methods.
58
+ 3. Evaluate the clarity, detail, reasonableness, and feasibility of the experimental design.
59
+ 4. Provide suggestions for improving the experiment based on the shortcomings identified in your analysis.
60
+ 5. Focus solely on the experiment design; please refrain from altering the original idea.
61
+ 6. Ensure that your suggestions are constructive, concise, and specific.
62
+
63
+ Please strictly follow the following format for output:
64
+ <suggestion>{{Suggestions for improving the experiment}}</suggestion>
65
+ """
66
+ return prompt
requirements.txt ADDED
@@ -0,0 +1,23 @@
1
+ aiohttp
2
+ beautifulsoup4
3
+ delft
4
+ httpx
5
+ lmdb
6
+ lxml
7
+ numpy
8
+ openai
9
+ pandas
10
+ protobuf
11
+ PyYAML
12
+ Requests
13
+ setuptools
14
+ spacy
15
+ tenacity
16
+ textstat
17
+ tqdm
18
+ fastapi
19
+ uvicorn[standard]
20
+ jinja2
21
+ markdown
22
+ apscheduler
23
+ Pillow
searcher/.DS_Store ADDED
Binary file (6.15 kB).
 
searcher/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ # from .arxiv_reader import Arxiv_Reader
2
+ # from .google_crawl import GoogleCrawler
3
+ from .sementic_search import SementicSearcher,Result
4
+ # from .ResearchAgentSearch import ResearchSearcher
searcher/sementic_search.py ADDED
@@ -0,0 +1,397 @@
1
+ import requests
2
+ import json
3
+ import yaml
4
+ import scipdf
5
+ import os
6
+ import time
7
+ import aiohttp
8
+ import asyncio
9
+ import numpy as np
10
+
11
+
12
+ def get_content_between_a_b(start_tag, end_tag, text):
13
+ extracted_text = ""
14
+ start_index = text.find(start_tag)
15
+ while start_index != -1:
16
+ end_index = text.find(end_tag, start_index + len(start_tag))
17
+ if end_index != -1:
18
+ extracted_text += text[start_index + len(start_tag) : end_index] + " "
19
+ start_index = text.find(start_tag, end_index + len(end_tag))
20
+ else:
21
+ break
22
+ return extracted_text.strip()
23
+
24
+
25
+ def extract(text, type):
26
+ if text:
27
+ target_str = get_content_between_a_b(f"<{type}>", f"</{type}>", text)
28
+ if target_str:
29
+ return target_str
30
+ else:
31
+ return text
32
+ else:
33
+ return ""
34
+
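A quick, hedged illustration of how the two tag-parsing helpers above behave, assuming the functions defined above are in scope; the input strings are made-up examples:

print(get_content_between_a_b("<q>", "</q>", "a <q>one</q> b <q>two</q>"))  # "one two" (occurrences joined by spaces)
print(extract("<info>useful detail</info>", "info"))                        # "useful detail"
print(extract("no tags at all", "info"))                                    # falls back to the full input text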
35
+
36
+ async def fetch(url):
37
+ await asyncio.sleep(1)
38
+ try:
39
+ timeout = aiohttp.ClientTimeout(total=120)
40
+ async with aiohttp.ClientSession(timeout=timeout) as session:
41
+ async with session.get(url) as response:
42
+ if response.status == 200:
43
+ content = await response.read() # Read the response content as bytes
44
+ return content
45
+ else:
46
+ await asyncio.sleep(0.01)
47
+ print(f"Failed to fetch the URL: {url} with status code: {response.status}")
48
+ return None
49
+ except aiohttp.ClientError as e: # catch the more specific client errors first
50
+ await asyncio.sleep(0.01)
51
+ print(f"An error occurred while fetching the URL: {url}")
52
+ print(e)
53
+ return None
54
+ except Exception as e:
55
+ await asyncio.sleep(0.01)
56
+ print(f"An unexpected error occurred while fetching the URL: {url}")
57
+ print(e)
58
+ return None
59
+
60
+ class Result:
61
+ def __init__(self,title="",abstract="",article = "",citations_conut = 0,year = None) -> None:
62
+ self.title = title
63
+ self.abstract = abstract
64
+ self.article = article
65
+ self.citations_conut = citations_conut
66
+ self.year = year
67
+
68
+ # Define the API endpoint URL
69
+
70
+ semantic_fields = ["title", "abstract", "year", "authors.name", "authors.paperCount", "authors.citationCount","authors.hIndex","url","referenceCount","citationCount","influentialCitationCount","isOpenAccess","openAccessPdf","fieldsOfStudy","s2FieldsOfStudy","embedding.specter_v1","embedding.specter_v2","publicationDate","citations"]
71
+
72
+
73
+ fieldsOfStudy = ["Computer Science","Medicine","Chemistry","Biology","Materials Science","Physics","Geology","Art","History","Geography","Sociology","Business","Political Science","Philosophy","Literature","Music","Economics","Mathematics","Engineering","Environmental Science","Agricultural and Food Sciences","Education","Law","Linguistics"]
74
+
75
+ # citations.paperId, citations.title, citations.year, citations.authors.name, citations.authors.paperCount, citations.authors.citationCount, citations.authors.hIndex, citations.url, citations.referenceCount, citations.citationCount, citations.influentialCitationCount, citations.isOpenAccess, citations.openAccessPdf, citations.fieldsOfStudy, citations.s2FieldsOfStudy, citations.publicationDate
76
+
77
+ # publicationDateOrYear: 2019-03-05 ; 2019-03 ; 2019 ; 2016-03-05:2020-06-06 ; 1981-08-25: ; :2020-06-06 ; 1981:2020
78
+
79
+ # publicationTypes: Review ; JournalArticle CaseReport ; ClinicalTrial ; Dataset ; Editorial ; LettersAndComments ; MetaAnalysis ; News ; Study ; Book ; BookSection
80
+
81
+
82
+
83
+ def process_fields(fields):
84
+ return ",".join(fields)
85
+
86
+
87
+ class SementicSearcher:
88
+ def __init__(self, ban_paper = []) -> None:
89
+ self.ban_paper = ban_paper
90
+
91
+ async def search_papers_async(self, query, limit=5, offset=0, fields=["title", "paperId", "abstract", "isOpenAccess", 'openAccessPdf', "year","publicationDate","citations.title","citations.abstract","citations.isOpenAccess","citations.openAccessPdf","citations.citationCount","citationCount","citations.year"],
92
+ publicationDate=None, minCitationCount=0, year=None,
93
+ publicationTypes=None, fieldsOfStudy=None):
94
+ url = 'https://api.semanticscholar.org/graph/v1/paper/search'
95
+ fields = process_fields(fields) if isinstance(fields, list) else fields
96
+
97
+ # More specific query parameter
98
+ query_params = {
99
+ 'query': query,
100
+ "limit": limit,
101
+ "offset": offset,
102
+ 'fields': fields,
103
+ 'publicationDateOrYear': publicationDate,
104
+ 'minCitationCount': minCitationCount,
105
+ 'year': year,
106
+ 'publicationTypes': publicationTypes,
107
+ 'fieldsOfStudy': fieldsOfStudy
108
+ }
109
+ # Load the API key from the configuration file
110
+ api_key = os.environ.get('SEMENTIC_SEARCH_API_KEY',None)
111
+ headers = {'x-api-key': api_key} if api_key else None
112
+ await asyncio.sleep(0.5)
113
+ try:
114
+ filtered_query_params = {key: value for key, value in query_params.items() if value is not None}
115
+ response = requests.get(url, params=filtered_query_params, headers=headers)  # note: a blocking (synchronous) request inside an async method
116
+
117
+ if response.status_code == 200:
118
+ response_data = response.json()
119
+ return response_data
120
+ elif response.status_code == 429:
121
+ time.sleep(1)
122
+ print(f"Request failed with status code {response.status_code}: begin to retry")
123
+ return await self.search_papers_async(query, limit, offset, fields, publicationDate, minCitationCount, year, publicationTypes, fieldsOfStudy)
124
+ else:
125
+ print(f"Request failed with status code {response.status_code}: {response.text}")
126
+ return None
127
+ except requests.RequestException as e:
128
+ print(f"An error occurred: {e}")
129
+ return None
130
+
131
+ def cal_cosine_similarity(self, vec1, vec2):
132
+ return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
133
+
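As a sanity check, a tiny worked example of the cosine-similarity formula used above (standalone NumPy, independent of the class):

import numpy as np

v1 = np.array([1.0, 0.0])
v2 = np.array([1.0, 1.0])
# dot = 1, ||v1|| = 1, ||v2|| = sqrt(2)  =>  1 / sqrt(2) ~ 0.7071
print(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))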
134
+ def read_arxiv_from_path(self, pdf_path):
135
+ article_dict = scipdf.parse_pdf_to_dict(pdf_path)
136
+ return article_dict
137
+
138
+ async def get_paper_embbeding_and_score_async(self,query_embedding, paper,llm):
139
+ paper_content = f"""
140
+ Title: {paper['title']}
141
+ Abstract: {paper['abstract']}
142
+ """
143
+ paper_embbeding = await llm.get_embbeding_async(paper_content)
144
+ paper_embbeding = np.array(paper_embbeding)
145
+ score = self.cal_cosine_similarity(query_embedding,paper_embbeding)
146
+ return [paper,score]
147
+
148
+
149
+ async def rerank_papers_async(self, query_embedding, paper_list,llm):
150
+ if len(paper_list) >= 50:
151
+ paper_list = paper_list[:50]
152
+ results = await asyncio.gather(*[self.get_paper_embbeding_and_score_async(query_embedding, paper,llm) for paper in paper_list if paper])
153
+ reranked_papers = sorted(results,key = lambda x: x[1],reverse = True)
154
+ return reranked_papers
155
+
156
+ async def get_embbeding_and_score_async(self,query_embedding, text,llm):
157
+ text_embbeding = await llm.get_embbeding_async(text)
158
+ text_embbeding = np.array(text_embbeding)
159
+ score = self.cal_cosine_similarity(query_embedding,text_embbeding)
160
+ return score
161
+
162
+ async def get_embbeding_and_score_from_texts_async(self,query_embedding, texts,llm):
163
+ results = await asyncio.gather(*[self.get_embbeding_and_score_async(query_embedding, text,llm) for text in texts])
164
+ return results
165
+
166
+ async def get_paper_details_async(self, paper_id, fields = ["title", "abstract", "year","citationCount","isOpenAccess","openAccessPdf"]):
167
+ url = f'https://api.semanticscholar.org/graph/v1/paper/{paper_id}'
168
+ fields = process_fields(fields)
169
+ paper_data_query_params = {'fields': fields}
170
+ try:
171
+ async with aiohttp.ClientSession() as session:
172
+ filtered_query_params = {key: value for key, value in paper_data_query_params.items() if value is not None}
173
+ headers = {'x-api-key': os.environ.get('SEMENTIC_SEARCH_API_KEY',None)}
174
+ async with session.get(url, params=filtered_query_params, headers=headers) as response:
175
+ if response.status == 200:
176
+ response_data = await response.json()
177
+ return response_data
178
+ else:
179
+ await asyncio.sleep(0.01)
180
+ print(f"Request failed with status code {response.status}: {await response.text()}")
181
+ return None
182
+ except Exception as e:
183
+ print(f"Failed to get paper details for paper ID: {paper_id}")
184
+ return None
185
+
186
+ async def batch_retrieve_papers_async(self, paper_ids, fields = semantic_fields):
187
+ url = 'https://api.semanticscholar.org/graph/v1/paper/batch'
188
+ paper_data_query_params = {'fields': process_fields(fields)}
189
+ paper_ids_json = {"ids": paper_ids}
190
+ try:
191
+ async with aiohttp.ClientSession() as session:
192
+ filtered_query_params = {key: value for key, value in paper_data_query_params.items() if value is not None}
193
+ headers = {'x-api-key': os.environ.get('SEMENTIC_SEARCH_API_KEY',None)}
194
+ async with session.post(url, json=paper_ids_json, params=filtered_query_params, headers=headers) as response:
195
+ if response.status == 200:
196
+ response_data = await response.json()
197
+ return response_data
198
+ else:
199
+ await asyncio.sleep(0.01)
200
+ print(f"Request failed with status code {response.status}: {await response.text()}")
201
+ return None
202
+ except Exception as e:
203
+ print(f"Failed to batch retrieve papers for paper IDs: {paper_ids}")
204
+ return None
205
+
206
+ async def search_paper_from_title_async(self, query,fields = ["title","paperId"]):
207
+ url = 'https://api.semanticscholar.org/graph/v1/paper/search/match'
208
+ fields = process_fields(fields)
209
+ query_params = {'query': query, 'fields': fields}
210
+ try:
211
+ async with aiohttp.ClientSession() as session:
212
+ filtered_query_params = {key: value for key, value in query_params.items() if value is not None}
213
+ headers = {'x-api-key': os.environ.get('SEMENTIC_SEARCH_API_KEY',None)}
214
+ async with session.get(url, params=filtered_query_params, headers=headers) as response:
215
+ if response.status == 200:
216
+ response_data = await response.json()
217
+ return response_data
218
+ else:
219
+ await asyncio.sleep(0.01)
220
+ print(f"Request failed with status code {response.status}: {await response.text()}")
221
+ return None
222
+ except Exception as e:
223
+ await asyncio.sleep(0.01)
224
+ print(f"Failed to search paper from title: {query}")
225
+ return None
226
+
227
+
228
+ async def search_async(self,query,max_results = 5 ,paper_list = None ,rerank_query = None,llm = None,year = None,publicationDate = None,need_download = True,fields = ["title", "paperId", "abstract", "isOpenAccess", 'openAccessPdf', "year","publicationDate","citationCount"]):
229
+ if rerank_query:
230
+ rerank_query_embbeding = llm.get_embbeding(rerank_query)
231
+ rerank_query_embbeding = np.array(rerank_query_embbeding)
232
+
233
+ readed_papers = []
234
+ if paper_list:
235
+ if isinstance(paper_list,set):
236
+ paper_list = list(paper_list)
237
+ if len(paper_list) == 0 :
238
+ pass
239
+ elif isinstance(paper_list[0], str):
240
+ readed_papers = paper_list
241
+ elif isinstance(paper_list[0], Result):
242
+ readed_papers = [paper.title for paper in paper_list]
243
+
244
+ print(f"Searching for papers related to the query: <{query}>")
245
+ results = await self.search_papers_async(query,limit = 10 * max_results,year=year,publicationDate = publicationDate,fields = fields)
246
+ if not results or "data" not in results:
247
+ return []
248
+
249
+ new_results = []
250
+ for result in results['data']:
251
+ if result['title'] in self.ban_paper:
252
+ continue
253
+ new_results.append(result)
254
+ results = new_results
255
+
256
+ final_results = []
257
+ if need_download:
258
+ paper_candidates = []
259
+ for result in results:
260
+ if not result['isOpenAccess'] or not result['openAccessPdf'] or result['title'] in readed_papers:
261
+ continue
262
+ else:
263
+ paper_candidates.append(result)
264
+ else:
265
+ paper_candidates = results
266
+
267
+ if llm and rerank_query:
268
+ paper_candidates = await self.rerank_papers_async(rerank_query_embbeding, paper_candidates,llm)
269
+ paper_candidates = [paper[0] for paper in paper_candidates if paper]
270
+
271
+ if need_download:
272
+ for result in paper_candidates:
273
+ pdf_link = result['openAccessPdf']["url"]
274
+ try:
275
+ content = await self.download_pdf_async(pdf_link)
276
+ if not content:
277
+ continue
278
+ except Exception as e:
279
+ continue
280
+ title = result['title']
281
+ abstract = result['abstract']
282
+ citationCount = result['citationCount']
283
+ year = result['year']
284
+ article = scipdf.parse_pdf_to_dict(content)
285
+ if not article:
286
+ continue
287
+ final_results.append(Result(title,abstract,article,citationCount,year))
288
+ if len(final_results) >= max_results:
289
+ break
290
+ else:
291
+ for result in paper_candidates:
292
+ title = result['title']
293
+ abstract = result['abstract']
294
+ citationCount = result['citationCount']
295
+ year = result['year']
296
+ final_results.append(Result(title,abstract,None,citationCount,year))
297
+ if len(final_results) >= max_results:
298
+ break
299
+ return final_results
300
+
301
+ async def search_related_paper_async(self,title,need_citation = True,need_reference = True,rerank_query = None,llm = None,paper_list = []):
302
+ print(f"Searching for the related papers of <{title}>")
303
+ fileds = ["title","abstract","citations.title","citations.abstract","citations.citationCount","references.title","references.abstract","references.citationCount","citations.isOpenAccess","citations.openAccessPdf","references.isOpenAccess","references.openAccessPdf","citations.year","references.year"]
304
+ results = await self.search_papers_async(title,limit = 3,fields=fileds)
305
+ related_papers = []
306
+ related_papers_title = []
307
+ if not results or "data" not in results:
308
+ return None
309
+ for result in results["data"]:
310
+ if not result:
311
+ continue
312
+ if need_citation:
313
+ for citation in result["citations"]:
314
+ if "openAccessPdf" not in citation or not citation["openAccessPdf"]:
315
+ continue
316
+ elif citation["title"] in related_papers_title or citation["title"] in self.ban_paper or citation["title"] in paper_list:
317
+ continue
318
+ elif citation["isOpenAccess"] == False or citation["openAccessPdf"] == None:
319
+ continue
320
+ else:
321
+ related_papers.append(citation)
322
+ related_papers_title.append(citation["title"])
323
+ if need_reference:
324
+ for reference in result["references"]:
325
+ if "openAccessPdf" not in reference or not reference["openAccessPdf"]:
326
+ continue
327
+ elif reference["title"] in related_papers_title or reference["title"] in self.ban_paper or reference["title"] in paper_list:
328
+ continue
329
+ elif reference["isOpenAccess"] == False or reference["openAccessPdf"] == None:
330
+ continue
331
+ else:
332
+ related_papers.append(reference)
333
+ related_papers_title.append(reference["title"])
334
+ if result:
335
+ break
336
+ if len(related_papers) >= 200:
337
+ related_papers = related_papers[:200]
338
+
339
+ if rerank_query and llm:
340
+ rerank_query_embbeding = llm.get_embbeding(rerank_query)
341
+ rerank_query_embbeding = np.array(rerank_query_embbeding)
342
+ related_papers = await self.rerank_papers_async(rerank_query_embbeding, related_papers,llm)
343
+ related_papers = [paper[0] for paper in related_papers]
344
+ related_papers = [[paper["title"],paper["abstract"],paper["openAccessPdf"]["url"],paper["citationCount"],paper['year']] for paper in related_papers]
345
+ else:
346
+ related_papers = [[paper["title"],paper["abstract"],paper["openAccessPdf"]["url"],paper["citationCount"],paper['year']] for paper in related_papers]
347
+ related_papers = sorted(related_papers,key = lambda x: x[3],reverse = True)
348
+ print(f"Found {len(related_papers)} related papers")
349
+ for paper in related_papers:
350
+ url = paper[2]
351
+ content = await self.download_pdf_async(url)
352
+ if content:
353
+ article = scipdf.parse_pdf_to_dict(content)
354
+ if not article:
355
+ continue
356
+ result = Result(paper[0],paper[1],article,paper[3],paper[4])
357
+ return result
358
+ return None
359
+
360
+
361
+ async def download_pdf_async(self, pdf_link):
362
+ content = await fetch(pdf_link)
363
+ if not content:
364
+ return None
365
+ else:
366
+ return content
367
+
368
+ def read_paper_title_abstract(self,article):
369
+ title = article["title"]
370
+ abstract = article["abstract"]
371
+ paper_content = f"""
372
+ Title: {title}
373
+ Abstract: {abstract}
374
+ """
375
+ return paper_content
376
+
377
+ def read_paper_content(self,article):
378
+ paper_content = self.read_paper_title_abstract(article)
379
+ for section in article["sections"]:
380
+ paper_content += f"section: {section['heading']}\n content: {section['text']}\n ref_ids: {section['publication_ref']}\n"
381
+ return paper_content
382
+
383
+ def read_paper_content_with_ref(self,article):
384
+ paper_content = self.read_paper_content(article)
385
+ paper_content += "<References>\n"
386
+ i = 1
387
+ for refer in article["references"]:
388
+ ref_id = refer["ref_id"]
389
+ title = refer["title"]
390
+ year = refer["year"]
391
+ paper_content += f"Ref_id:{ref_id} Title: {title} Year: ({year})\n"
392
+ i += 1
393
+ paper_content += "</References>\n"
394
+ return paper_content
395
+
396
+
397
+
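A minimal, hedged usage sketch for the searcher above, doing a metadata-only search so that no PDF download or LLM reranking is needed; the query string is illustrative, and SEMENTIC_SEARCH_API_KEY may need to be set in the environment:

import asyncio
from searcher.sementic_search import SementicSearcher

async def main():
    searcher = SementicSearcher()
    # need_download=False returns Result objects with title/abstract only (no parsed PDF)
    results = await searcher.search_async("large language model research agents",
                                          max_results=3, need_download=False)
    for r in results:
        print(r.year, r.title)

asyncio.run(main())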
start.sh ADDED
@@ -0,0 +1,9 @@
1
+ #!/bin/sh
2
+
3
+ # Start the gradlew (grobid) service and run it in the background
4
+ cd grobid
5
+ ./gradlew run --console=plain &
6
+
7
+ cd ..
8
+ # Start the uvicorn service
9
+ uvicorn app:app --host 0.0.0.0 --port 7860
style.css ADDED
@@ -0,0 +1,4 @@
1
+ /* styles.css */
2
+ .same-height {
3
+ height: 100%;
4
+ }
supervisord.conf ADDED
@@ -0,0 +1,22 @@
1
+ [supervisord]
2
+ nodaemon=true
3
+ pidfile=/dev/null
4
+ logfile=/dev/null
5
+ logfile_maxbytes=0
6
+ logfile_backups=0
7
+ loglevel=info
8
+
9
+ [program:gradle_service]
10
+ command=/app/grobid/gradlew run
11
+ environment=JAVA_HOME="/opt/jdk-11.0.2",PATH="/opt/jdk-11.0.2/bin:/home/user/.local/bin:/usr/local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
12
+ directory=/app
13
+ autostart=true
14
+ autorestart=true
15
+ startsecs=10
16
+ stdout_logfile=/dev/stdout
17
+ stderr_logfile=/dev/stderr
18
+
19
+ [program:uvicorn_service]
20
+ command=uvicorn app:app --host "0.0.0.0" --port 7860
21
+ directory=/app
22
+
utils.py ADDED
@@ -0,0 +1,36 @@
1
+ def get_content_between_a_b(start_tag, end_tag, text):
2
+ extracted_text = ""
3
+ start_index = text.find(start_tag)
4
+ while start_index != -1:
5
+ end_index = text.find(end_tag, start_index + len(start_tag))
6
+ if end_index != -1:
7
+ extracted_text += text[start_index + len(start_tag) : end_index] + " "
8
+ start_index = text.find(start_tag, end_index + len(end_tag))
9
+ else:
10
+ break
11
+
12
+ return extracted_text.strip()
13
+
14
+
15
+ def extract(text, type,hard = True):
16
+ if text:
17
+ target_str = get_content_between_a_b(f"<{type}>", f"</{type}>", text)
18
+ if target_str:
19
+ return target_str
20
+ elif hard:
21
+ return text
22
+ else:
23
+ return ""
24
+ else:
25
+ return ""
26
+
27
+
28
+ def extract_json(text):
29
+ if "```json" in text:
30
+ target_str = get_content_between_a_b("```json", "```", text)
31
+ return target_str
32
+ else:
33
+ return text
34
+
35
+
36
+
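Finally, a small hedged example of extract_json handling a fenced JSON reply; the raw_reply string is hypothetical, and the helpers above are assumed importable from utils:

import json
from utils import extract_json

raw_reply = 'Here are the queries:\n```json\n{"queries": ["query one", "query two"]}\n```'
print(json.loads(extract_json(raw_reply)))  # {'queries': ['query one', 'query two']}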