jianghuyihei committed on
Commit 863d8a3
1 Parent(s): ef12410

first commit

.gitattributes copy ADDED
@@ -0,0 +1,35 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,84 @@
+ FROM python:3.9
+ 
+ # Create a user and switch to that user
+ RUN useradd -m -u 1000 user
+ 
+ # Set environment variables
+ ENV PATH="/home/user/.local/bin:$PATH"
+ 
+ # /home/user/.local/bin:/usr/local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
+ RUN echo $PATH
+ 
+ # Set environment variable to avoid interactive prompts during installation
+ ENV DEBIAN_FRONTEND=noninteractive
+ 
+ # Set the working directory
+ WORKDIR /app
+ USER root
+ 
+ # # Install sudo
+ RUN apt-get update && apt-get install -y sudo supervisor
+ 
+ # Clone and install scipdf_parser
+ RUN git clone -q https://github.com/titipata/scipdf_parser.git
+ RUN pip install -q git+https://github.com/titipata/scipdf_parser
+ 
+ # Enter the scipdf_parser directory and install the spaCy model
+ WORKDIR /app/scipdf_parser
+ RUN python -m spacy download en_core_web_sm
+ 
+ # Return to the working directory
+ WORKDIR /app
+ 
+ # Copy requirements.txt and install Python dependencies
+ COPY --chown=user ./requirements.txt requirements.txt
+ RUN pip install -q python-multipart
+ RUN pip install -q --upgrade pip
+ RUN pip install -q --upgrade -r requirements.txt
+ 
+ # Download and extract OpenJDK 11
+ RUN wget -q https://download.oracle.com/java/GA/jdk11/9/GPL/openjdk-11.0.2_linux-x64_bin.tar.gz \
+     && tar -zxvf openjdk-11.0.2_linux-x64_bin.tar.gz > /dev/null \
+     && mv jdk-11.0.2 /opt/ \
+     && rm openjdk-11.0.2_linux-x64_bin.tar.gz
+ 
+ # Set Java environment variables
+ ENV JAVA_HOME=/opt/jdk-11.0.2
+ ENV PATH=$JAVA_HOME/bin:$PATH
+ 
+ # Verify the Java installation
+ RUN java -version
+ 
+ # RUN sudo apt-get install -y maven
+ RUN wget -q https://github.com/kermitt2/grobid/archive/0.7.3.zip
+ RUN unzip 0.7.3.zip > /dev/null
+ 
+ # Install dependencies
+ RUN git clone -q https://github.com/kermitt2/grobid.git
+ 
+ 
+ WORKDIR /app/grobid
+ RUN chmod +x gradlew
+ 
+ RUN ./gradlew clean install --console=plain
+ 
+ WORKDIR /app
+ # Expose ports
+ EXPOSE 8070
+ EXPOSE 7860
+ COPY --chown=user ./start.sh start.sh
+ 
+ RUN chmod +x start.sh
+ RUN chmod -R 777 grobid
+ 
+ # Copy the application code
+ COPY --chown=user . /app
+ CMD ["./start.sh"]
+ 
+ # CMD ["supervisord", "-c", "supervisord.conf"]
+ 
+ # # Set the default command
+ # CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
+ 
+ # "uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"
+ # uvicorn app:app --host "0.0.0.0" --port 7860
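Note: the image exposes two services, GROBID on port 8070 and the FastAPI demo on port 7860, both launched by start.sh (not shown in this hunk). The following smoke test is a sketch, not part of the commit; it assumes GROBID's standard /api/isalive liveness endpoint and uses httpx, which the project already imports elsewhere.

# Hypothetical smoke test for a running container; adjust host/ports if the deployment differs.
import httpx

def check_services(host: str = "localhost") -> None:
    endpoints = {
        "grobid": f"http://{host}:8070/api/isalive",  # GROBID liveness endpoint (assumed default)
        "app": f"http://{host}:7860/",                # FastAPI form page
    }
    for name, url in endpoints.items():
        try:
            r = httpx.get(url, timeout=10)
            print(f"{name}: HTTP {r.status_code}")
        except httpx.HTTPError as e:
            print(f"{name}: unreachable ({e})")

if __name__ == "__main__":
    check_services()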
LLM.py ADDED
@@ -0,0 +1,186 @@
+ from openai import AzureOpenAI, OpenAI, AsyncAzureOpenAI, AsyncOpenAI
+ 
+ from abc import abstractmethod
+ import os
+ import httpx
+ import base64
+ import logging
+ import asyncio
+ import numpy as np
+ from tenacity import (
+     retry,
+     stop_after_attempt,
+     wait_fixed,
+ )
+ 
+ 
+ def get_content_between_a_b(start_tag, end_tag, text):
+     extracted_text = ""
+     start_index = text.find(start_tag)
+     while start_index != -1:
+         end_index = text.find(end_tag, start_index + len(start_tag))
+         if end_index != -1:
+             extracted_text += text[start_index + len(start_tag) : end_index] + " "
+             start_index = text.find(start_tag, end_index + len(end_tag))
+         else:
+             break
+ 
+     return extracted_text.strip()
+ 
+ def before_retry_fn(retry_state):
+     if retry_state.attempt_number > 1:
+         logging.info(f"Retrying API call. Attempt #{retry_state.attempt_number}, {retry_state}")
+ 
+ def encode_image(image_path):
+     with open(image_path, "rb") as image_file:
+         return base64.b64encode(image_file.read()).decode('utf-8')
+ 
+ def get_openai_url(img_pth):
+     end = img_pth.split(".")[-1]
+     if end == "jpg":
+         end = "jpeg"
+     base64_image = encode_image(img_pth)
+     return f"data:image/{end};base64,{base64_image}"
+ 
+ class base_llm:
+     def __init__(self) -> None:
+         pass
+ 
+     @abstractmethod
+     def response(self, messages, **kwargs):
+         pass
+ 
+     def get_imgs(self, prompt, save_path="saves/dalle3.jpg"):
+         pass
+ 
+ 
+ class openai_llm(base_llm):
+     def __init__(self, model="gpt4o-0513") -> None:
+         super().__init__()
+         self.model = model
+         if "AZURE_OPENAI_ENDPOINT" not in os.environ or os.environ["AZURE_OPENAI_ENDPOINT"] == "":
+             raise ValueError("AZURE_OPENAI_ENDPOINT is not set")
+         if "AZURE_OPENAI_KEY" not in os.environ or os.environ["AZURE_OPENAI_KEY"] == "":
+             raise ValueError("AZURE_OPENAI_KEY is not set")
+ 
+         api_version = os.environ.get("AZURE_OPENAI_API_VERSION", None)
+         if api_version == "":
+             api_version = None
+         self.client = AzureOpenAI(
+             azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
+             api_key=os.environ["AZURE_OPENAI_KEY"],
+             api_version=api_version
+         )
+         self.async_client = AsyncAzureOpenAI(
+             azure_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"],
+             api_key=os.environ["AZURE_OPENAI_KEY"],
+             api_version=api_version
+         )
+ 
+     def cal_cosine_similarity(self, vec1, vec2):
+         if isinstance(vec1, list):
+             vec1 = np.array(vec1)
+         if isinstance(vec2, list):
+             vec2 = np.array(vec2)
+         return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
+ 
+     @retry(wait=wait_fixed(10), stop=stop_after_attempt(10), before=before_retry_fn)
+     def response(self, messages, **kwargs):
+         try:
+             response = self.client.chat.completions.create(
+                 model=kwargs.get("model", self.model),
+                 messages=messages,
+                 n=kwargs.get("n", 1),
+                 temperature=kwargs.get("temperature", 0.7),
+                 max_tokens=kwargs.get("max_tokens", 4000),
+                 timeout=kwargs.get("timeout", 180)
+             )
+         except Exception as e:
+             model = kwargs.get("model", self.model)
+             print(f"get {model} response failed: {e}")
+             logging.info(e)
+             return
+         return response.choices[0].message.content
+ 
+     @retry(wait=wait_fixed(10), stop=stop_after_attempt(10), before=before_retry_fn)
+     def get_embbeding(self, text):
+         if os.environ.get("EMBEDDING_API_ENDPOINT"):
+             client = AzureOpenAI(
+                 azure_endpoint=os.environ.get("EMBEDDING_API_ENDPOINT", None),
+                 api_key=os.environ.get("EMBEDDING_API_KEY", None),
+                 api_version=os.environ.get("AZURE_OPENAI_API_VERSION", None),
+                 azure_deployment="embedding-3-large"
+             )
+         else:
+             client = self.client
+         try:
+             embbeding = client.embeddings.create(
+                 model=os.environ.get("EMBEDDING_MODEL", "text-embedding-3-large"),
+                 input=text,
+                 timeout=180
+             )
+             return embbeding.data[0].embedding
+         except Exception as e:
+             print(f"get embbeding failed: {e}")
+             logging.info(e)
+             return
+ 
+     async def get_embbeding_async(self, text):
+         if os.environ.get("EMBEDDING_API_ENDPOINT", None):
+             async_client = AsyncAzureOpenAI(
+                 azure_endpoint=os.environ.get("EMBEDDING_API_ENDPOINT", None),
+                 api_key=os.environ.get("EMBEDDING_API_KEY", None),
+                 api_version=os.environ.get("AZURE_OPENAI_API_VERSION", None),
+                 azure_deployment="embedding-3-large"
+             )
+         else:
+             async_client = self.async_client
+ 
+         try:
+             embbeding = await async_client.embeddings.create(
+                 model=os.environ.get("EMBEDDING_MODEL", "text-embedding-3-large"),
+                 input=text,
+                 timeout=180
+             )
+             return embbeding.data[0].embedding
+         except Exception as e:
+             await asyncio.sleep(0.1)
+             print(f"get embbeding failed: {e}")
+             logging.info(e)
+             return
+ 
+     @retry(wait=wait_fixed(10), stop=stop_after_attempt(10), before=before_retry_fn)
+     async def response_async(self, messages, **kwargs):
+         try:
+             response = await self.async_client.chat.completions.create(
+                 model=kwargs.get("model", self.model),
+                 messages=messages,
+                 n=kwargs.get("n", 1),
+                 temperature=kwargs.get("temperature", 0.7),
+                 max_tokens=kwargs.get("max_tokens", 4000),
+                 timeout=kwargs.get("timeout", 180)
+             )
+         except Exception as e:
+             await asyncio.sleep(0.1)
+             model = kwargs.get("model", self.model)
+             print(f"get {model} response failed: {e}")
+             logging.info(e)
+             return
+ 
+         return response.choices[0].message.content
+ 
+ 
+ if __name__ == "__main__":
+     llm = openai_llm()
+     prompt = """
+     """
+     messages = [{"role": "user", "content": prompt}]
+     response = asyncio.run(llm.response_async(messages))
+     print(response)
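For context, a minimal usage sketch of openai_llm (an assumption, not part of the commit): it requires the AZURE_OPENAI_ENDPOINT and AZURE_OPENAI_KEY environment variables to be set, and the deployment name mirrors the default above.

# Hypothetical usage of the wrapper defined in LLM.py.
import asyncio
from LLM import openai_llm

llm = openai_llm(model="gpt4o-0513")
messages = [{"role": "user", "content": "Summarize the Chain-of-Ideas agent in one sentence."}]

# Synchronous call with a lower temperature and shorter output
print(llm.response(messages, temperature=0.2, max_tokens=200))

# Asynchronous call with the default settings
print(asyncio.run(llm.response_async(messages)))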
README.md CHANGED
@@ -1,11 +1,13 @@
  ---
  title: CoI Agent
- emoji: 🌖
- colorFrom: purple
- colorTo: yellow
+ emoji: 🐢
+ colorFrom: indigo
+ colorTo: indigo
  sdk: docker
  pinned: false
  license: apache-2.0
+ app_port: 7860
+ startup_duration_timeout: 1h
  short_description: 'Online demo of paper: Chain of Ideas: Revolutionizing Resear'
  ---
 
agents.py ADDED
@@ -0,0 +1,358 @@
+ import json
+ import time
+ import asyncio
+ import os
+ from searcher import Result, SementicSearcher
+ from LLM import openai_llm
+ from prompts import *
+ from utils import extract
+ 
+ 
+ def get_llm(model="gpt4o-0513"):
+     return openai_llm(model)
+ 
+ def get_llms():
+     main_llm = get_llm("gpt4o-0513")
+     cheap_llm = get_llm("gpt-4o-mini")
+     return main_llm, cheap_llm
+ 
+ async def judge_idea(i, j, idea0, idea1, topic, llm):
+     prompt = get_judge_idea_all_prompt(idea0, idea1, topic)
+     messages = [{"role": "user", "content": prompt}]
+     response = await llm.response_async(messages)
+     novelty = extract(response, "novelty")
+     relevance = extract(response, "relevance")
+     significance = extract(response, "significance")
+     clarity = extract(response, "clarity")
+     feasibility = extract(response, "feasibility")
+     effectiveness = extract(response, "effectiveness")
+     return i, j, novelty, relevance, significance, clarity, feasibility, effectiveness
+ 
+ class DeepResearchAgent:
+     def __init__(self, llm=None, cheap_llm=None, publicationData=None, ban_paper=[], **kwargs) -> None:
+         self.reader = SementicSearcher(ban_paper=ban_paper)
+         self.begin_time = time.time()
+         self.llm = llm
+         self.cheap_llm = cheap_llm
+         self.read_papers = set()
+         self.paper_storage = []
+         self.paper_info_for_refine_experiment = []
+         self.search_qeuries = []
+         self.deep_research_chains = []
+         self.deep_ideas = []
+         self.check_novel_results = []
+         self.score_results = []
+         self.topic = None
+ 
+         self.publicationData = publicationData
+         self.improve_cnt = kwargs.get("improve_cnt", 1)
+         self.max_chain_length = kwargs.get("max_chain_length", 5)
+         self.min_chain_length = kwargs.get("min_chain_length", 3)
+         self.max_chain_numbers = kwargs.get("max_chain_numbers", 10)
+ 
+     def wrap_messages(self, prompt):
+         return [{"role": "user", "content": prompt}]
+ 
+     async def get_openai_response_async(self, messages):
+         return await self.llm.response_async(messages)
+ 
+     async def get_cheap_openai_response_async(self, messages):
+         return await self.cheap_llm.response_async(messages, max_tokens=16000)
+ 
+     async def get_search_query(self, topic=None, query=None):
+         prompt = get_deep_search_query_prompt(topic, query)
+         messages = self.wrap_messages(prompt)
+         response = await self.get_openai_response_async(messages)
+         search_query = extract(response, "queries")
+         try:
+             search_query = json.loads(search_query)
+             self.search_qeuries.append({"query": query, "search_query": search_query})
+         except:
+             search_query = [query]
+         return search_query
+ 
+     async def generate_idea_with_chain(self, topic):
+         self.topic = topic
+         print(f"begin to generate search query for {topic}")
+         search_query = await self.get_search_query(topic=topic)
+         papers = []
+         for query in search_query:
+             failed_query = []
+             current_papers = []
+             cnt = 0
+             while len(current_papers) == 0 and cnt < 10:
+                 paper = await self.reader.search_async(query, 1, paper_list=self.read_papers, llm=self.llm, rerank_query=f"{topic}", publicationDate=self.publicationData)
+                 if paper and len(paper) > 0 and paper[0]:
+                     self.read_papers.add(paper[0].title)
+                     current_papers.append(paper[0])
+                 else:
+                     failed_query.append(query)
+                     prompt = get_deep_rewrite_query_prompt(failed_query, topic)
+                     messages = self.wrap_messages(prompt)
+                     new_query = await self.get_openai_response_async(messages)
+                     new_query = extract(new_query, "query")
+                     print(f"Failed to search papers for {query}, regenerating query {new_query} to search papers.")
+                     query = new_query
+                 cnt += 1
+             papers.extend(current_papers)
+             if len(papers) >= self.max_chain_numbers:
+                 break
+ 
+         if len(papers) == 0:
+             print(f"failed to generate idea {topic}")
+             return None, None, None, None, None, None, None, None, None
+ 
+         tasks = [self.deep_research_paper_with_chain(paper) for paper in papers]
+         results = await asyncio.gather(*tasks)
+         results = [result for result in results if result]
+         if len(results) == 0:
+             print(f"failed to generate idea {topic}")
+             return None, None, None, None, None, None, None, None, None
+ 
+         ideas, idea_chains, experiments, entities, trends, futures, humans, years = [[result[i] for result in results] for i in range(8)]
+ 
+         tasks = []
+         for i, idea_1 in enumerate(ideas):
+             for j, idea_2 in enumerate(ideas):
+                 if i != j:
+                     tasks.append(judge_idea(i, j, idea_1, idea_2, topic, self.llm))
+         results = await asyncio.gather(*tasks)
+         elo_scores = [0 for _ in range(len(ideas))]
+         elo_selected = 0
+         def change_winner_to_score(winner, score_1, score_2):
+             try:
+                 winner = int(winner)
+             except:
+                 return score_1 + 0.5, score_2 + 0.5
+             if winner == 0:
+                 return score_1 + 1, score_2
+             if winner == 2:
+                 return score_1 + 0.5, score_2 + 0.5
+             return score_1, score_2 + 1
+         for result in results:
+             i, j, novelty, relevance, significance, clarity, feasibility, effectiveness = result
+             for dimension in [novelty, relevance, significance, clarity, feasibility, effectiveness]:
+                 elo_scores[i], elo_scores[j] = change_winner_to_score(dimension, elo_scores[i], elo_scores[j])
+             print(f"i:{i},j:{j},novelty:{novelty},relevance:{relevance},significance:{significance},clarity:{clarity},feasibility:{feasibility},effectiveness:{effectiveness}")
+         print(elo_scores)
+         try:
+             elo_selected = elo_scores.index(max(elo_scores))
+         except:
+             elo_selected = 0
+ 
+         idea, experiment, entities, idea_chain, trend, future, human, year = ideas[elo_selected], experiments[elo_selected], entities[elo_selected], idea_chains[elo_selected], trends[elo_selected], futures[elo_selected], humans[elo_selected], years[elo_selected]
+         print(f"successfully generated idea")
+         return idea, experiment, entities, idea_chain, ideas, trend, future, human, year
+ 
+     async def get_paper_idea_experiment_references_info(self, paper):
+         article = paper.article
+         if not article:
+             return None
+         paper_content = self.reader.read_paper_content(article)
+         prompt = get_deep_reference_prompt(paper_content, self.topic)
+         messages = self.wrap_messages(prompt)
+         response = await self.get_cheap_openai_response_async(messages)
+         entities = extract(response, "entities")
+         idea = extract(response, "idea")
+         experiment = extract(response, "experiment")
+         references = extract(response, "references")
+         return idea, experiment, entities, references, paper.title
+ 
+     async def get_article_idea_experiment_references_info(self, article):
+         paper_content = self.reader.read_paper_content_with_ref(article)
+         prompt = get_deep_reference_prompt(paper_content, self.topic)
+         messages = self.wrap_messages(prompt)
+         response = await self.get_cheap_openai_response_async(messages)
+         entities = extract(response, "entities")
+         idea = extract(response, "idea")
+         experiment = extract(response, "experiment")
+         references = extract(response, "references")
+         return idea, experiment, entities, references
+ 
+ 
+     async def deep_research_paper_with_chain(self, paper: Result):
+         print(f"begin to deep research paper {paper.title}")
+         article = paper.article
+         if not article:
+             print(f"failed to deep research paper {paper.title}")
+             return None
+         idea_chain = []
+         idea_papers = []
+         experiments = []
+         total_entities = []
+         years = []
+         idea, experiment, entities, references = await self.get_article_idea_experiment_references_info(article)
+         try:
+             references = json.loads(references)
+         except:
+             references = []
+         total_entities.append(entities)
+         idea_chain.append(idea)
+         idea_papers.append(paper.title)
+         experiments.append(experiment)
+         years.append(paper.year)
+ 
+         current_title = paper.title
+         current_abstract = paper.abstract
+ 
+         # search before
+         while len(idea_chain) < self.max_chain_length:
+             rerank_query = f"{self.topic} {current_title} {current_abstract}"
+             citation_paper = await self.reader.search_related_paper_async(current_title, need_reference=False, rerank_query=rerank_query, llm=self.llm, paper_list=idea_papers)
+             if not citation_paper:
+                 print(f"failed to find citation paper for {current_title}")
+                 break
+             title = citation_paper.title
+             abstract = citation_paper.abstract
+             prompt = get_deep_judge_relevant_prompt(current_title, current_abstract, self.topic)
+             messages = self.wrap_messages(prompt)
+             response = await self.get_openai_response_async(messages)
+             relevant = extract(response, "relevant")
+             if relevant != "0":
+                 result = await self.get_paper_idea_experiment_references_info(citation_paper)
+                 if not result:
+                     break
+                 idea, experiment, entities, _, _ = result
+                 idea_chain.append(idea)
+                 experiments.append(experiment)
+                 total_entities.append(entities)
+                 idea_papers.append(citation_paper.title)
+                 years.append(citation_paper.year)
+                 current_title = citation_paper.title
+                 current_abstract = citation_paper.abstract
+             else:
+                 print(f"the paper {title} is not relevant")
+                 break
+ 
+         current_title = paper.title
+         current_abstract = paper.abstract
+         # search after
+         while len(idea_chain) < self.max_chain_length and len(references) > 0:
+             search_paper = []
+             article = None
+             print(f"The references find:{references}")
+             while len(references) > 0 and len(search_paper) == 0:
+                 reference = references[0]
+                 references.pop(0)
+                 if reference in self.read_papers:
+                     continue
+                 search_paper = await self.reader.search_async(reference, 3, llm=self.llm, publicationDate=self.publicationData, paper_list=idea_papers)
+                 if len(search_paper) > 0:
+                     s_p = search_paper[0]
+                     if s_p and s_p.title not in self.read_papers:
+                         prompt = get_deep_judge_relevant_prompt(current_title, current_abstract, self.topic)
+                         messages = self.wrap_messages(prompt)
+                         response = await self.get_openai_response_async(messages)
+                         relevant = extract(response, "relevant")
+                         if relevant != "0" or len(idea_chain) < self.min_chain_length:
+                             article = s_p.article
+                             if article:
+                                 cite_paper = s_p
+                                 break
+                         else:
+                             print(f"the paper {s_p.title} is not relevant")
+                     search_paper = []
+ 
+             if not article:
+                 rerank_query = f"topic: {self.topic} Title: {current_title} Abstract: {current_abstract}"
+                 search_paper = await self.reader.search_related_paper_async(current_title, need_citation=False, rerank_query=rerank_query, llm=self.llm, paper_list=idea_papers)
+                 if not search_paper:
+                     print(f"failed to find citation paper for {current_title}")
+                     continue
+                 s_p = search_paper
+                 if len(idea_chain) < self.min_chain_length:
+                     article = s_p.article
+                     if not article:
+                         continue
+                     else:
+                         cite_paper = s_p
+                         break
+                 else:
+                     if s_p and s_p.title not in self.read_papers:
+                         prompt = get_deep_judge_relevant_prompt(current_title, current_abstract, self.topic)
+                         messages = self.wrap_messages(prompt)
+                         response = await self.get_openai_response_async(messages)
+                         relevant = extract(response, "relevant")
+                         if relevant == "1" or len(idea_chain) < self.min_chain_length:
+                             article = s_p.article
+                             if not article:
+                                 continue
+                             else:
+                                 cite_paper = s_p
+                                 break
+             if not article:
+                 print(f"failed to find citation paper for {current_title}")
+                 continue
+ 
+             print("find the citation paper, begin to deep research")
+             paper_content = self.reader.read_paper_content_with_ref(article)
+             prompt = get_deep_reference_prompt(paper_content, self.topic)
+             messages = self.wrap_messages(prompt)
+             response = await self.get_cheap_openai_response_async(messages)
+             idea = extract(response, "idea")
+             references = extract(response, "references")
+             experiment = extract(response, "experiment")
+             entities = extract(response, "entities")
+             try:
+                 references = json.loads(references)
+             except:
+                 references = []
+             current_title = cite_paper.title
+             current_abstract = cite_paper.abstract
+             years = [cite_paper.year] + years
+             idea_chain = [idea] + idea_chain
+             idea_papers = [cite_paper.title] + idea_papers
+             experiments = [experiment] + experiments
+             total_entities = [entities] + total_entities
+             if len(idea_chain) >= self.min_chain_length:
+                 if cite_paper.citations_conut > 1000:
+                     break
+ 
+         print("successfully generate idea chain")
+         idea_chains = ""
+         for i, idea, title in zip(range(len(idea_chain)), idea_chain, idea_papers):
+             idea_chains += f"{i}.Paper:{title} idea:{idea}\n \n"
+ 
+         prompt = get_deep_trend_idea_chains_prompt(idea_chains, entities, self.topic)
+         messages = self.wrap_messages(prompt)
+         response = await self.get_openai_response_async(messages)
+         trend = extract(response, "trend")
+ 
+         self.deep_research_chains.append({"idea_chains": idea_chains, "trend": trend, "topic": self.topic, "ideas": idea_chain, "experiments": experiments, "entities": total_entities, "years": years})
+         prompt = f"""The current research topic is: {self.topic}. Please help me summarize and refine the following entities by merging, simplifying, or deleting them : {total_entities}
+         Please output strictly in the following format:
+         <entities> {{cleaned entities}}</entities>
+         """
+         messages = self.wrap_messages(prompt)
+         response = await self.get_openai_response_async(messages)
+         total_entities = extract(response, "entities")
+         bad_case = []
+         prompt = get_deep_generate_future_direciton_prompt(idea_chain, trend, self.topic, total_entities)
+         messages = self.wrap_messages(prompt)
+         response = await self.get_openai_response_async(messages)
+         future = extract(response, "future")
+         human = extract(response, "human")
+ 
+ 
+         prompt = get_deep_generate_idea_prompt(idea_chains, trend, self.topic, total_entities, future, bad_case)
+         messages = self.wrap_messages(prompt)
+         response = await self.get_openai_response_async(messages)
+         method = extract(response, "method")
+         novelty = extract(response, "novelty")
+         motivation = extract(response, "motivation")
+         idea = {"motivation": motivation, "novelty": novelty, "method": method}
+         prompt = get_deep_final_idea_prompt(idea_chains, trend, idea, self.topic)
+         messages = self.wrap_messages(prompt)
+         response = await self.get_openai_response_async(messages)
+         final_idea = extract(response, "final_idea")
+ 
+         idea = final_idea
+         self.deep_ideas.append(idea)
+         print(f"successfully deep research paper {paper.title}")
+         return idea, idea_chains, trend, experiments, total_entities, future, human, years
+ 
+ 
+ if __name__ == "__main__":
+     reader = SementicSearcher()
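A minimal driver sketch for DeepResearchAgent follows; it is an assumption, not part of the commit, and presumes the repo's searcher, prompts, and utils modules are importable and the Azure OpenAI environment variables are configured. It mirrors how app.py invokes the agent.

# Hypothetical driver; the first element of the returned tuple is the final idea text.
import asyncio
from agents import DeepResearchAgent, get_llms

main_llm, cheap_llm = get_llms()
agent = DeepResearchAgent(llm=main_llm, cheap_llm=cheap_llm, max_chain_numbers=1)

result = asyncio.run(agent.generate_idea_with_chain("Using diffusion to generate road layout"))
idea = result[0]
print(idea)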
app.py ADDED
@@ -0,0 +1,345 @@
+ from fastapi import FastAPI, Form
+ from fastapi.responses import HTMLResponse
+ from jinja2 import Template
+ import markdown
+ import time
+ from datetime import datetime, timedelta
+ from apscheduler.schedulers.background import BackgroundScheduler
+ import asyncio
+ from agents import DeepResearchAgent, get_llms
+ 
+ app = FastAPI()
+ 
+ # Maximum number of replies per day
+ MAX_REPLIES_PER_DAY = 100
+ # Counter for today's replies
+ reply_count = 0
+ # Record the reset time at startup
+ last_reset_time = datetime.now()
+ 
+ # HTML template
+ html_template = """
+ <!DOCTYPE html>
+ <html>
+ <head>
+     <title>CoI Agent online demo 😊</title>
+     <style>
+         body {
+             font-family: 'Arial', sans-serif;
+             background-color: #f4f4f9;
+             margin: 0;
+             padding: 0;
+             display: flex;
+             justify-content: center;
+             align-items: center;
+             min-height: 100vh;
+         }
+         .container {
+             width: 95%;
+             max-width: 1200px;
+             background-color: #fff;
+             padding: 2rem;
+             border-radius: 10px;
+             box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
+         }
+         h1 {
+             font-size: 2rem;
+             margin-bottom: 1.5rem;
+             color: #333;
+             text-align: center;
+         }
+         form {
+             margin-bottom: 1.5rem;
+         }
+         .form-group {
+             display: flex;
+             justify-content: space-between;
+             align-items: center;
+             margin-bottom: 1.5rem;
+         }
+         .form-group label {
+             flex: 0;
+             font-size: 1rem; /* larger font */
+             color: #333;
+             margin-right: 0.5rem;
+             background-color: #f0f8ff; /* bubble background color */
+             padding: 0.5rem 1rem; /* bubble padding */
+             border-radius: 10px; /* rounded bubble corners */
+             text-shadow: 1px 1px 2px rgba(0, 0, 0, 0.1); /* decorative text effect */
+             font-family: 'Times new roman', cursive, sans-serif; /* decorative font */
+             box-shadow: 0 0 5px rgba(0, 0, 0, 0.1); /* bubble shadow */
+         }
+         .form-group input {
+             flex: 4;
+             padding: 0.6rem;
+             font-size: 1rem;
+             border: 1px solid #ccc;
+             border-radius: 5px;
+             margin-left: 1rem;
+         }
+         .form-group button {
+             flex: 0;
+             padding: 0.6rem 1rem;
+             font-size: 1rem;
+             background-color: #F2A582;
+             color: #fff;
+             border: none;
+             border-radius: 5px;
+             cursor: pointer;
+             transition: background-color 0.3s ease;
+             margin-left: 1rem;
+         }
+         .form-group button:hover {
+             background-color: #0056b3;
+         }
+         .loading,
+         .time-box,
+         .counter-box,
+         .result,
+         .error {
+             margin-top: 1.5rem;
+         }
+         .loading {
+             font-size: 1.2rem;
+             color: #007bff;
+             animation: fadeIn 0.5s ease-in-out;
+             text-align: center;
+         }
+         .time-counter-container {
+             display: flex;
+             justify-content: space-between;
+         }
+         .time-box,
+         .counter-box {
+             display: inline-block;
+             padding: 0.5rem 1rem;
+             background-color: #e9ecef;
+             border-radius: 10px;
+             box-shadow: 0 0 5px rgba(0, 0, 0, 0.1);
+             font-size: 0.9rem;
+             margin: 0.5rem;
+             flex: 1;
+             text-align: center;
+         }
+         .result {
+             display: flex;
+             justify-content: space-between;
+             flex-wrap: wrap;
+         }
+         .result .box {
+             flex: 1;
+             margin: 0.5rem;
+             padding: 1rem;
+             background-color: #e9ecef;
+             border-radius: 10px;
+             box-shadow: 0 0 5px rgba(0, 0, 0, 0.1);
+             word-wrap: break-word;
+             height: 400px;
+             overflow-y: auto;
+             font-size: 1rem;
+             font-family: "Times New Roman", Times, serif;
+             line-height: 1.5;
+         }
+         .error .box {
+             width: 100%;
+             padding: 1rem;
+             background-color: #f8d7da;
+             color: #721c24;
+             border-radius: 10px;
+             box-shadow: 0 0 5px rgba(0, 0, 0, 0.1);
+             word-wrap: break-word;
+         }
+         h2 {
+             font-size: 1.3rem;
+             margin-bottom: 1rem;
+             color: #333;
+         }
+         @keyframes fadeIn {
+             from { opacity: 0; }
+             to { opacity: 1; }
+         }
+         .progress-bar-container {
+             width: 100%;
+             background-color: #e9ecef;
+             border-radius: 10px;
+             overflow: hidden;
+             margin-top: 1.5rem;
+             box-shadow: 0 0 5px rgba(0, 0, 0, 0.1);
+         }
+         .progress-bar {
+             height: 20px;
+             background-color: #727372;
+             width: 0%;
+             transition: width 0.1s ease;
+         }
+         .example-container {
+             display: flex;
+             justify-content: space-between;
+             align-items: center;
+             margin-bottom: 1.5rem;
+         }
+         .example-label {
+             flex: 0.7;
+             font-size: 1rem;
+             color: #333;
+             text-align: center;
+             margin-right: 0rem;
+             padding: 0.5rem 0.2rem;
+             background-color: #f0f8ff;
+             border-radius: 10px;
+             box-shadow: 0 0 5px rgba(0, 0, 0, 0.1);
+             font-family: 'Times new roman', cursive, sans-serif;
+             text-shadow: 1px 1px 2px rgba(0, 0, 0, 0.1);
+             box-shadow: 0 0 5px rgba(0, 0, 0, 0.1);
+         }
+         .example-topics {
+             flex: 6;
+             display: flex;
+             justify-content: space-around;
+         }
+         .example-topics button {
+             padding: 0.5rem 1rem;
+             font-size: 1rem;
+             background-color: #ffa07a; /* light orange */
+             color: #fff;
+             border: none;
+             border-radius: 5px;
+             cursor: pointer;
+             margin: 0.3rem;
+             transition: background-color 0.3s ease;
+         }
+         .example-topics button:hover {
+             background-color: #ff4500; /* dark orange */
+         }
+     </style>
+     <script>
+         let startTime = 0;
+         let intervalId = null;
+         let progressIntervalId = null;
+         let maxTime = 180; // maximum time: 180 seconds
+         function showLoading() {
+             document.getElementById("loading").style.display = "block";
+             document.getElementById("submit-btn").disabled = true;
+             startTime = Date.now();
+             intervalId = setInterval(updateTime, 100);
+             progressIntervalId = setInterval(updateProgressBar, 100);
+         }
+         function hideLoading() {
+             document.getElementById("loading").style.display = "none";
+             document.getElementById("submit-btn").disabled = false;
+             if (intervalId) {
+                 clearInterval(intervalId);
+                 intervalId = null;
+             }
+             if (progressIntervalId) {
+                 clearInterval(progressIntervalId);
+                 progressIntervalId = null;
+             }
+             updateProgressBar(100); // immediately set the progress bar to 100%
+         }
+         function updateTime() {
+             const now = Date.now();
+             const elapsed = ((now - startTime) / 1000).toFixed(2);
+             document.getElementById("time-taken").innerText = `Time Taken: ${elapsed} s`;
+         }
+         function updateProgressBar(percentage = null) {
+             const progressBar = document.getElementById("progress-bar");
+             if (percentage !== null) {
+                 progressBar.style.width = `${percentage}%`;
+             } else {
+                 const now = Date.now();
+                 const elapsed = (now - startTime) / 1000;
+                 const progress = Math.min((elapsed / maxTime) * 60, 97);
+                 progressBar.style.width = `${progress}%`;
+             }
+         }
+         function fillTopic(topic) {
+             document.getElementById("topic").value = topic;
+         }
+     </script>
+ </head>
+ <body>
+     <div class="container">
+         <h1>CoI Agent online demo 😊</h1>
+         <div class="time-counter-container">
+             <div id="time-taken" class="time-box">Time Taken: {{ time_taken }} seconds</div>
+             <div class="counter-box">Today's Replies: {{ reply_count }}</div>
+         </div>
+         <div class="example-container">
+             <div class="example-label">Example Input:</div>
+             <div class="example-topics">
+                 <button onclick="fillTopic('Realistic Image Synthesis in Medical Imaging')">Realistic Image Synthesis in Medical Imaging</button>
+                 <button onclick="fillTopic('Using diffusion to generate road layout')">Using diffusion to generate road layout</button>
+                 <button onclick="fillTopic('Using LLM-based agent to generate idea')">Using LLM-based agent to generate idea</button>
+             </div>
+         </div>
+         <form action="/" method="post" onsubmit="showLoading()">
+             <div class="form-group">
+                 <label for="topic">Topic:</label>
+                 <input type="text" id="topic" name="topic">
+                 <button type="submit" id="submit-btn">Generate</button>
+             </div>
+         </form>
+         <div id="loading" class="loading">Generating content, usually takes 3-4 minutes, please wait...</div>
+         <div class="progress-bar-container">
+             <div id="progress-bar" class="progress-bar"></div>
+         </div>
+         <div class="result">
+             <div class="box">
+                 <h2>Idea</h2>
+                 <div>{{ idea | safe }}</div>
+             </div>
+         </div>
+         {% if error %}
+         <div class="error">
+             <div class="box">
+                 <h2>Error</h2>
+                 <div>{{ error }}</div>
+             </div>
+         </div>
+         {% endif %}
+     </div>
+     <script>
+         hideLoading();
+     </script>
+ </body>
+ </html>
+ """
+ 
+ # Reset the daily counter
+ def reset_counter():
+     global reply_count
+     reply_count = 0
+ 
+ # Schedule the counter to reset at 00:00 every day
+ scheduler = BackgroundScheduler()
+ scheduler.add_job(reset_counter, 'cron', hour=0, minute=0)
+ scheduler.start()
+ 
+ @app.get("/", response_class=HTMLResponse)
+ def form_get():
+     return Template(html_template).render(idea="This is an example of the idea generation", error=None, reply_count=reply_count)
+ 
+ @app.post("/", response_class=HTMLResponse)
+ def form_post(topic: str = Form(...)):
+     global reply_count
+     start_time = time.time()
+     # Check whether the daily reply limit has been reached
+     if reply_count >= MAX_REPLIES_PER_DAY:
+         error_message = "Today's maximum number of replies has been reached. Please try again tomorrow."
+         return Template(html_template).render(idea="", error=error_message, reply_count=reply_count)
+     try:
+         main_llm, cheap_llm = get_llms()
+         deep_research_agent = DeepResearchAgent(llm=main_llm, cheap_llm=cheap_llm, improve_cnt=1, max_chain_length=5, min_chain_length=3, max_chain_numbers=1)
+         print(f"begin to generate idea of topic {topic}")
+         idea, related_experiments, entities, idea_chain, ideas, trend, future, human, year = asyncio.run(deep_research_agent.generate_idea_with_chain(topic))
+         idea_md = markdown.markdown(idea)
+         # Update the daily reply count
+         reply_count += 1
+         end_time = time.time()
+         time_taken = round(end_time - start_time, 2)
+         return Template(html_template).render(idea=idea_md, error=None, reply_count=reply_count, time_taken=time_taken)
+     except Exception as e:
+         end_time = time.time()
+         time_taken = round(end_time - start_time, 2)
+         return Template(html_template).render(idea="", error=str(e), reply_count=reply_count, time_taken=time_taken)
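For local debugging outside Docker, a minimal way to serve this app is sketched below; this is an assumption rather than part of the commit, since the Space itself starts everything through start.sh.

# Hypothetical local entry point for the FastAPI app defined in app.py.
import uvicorn
from app import app

if __name__ == "__main__":
    # Port 7860 matches the EXPOSE/app_port settings in this commit.
    uvicorn.run(app, host="0.0.0.0", port=7860)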
main.py ADDED
@@ -0,0 +1,38 @@
+ from agents import DeepResearchAgent, ReviewAgent, get_llms
+ import asyncio
+ import json
+ import argparse
+ 
+ if __name__ == '__main__':
+ 
+     argparser = argparse.ArgumentParser()
+     argparser.add_argument("--topic", type=str, help="research topic", default="Using diffusion to generate urban road layout map")
+     argparser.add_argument("--anchor_paper_path", type=str, help="PDF path of the anchor paper", default=None)
+     argparser.add_argument("--save_file", type=str, default="saves/", help="save file path")
+     argparser.add_argument("--improve_cnt", type=int, default=1, help="experiment refine count")
+     argparser.add_argument("--max_chain_length", type=int, default=5, help="max chain length")
+     argparser.add_argument("--min_chain_length", type=int, default=3, help="min chain length")
+     argparser.add_argument("--max_chain_numbers", type=int, default=1, help="max chain numbers")
+ 
+     args = argparser.parse_args()
+ 
+     main_llm, cheap_llm = get_llms()
+ 
+     topic = args.topic
+     anchor_paper_path = args.anchor_paper_path
+ 
+ 
+     review_agent = ReviewAgent(save_file=args.save_file, llm=main_llm, cheap_llm=cheap_llm)
+     deep_research_agent = DeepResearchAgent(llm=main_llm, cheap_llm=cheap_llm, **vars(args))
+ 
+     print(f"begin to generate idea and experiment of topic {topic}")
+     idea, related_experiments, entities, idea_chain, ideas, trend, future, human, year = asyncio.run(deep_research_agent.generate_idea_with_chain(topic, anchor_paper_path))
+     experiment = asyncio.run(deep_research_agent.generate_experiment(idea, related_experiments, entities))
+ 
+     for i in range(args.improve_cnt):
+         experiment = asyncio.run(deep_research_agent.improve_experiment(review_agent, idea, experiment, entities))
+ 
+     print(f"succeed to generate idea and experiment of topic {topic}")
+     res = {"idea": idea, "experiment": experiment, "related_experiments": related_experiments, "entities": entities, "idea_chain": idea_chain, "ideas": ideas, "trend": trend, "future": future, "year": year, "human": human}
+     with open("result.json", "w") as f:
+         json.dump(res, f)
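After a run, main.py writes its outputs to result.json. A small sketch of inspecting that file follows; the field names are taken from the res dict above, and the rest is an assumption.

# Reads the result.json produced by main.py and prints two of its fields.
import json

with open("result.json") as f:
    res = json.load(f)

print(res["idea"])   # final generated idea
print(res["trend"])  # summarized research trend behind the idea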
prompts/__init__.py ADDED
@@ -0,0 +1,4 @@
+ # from .idea_refiner_prompts import *
+ from .juder_prompts import *
+ from .review_agent_prompts import *
+ from .deep_research_agent_promts import *
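The prompt builders in the next file all instruct the model to wrap each field in XML-style tags, which agents.py parses with `extract` from `utils`. That module is not part of this commit; below is a minimal sketch of such a helper, assuming it behaves like `get_content_between_a_b` in LLM.py.

# Hypothetical stand-in for utils.extract; the real implementation is not included in this commit.
def extract(text: str, tag: str) -> str:
    """Return the content between <tag> and </tag>, or an empty string if absent."""
    start_tag, end_tag = f"<{tag}>", f"</{tag}>"
    start = text.find(start_tag)
    if start == -1:
        return ""
    end = text.find(end_tag, start + len(start_tag))
    if end == -1:
        return ""
    return text[start + len(start_tag):end].strip()

# Example: extract("<relevant>1</relevant>", "relevant") returns "1"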
prompts/deep_research_agent_promts.py ADDED
@@ -0,0 +1,465 @@
1
+ use_entities = True
2
+
3
+ def get_deep_search_query_prompt(topic = None,query = None) -> str:
4
+ if topic and query:
5
+ prompt = f"""
6
+ You are a master of literature searcher, tasked with finding relevant research literatures based on a specific topic and idea.
7
+
8
+ Currently, we would like to study the following topic: {topic}.
9
+ And we have the following idea: {query}.
10
+
11
+ Please provide the literature search queries you would use to search for papers related to the topic and idea.
12
+ """
13
+ elif topic:
14
+ prompt = f"""
15
+ You are a master of literature searcher, tasked with finding relevant research literatures based on a specific topic.
16
+
17
+ Currently, we would like to study the following topic: {topic}.
18
+
19
+ Please provide the literature search queries you would use to search for papers related to the topic.
20
+ """
21
+ elif query:
22
+ prompt = f"""
23
+ You are a master of literature searcher, tasked with finding relevant research literatures based on a specific idea.
24
+
25
+ Currently, we would like to search for papers related to the following idea: {query}.
26
+
27
+ Please provide the literature search querie syou would use to search for papers related to the paper idea.
28
+ """
29
+ output_format = """
30
+ Each query should be a string and should be enclosed in double quotes.It is best to output one query representing the whole and other queries representing other different aspects of the whole.(no more than 5 queries)
31
+
32
+ Output strictly in the following format:
33
+ <queries>["query1", "query2", ...]</queries>
34
+
35
+ For example:
36
+ <queries>["Reducing reliance on large-scale annotated data and closed-source models for planning in QA tasks","Automatic agent learning for QA","QA task planning with minimal human intervention", "Few-shot learning for QA"]</queries>
37
+ """
38
+ return prompt + output_format
39
+
40
+ def get_deep_check_idea_novel_search_query_prompt(idea,topic: str) -> str:
41
+ prompt = f"""
42
+ You are a scientific research expert.
43
+ Your task is to check whether the target idea is similar to existing research.
44
+
45
+ The target idea you need to check is as follows:{idea}
46
+
47
+ The topic you are studying is: {topic}
48
+
49
+ Please provide multiple search queries to find relevant papers that can help you determine whether the idea is novel(no more than 3 queries).
50
+
51
+ Output strictly in the following format:
52
+ <queries>["query1", "query2", "query3"]</queries>
53
+
54
+ For example:
55
+ <queries>["Reducing reliance on large-scale annotated data and closed-source models for planning in QA tasks","Automatic agent learning for QA","QA task planning with minimal human intervention"]</queries>
56
+ """
57
+ return prompt
58
+
59
+
60
+
61
+
62
+ def get_deep_rewrite_query_prompt(failed_query,topic):
63
+ prompt = f"""
64
+ You are a master of search engine query writing. We want to utilize the literature search engine to find relevant paper.
65
+
66
+ The queries that have been used so far are as follows: {failed_query}. Unfortunately, no satisfactory answers were found. Please rewrite a query to help us locate the literature we need (do not repeat the failed query).
67
+
68
+ The topic you are studying is: {topic}.
69
+ Please provide a new search query to find the relevant papers.
70
+
71
+ Try to make your query more concise and general so that it can be used to search for a wide range of papers.
72
+ If you failed more than 5 times, you can use a short query(no more than 5 words) to search for the paper.
73
+ Please output strictly in the following format:
74
+ <query>{{new query}}</query>
75
+
76
+ For example:
77
+ <query>Reducing reliance on large-scale annotated data and closed-source models for planning in QA tasks</query>
78
+ """
79
+ return prompt
80
+
81
+
82
+ def get_deep_reference_prompt(paper_content: str,topic) -> str:
83
+ prompt = f"""
84
+ You are a scientific research expert, tasked with extracting and summarizing information from provided paper content relevant to the topic: {topic}. Your deliverables will include pertinent references, extracted entities, a detailed summary, and the experimental design.
85
+
86
+ The topic you are studying is: {topic}. (Ensure that the references are pertinent to this topic.)
87
+
88
+ Extraction Requirements:
89
+ Entities
90
+ 1. Identify unique entities mentioned in the paper, such as model names, datasets, metrics, and specialized terminology.
91
+ 2. Format the entities with a name followed by a brief description.
92
+ 3. Ensure all entities are relevant to the specified topic ([topic]).
93
+
94
+
95
+ Summary Idea:
96
+ 1. Background: Elaborate on the task's context and previous work, outlining the starting point of this paper.
97
+ 2. Novelty: Describe the main innovations and contributions of this paper in comparison to prior work.
98
+ 3. Contribution: Explain the primary methods used, detailing the theory and functions of each core component.
99
+ 4. Detail Reason: Provide a thorough explanation of why the chosen methods are effective, including implementation details for further research.
100
+ 5. Limitation: Discuss current shortcomings of the approach.
101
+
102
+ Experimental Content:
103
+ 1. Experimental Process: Detail the entire experimental procedure, from dataset construction to specific steps, ensuring clarity and thoroughness.
104
+ 2. Technical Details: Describe any specific technologies involved, providing detailed implementation processes.
105
+ 3. Clarity of Plan: State your experimental plan concisely to facilitate understanding without unnecessary complexity.
106
+ 4. Baseline: Elaborate on the baseline used, comparative methods, and experimental design, illustrating how these support and validate the conclusions drawn.
107
+ 5. Verification: Explain how your experimental design assists in verifying the core idea and ensure it is detailed and feasible.
108
+
109
+ Relevance Criteria:
110
+ 1. Method Relevance: References must directly correlate with the paper's methodology, indicating improvements or modifications.
111
+ 2. Task Relevance: References should address the same task, even if methods differ, better have the same topic {topic}.
112
+ 3. Baseline Relevance: References should serve as baselines for the methods discussed in the paper.
113
+ 4. Output Format: Provide references without author names or publication years, formatted as titles only.
114
+ 5. Specific paper titles will be placed between <References></References>. Based on the precise citation location and the corresponding ref_id in the paper, you need to infer the specific title of your output relevant references.
115
+
116
+
117
+ The paper content is as follows:
118
+ {paper_content}
119
+
120
+
121
+ Please provide the entities, summary idea, experimental design, and the three most relevant references (Sort by relevance, with priority given to new ones with the same level of relevance, do not reference the original paper.) based on the paper's content.
122
+ Note: Ensure the references are pertinent to the topic you are studying: {topic}. If there are no relevant references, output <references>[]</references>.
123
+
124
+ Now please output strictly in the following format:
125
+ <entities>{{A list of entities you extract}}</entities>
126
+ <idea>{{Background: ... \nNovelty: ...\nContribution:...\nMethods:...\nDetail reason:...\nLimitation:...\n }}</idea>
127
+ <experiment>{{Step1:... Step2:...}}</experiment>
128
+ <references>["{{Title1}}", "{{Title2}}", ...]</references>
129
+ """
130
+ return prompt
131
+
132
+
133
+ def get_deep_trend_idea_chains_prompt(idea_chains,entities,topic) -> str:
134
+ entities = f"""
135
+ Here are the entities you need to know: {entities}
136
+ """ if use_entities else ""
137
+ prompt = f"""
138
+ You are a scientific research expert tasked with summarizing the historical progression of research related to our current topic, based on the literature we have reviewed.
139
+
140
+ {entities}
141
+
142
+ The topic you are studying is: {topic}
143
+
144
+ The literature from early to late: {idea_chains}
145
+
146
+ Your objective is to outline the historical evolution of the research in light of current trends. Please follow these requirements:
147
+ Analysis of Published Viewpoints: Examine the progression of ideas across the identified papers. Detail how each paper transitions to the next—for instance, how Paper 0 leads to Paper 1, and so forth. Focus on understanding how Paper 1 builds upon the concepts in Paper 0. Elaborate on specific advancements made, including proposed modules, their designs, and the rationale behind their effectiveness in addressing previous challenges. Apply this analytical approach to each paper in the sequence.
148
+
149
+
150
+ Please present your findings in the following format:
151
+ <trend> {{The research trend you summarized based on the past work}} </trend>
152
+
153
+ Example:
154
+ <trend>from Paper 0 to Paper 1: ... \nfrom Paper 1 to Paper 2: ... \n </trend>
155
+ """
156
+ return prompt
157
+
158
+
159
+ def get_deep_judge_relevant_prompt(target_paper_title,target_paper_abstract,topic) -> str:
160
+ prompt = f"""
161
+ You are an expert researcher tasked with evaluating whether a given paper is relevant to our research topic.
162
+
163
+ Below are the details of the paper you need to assess:
164
+ Title: {target_paper_title}
165
+ Abstract: {target_paper_abstract}
166
+
167
+ The topic is: {topic}
168
+
169
+ if the paper title and abstract are related to the topic, output <relevant>1</relevant>, otherwise output <relevant>0</relevant>. As long as you feel that this article has reference value for your question, you can use it to help you study the topic, it does not need to be completely consistent in topic.
170
+
171
+ Please output strictly in the following format(no extra content):
172
+ <think>{{your thinking steps}}</think>
173
+ <relevant>{{0/1}}</relevant>
174
+ """
175
+ return prompt
176
+
177
+
178
+ def get_deep_generate_future_direciton_prompt(idea_chains,trend,topic,entities) -> str:
179
+ entities = f"""
180
+ Here are the entities you need to know: {entities}
181
+ """ if use_entities else ""
182
+ prompt = f"""
183
+ You are a scientific research expert tasked with proposing future research directions based on the literature we have reviewed.
184
+
185
+ {entities}
186
+
187
+ The topic you are studying is: {topic}
188
+
189
+ The literature you have studied is as follows:
190
+ {idea_chains}
191
+
192
+ The following section delineates the progressive relationships among the previously summarized research papers:
193
+ <the begin of previous trend>{trend}</the end of previous trend>
194
+
195
+ Based on previous research, analyze how human experts think and transition from previous methods to subsequent approaches. Focus on their reasoning logic and the sources of their thought processes. Learn to emulate their reasoning patterns to further develop and guide your own research direction in a natural and coherent manner.
196
+
197
+ Additionally, you are encouraged to adopt the following three modes of thinking:
198
+ 1. Reflection: Reflect on scenarios where a specific method encounters significant challenges. Consider potential solutions that could effectively address these issues, make the solutions sounds reasonable, novel and amazing.
199
+ 2. Analogy: Identify a specific problem you are currently facing and research existing solutions that have successfully tackled similar challenges. Explore these solutions and adapt key principles and strategies to your situation. Think creatively about how tools and approaches from other domains can be reimagined to devise a novel strategy for your issue. Encourage you to actively explore methods in other fields to solve your current problems.
200
+ 3. Deep Dive: Some methods may present specific approaches to addressing a particular problem. Consider whether there are aspects that could be modified to enhance their rationale and effectiveness.
201
+
202
+ Note:Each article's limitations are specific to that particular piece and should not be applied to others. Carefully consider the task at hand and analyze the potential issues you might encounter if you proceed with your original approach, reflecting on the challenges previously faced. Then, think critically about how to address these issues effectively.
203
+
204
+ You are encouraged to apply human reasoning strategies to identify future research directions based on prior studies. Aim for in-depth analysis rather than mere integration of existing ideas. Please avoid introducing unfamiliar information, ensuring that the trends you present are both authentic and reasonable. Before proposing any trends, take a moment to reflect on the principles underlying the methods you're employing and assess their relevance to your research area.
205
+
206
+ The future research direction should be related to the topic: {topic}.
207
+ Please output strictly in the following format:
208
+ <human>{{The human reasoning way you analyzed based on the previous research}}</human>
209
+ <future>{{the future research direction}}</future>
210
+ """
211
+ return prompt
212
+
213
+
214
+ def get_deep_generate_idea_prompt(idea_chains,trend,topic,entities,future = None,bad_case = []) -> str:
215
+ bad_case_content = ""
216
+ if len(bad_case) > 0:
217
+ bad_case_content = "The following are examples of ideas you have proposed in the past that are similar to real papers. Please avoid this situation as much as possible. You can continue to make in-depth innovations, but avoid plagiarism:\n"
218
+ for i,(paper,summary) in enumerate(bad_case):
219
+ bad_case_content += f"<example>{i}. Your orig idea:{summary} \n Similar paper Title: {paper.title}\n Abstract: {paper.abstract}</example>\n"
220
+
221
+ trend = f"""
222
+ The following section delineates the progressive relationships among the previously summarized research papers:
223
+ <the begin of previous trend>{trend}</the end of previous trend>
224
+ """ if trend else ""
225
+
226
+ future = f"""
227
+ The following section outlines the potential future research directions based on the literature you have studied:
228
+ <the begin of future>{future}</the end of future>
229
+ """ if future else ""
230
+
231
+
232
+ entities = f"""
233
+ Here are the entities you need to know: {entities}
234
+ """ if use_entities else ""
235
+ prompt = f"""
236
+ You are a scientific expert tasked with formulating a novel and innovative research idea based on your comprehensive literature review. Your objective is to propose a feasible approach that could significantly advance the field.
237
+
238
+ {bad_case_content}
239
+
240
+ {entities}
241
+
242
+ The topic you are studying is: {topic}
243
+
244
+ The literature you have studied is as follows:
245
+ {idea_chains}
246
+
247
+ Task: Based on the current literature, propose a research idea that incorporates the following components:
248
+
249
+ Your idea is composed of the following components:
250
+ Motivation:
251
+ 1. Provide a background for your idea, summarizing relevant past work.
252
+ 2. Identify shortcomings in previous research and highlight the specific problems that remain unsolved and that you aim to address.
253
+
254
+ Novelty:
255
+ 1. Distinguish your proposed method from existing methods (preferably by naming specific approaches).
256
+ 2. Detail the improvements your method brings compared to previous work.
257
+ 3. Clearly outline at least three contributions your idea offers to the field, including the problems it resolves and the benefits it delivers.
258
+
259
+ Method:
260
+ 1. Present a detailed description of your idea, focusing on the core method, the specific problem it solves, and enhancements over earlier research (citing relevant literature with titles).
261
+ 2. Explain the step-by-step methodology, including the functions of each module and the rationale for why this approach effectively addresses previous challenges.
262
+
263
+ Please adhere to the following guidelines:
264
+ 1. Your research idea should be innovative, feasible, and contribute meaningfully to the field. Please carefully examine the idea you have proposed, avoid settling on the first thought that comes to mind, and try to differ from previous methods as much as possible.
265
+ 2. Ensure your proposal is solid, clearly defined, and practical to implement. Logic should underpin your reasoning.
266
+ 3. Write in clear, concise language aimed at an audience with limited background knowledge in the subject. Avoid complex technical jargon, but when professional terms are necessary, provide thorough explanations.
267
+ 4. Refrain from introducing concepts from uncertain fields to prevent proposing ideas that may be incorrect or impractical.
268
+ 5. When referencing other research, please include the titles of the cited papers.
269
+ 6. Please avoid introducing unfamiliar information, ensuring that the trends you present are both authentic and reasonable. Before proposing any trends, take a moment to reflect on the principles underlying the methods you're employing and assess their relevance to your research area.
270
+ 7. Each article's limitations are specific to that particular piece and should not be applied to others. Carefully consider the task at hand and analyze the potential issues you might encounter if you proceed with your original approach, reflecting on the challenges previously faced. Then, think critically about how to address these issues effectively.
271
+
272
+ {trend}
273
+
274
+ {future}
275
+
276
+ Please output strictly in the following format:
277
+ <motivation>{{the motivation of your idea}}</motivation>
278
+ <novelty> {{the novelty of your idea}} </novelty>
279
+ <method> {{the method of your idea}} </method>
280
+ """
281
+ return prompt
282
+
283
+
284
+ def get_deep_final_idea_prompt(idea_chains,trend,idea,topic):
285
+ idea = f"""
286
+ Here is your thinking steps:
287
+ {idea}
288
+ """ if idea else ""
289
+ if idea and trend:
290
+ trend = f"""The relationship between each paper are as follows: {trend}"""
291
+ elif trend:
292
+ trend = f"""
293
+ The following section outlines the progressive relationships between the previously summarized research papers:
294
+ <the begin of summarize>{trend}</the end of summarize>
295
+ """
296
+ else:
297
+ trend = ""
298
+
299
+ prompt = f"""
300
+ You are a scientific expert with the primary objective of proposing a research idea based on the literature you have studied. Your goal is to propose a novel, feasible, and innovative research idea that can advance the field.
301
+
302
+ The topic you are studying is: {topic}
303
+
304
+ Here are the literature you have studied:
305
+ {idea_chains}
306
+
307
+ Task: Based on the current literature, propose your final research idea.
308
+
309
+ Please adhere to the following guidelines:
310
+ 1. Your research idea should be innovative, feasible, and contribute meaningfully to the field. Please carefully examine the idea you have proposed, avoid settling on the first thought that comes to mind, and try to differ from previous methods as much as possible.
311
+ 2. Ensure your proposal is solid, clearly defined, and practical to implement. Logic should underpin your reasoning.
312
+ 3. Write in clear, concise language aimed at an audience with limited background knowledge in the subject. Avoid complex technical jargon, but when professional terms are necessary, provide thorough explanations.
313
+ 4. Refrain from introducing concepts from uncertain fields to prevent proposing ideas that may be incorrect or impractical.
314
+ When referencing other research, please include the titles of the cited papers.
315
+
316
+ {trend}
317
+
318
+ {idea}
319
+
320
+ The final idea should contain the title and clearly explain the origins, motivation, and challenges of your idea, detailing how you overcame these hurdles.
321
+ Please output strictly in the following format:
322
+ <final_idea> {{the final idea}} </final_idea>
323
+ """
324
+ return prompt
325
+
326
+
327
+ def get_deep_check_idea_novel_prompt(idea,papers):
328
+ papers_content = ""
329
+ for i,paper in enumerate(papers):
330
+ papers_content += f"Paper {i}: Title:{paper.title}\n Abstract:{paper.abstract}\n"
331
+ prompt = f"""
332
+ You are a scientific research expert tasked with evaluating the similarity between a specified idea and existing research. Your objective is to determine if the target idea closely resembles any findings in the provided papers.
333
+
334
+ The target idea you need to check is as follows:
335
+ {idea}
336
+
337
+ The relevant papers you need to refer to are as follows:
338
+ {papers_content}
339
+
340
+ Here are your guidelines:
341
+ 1. Comparison Process: Begin by thoroughly comparing each paper's ideas with the target idea. Consider the methodologies, conclusions, and underlying concepts in each paper in your analysis.
342
+ 2. Similarity Assessment: If the target idea shares fundamental similarities with any existing research to the extent that they can be considered identical, classify this as plagiarism.
343
+ 3. Output: Your output should provide a clear thought process, the similarity assessment, a summary of the target idea, and the ID of the most relevant similar paper.
344
+
345
+ Please output strictly in the following format:
346
+ <think>{{your thinking steps}}</think>
347
+ <similar>{{0/1}}</similar>
348
+ <summary>{{the summary of the target idea}}</summary>
349
+ <similar_paper_id>{{the id of the similar paper}}</similar_paper_id>
350
+
351
+ For example:
352
+ <think> Here are my thinking steps: ... </think>
353
+ <similar>0</similar>
354
+ <summary> It proposes ... </summary>
355
+ <similar_paper_id>0</similar_paper_id>
356
+ """
357
+ return prompt
358
+
359
+
360
+
361
+ def get_deep_generate_experiment_prompt(idea,experiments,entities) -> str:
362
+ prompt = f"""
363
+ You are a scientific expert tasked with designing rigorous, feasible, and impactful experiments based on specified scientific questions and the methodologies derived from the idea I provide, along with relevant past research. Your goal is to assist researchers in systematically testing hypotheses and validating innovative discoveries that could significantly advance their fields.
364
+
365
+ Past Related Research Experiments: {experiments}
366
+
367
+ Here are the entities you need to know: {entities}.
368
+
369
+ Here is the idea you need to design an experiment for: {idea}.
370
+
371
+ Please propose a detailed experimental plan addressing the following points:
372
+ 1. Experimental Design: Develop rigorous experiments to ensure the reliability and validity of your results. Provide a comprehensive explanation of the baseline used, comparative methods, ablation study design, and criteria for data analysis and result evaluation. Clarify how these components collectively reinforce and validate the conclusions of your research. Structure your experimental design in a clear, logical, and step-by-step manner, ensuring each step is well-defined and easy to understand.
373
+ 2. Implementation of Technologies/Methods: If your experimental design involves specific technologies or methodologies, describe the implementation process in detail, including key technical aspects. For any critical concepts utilized, provide thorough explanations. For instance, if you propose a modular approach, detail its construction, components, and functionality.
374
+ 3. Feasibility Assessment: Ensure your experimental plan is realistic, considering technological availability, timelines, resources, and personnel. Identify potential challenges and propose strategies for addressing them.
375
+ 4. References to Previous Studies: When citing related literature, include titles and pertinent details of the original papers. Strive to use as many references as necessary to support your experimental design.
376
+ 5. Visual Aids: If useful, provide pseudocode or a flowchart to illustrate the implementation process. For example, you can use pseudocode to detail the core algorithm or the model architecture, or employ a flowchart to map out the experimental procedure and data flow.
377
+ 6. Clarity of Language: Use straightforward language to describe your methods, assuming the reader may have limited knowledge of the subject matter. Avoid complex jargon and utilize accessible terminology. If professional terms are necessary, please provide clear and detailed explanations.
378
+
379
+
380
+ Please output strictly in the following format:
381
+ <experiment>{{your experimental plan}}</experiment>
382
+
383
+ For example:
384
+ <experiment> Step1: ... \n Step2: ..., ..., ... </experiment>
385
+ """
386
+ return prompt
387
+
388
+
389
+ def get_deep_refine_experiment_prompt(experiment,suggestions,paper_infos=None,entities = None) -> str:
390
+ infos = f"""
391
+ The literature information you may need to refer to is as follows: {paper_infos}
392
+ """ if paper_infos else ""
393
+
394
+ prompt = f"""
395
+ You are a research expert tasked with refining and improving an experimental plan based on the feedback received.
396
+
397
+ {infos}
398
+
399
+ The experimental plan you proposed is as follows:
400
+ {experiment}
401
+
402
+ Please propose a detailed experimental plan addressing the following points:
403
+ 1. Experimental Design: Develop rigorous experiments to ensure the reliability and validity of your results. Provide a comprehensive explanation of the baseline used, comparative methods, ablation study design, and criteria for data analysis and result evaluation. Clarify how these components collectively reinforce and validate the conclusions of your research. Structure your experimental design in a clear, logical, and step-by-step manner, ensuring each step is well-defined and easy to understand.
404
+ 2. Implementation of Technologies/Methods: If your experimental design involves specific technologies or methodologies, describe the implementation process in detail, including key technical aspects. For any critical concepts utilized, provide thorough explanations. For instance, if you propose a modular approach, detail its construction, components, and functionality.
405
+ 3. Feasibility Assessment: Ensure your experimental plan is realistic, considering technological availability, timelines, resources, and personnel. Identify potential challenges and propose strategies for addressing them.
406
+ 4. References to Previous Studies: When citing related literature, include titles and pertinent details of the original papers. Strive to use as many references as necessary to support your experimental design.
407
+ 5. Visual Aids: If useful, provide pseudocode or a flowchart to illustrate the implementation process. For example, you can use pseudocode to detail the core algorithm or the model architecture, or employ a flowchart to map out the experimental procedure and data flow.
408
+ 6. Clarity of Language: Use straightforward language to describe your methods, assuming the reader may have limited knowledge of the subject matter. Avoid complex jargon and utilize accessible terminology. If professional terms are necessary, please provide clear and detailed explanations.
409
+
410
+ You have received the following suggestions for improvement:
411
+ {suggestions}
412
+
413
+ Please refine your experimental plan based on the feedback provided. Ensure your refined plan is feasible, clearly defined, and addresses the feedback you received.
414
+
415
+ Please output strictly in the following format:
416
+ <experiment>{{your refined experimental plan}}</experiment>
417
+ """
418
+ return prompt
419
+
420
+
421
+
422
+ def get_deep_refine_experiment_search_query_prompt(experiment,suggestions):
423
+ prompt = f"""
424
+ You are a research expert tasked with refining and improving an experimental plan based on the feedback received.
425
+
426
+ The experimental plan you proposed is as follows:
427
+ {experiment}
428
+
429
+ You have received the following suggestions for improvement:
430
+ {suggestions}
431
+
432
+ Please decide whether you need to search for relevant papers to obtain relevant knowledge to improve your experiment.
433
+
434
+ If you need to search for relevant papers, please provide a search query (only a concise phrase) for the literature search; otherwise provide "".
435
+ For example: if the suggestions say that the dynamic querying of additional information and the knowledge-graph update described in the experiment are not clearly explained, you should output <query>dynamic knowledge graph update</query> (only a concise phrase).
436
+
437
+ Please output strictly in the following format:
438
+ <query>{{the search query}}</query>, or <query></query> if no search is needed.
439
+
440
+ For example:
441
+ <query>Reducing reliance on large-scale annotated data and closed-source models for planning in QA tasks</query>
442
+ """
443
+ return prompt
444
+
445
+ def get_deep_paper_info_prompt_for_refine_experiment(paper,experiment,suggestions) -> str:
446
+ prompt = f"""
447
+ You are a scientific research expert.
448
+ Your task is to research the relevant literature to refine your experiment.
449
+
450
+ The literature you need to study is:
451
+ {paper}
452
+
453
+ The experiment designed for the idea is:
454
+ {experiment}
455
+
456
+ You have received the following suggestions for improvement:
457
+ {suggestions}
458
+
459
+ Please extract useful information from the paper that can help you improve your experiment. For example, if the paper describes a method, dataset, or metric that can be used in your experiment, you should extract it.
460
+
461
+ Please output strictly in the following format:
462
+ <info>{{The information you extracted from the paper}}</info>
463
+ """
464
+ return prompt
465
+
prompts/juder_prompts.py ADDED
@@ -0,0 +1,69 @@
1
+
2
+ def get_judge_idea_all_prompt(idea0,idea1,topic):
3
+ prompt = f"""
4
+ You are a judge in a competition. You have to decide which idea is better.
5
+
6
+ The idea0 is: {idea0}
7
+
8
+ The idea1 is: {idea1}
9
+
10
+ The topic is: {topic}
11
+
12
+ Which idea do you think is better? Please write a short paragraph to explain your choice.
13
+
14
+ Here are your evaluation criteria:
15
+ 1. Novelty: Are the problems or approaches new? Is this a novel combination of familiar techniques? Is it clear how this work differs from previous contributions? Is related work adequately referenced?
16
+ 2. Significance: Is the idea important? Are other people (practitioners or researchers) likely to use these ideas or build on them? Does the idea address a difficult problem in a better way than previous research? Does it provide a unique theoretical or pragmatic approach?
17
+ 3. Feasibility: Can the idea be realized with existing technology or methods? Are there any technical difficulties or bottlenecks? Is the idea clear and logical? Is there any obvious error or unreasonable part in the idea, and can the experiment be designed properly according to this idea?
18
+ 4. Clarity: Is the paper clearly written? Is it well-organized? Does it adequately inform the reader?
19
+ 5. Effectiveness: How likely is the proposed idea to work well (e.g., better than existing baselines)?
20
+
21
+ Note:
22
+ Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. DO NOT allow the LENGTH of the responses to influence your evaluation, choose the one that is straight-to-the-point instead of unnecessarily verbose. Be as objective as possible. (very important!!!)
23
+
24
+ If you think idea0 is better than idea1, you should output 0. If you think idea1 is better than idea0, you should output 1. If you think idea0 and idea1 are equally good, you should output 2.
25
+
26
+ Your output should be strictly in the following format:
27
+ Your thinking process:
28
+ ...
29
+
30
+ Your choice:
31
+ <novelty>{{ Your choice for novelty }}</novelty>
32
+ <significance>{{ Your choice for significance }}</significance>
33
+ <feasibility>{{ Your choice for feasibility }}</feasibility>
34
+ <clarity>{{ Your choice for clarity }}</clarity>
35
+ <effectiveness>{{ Your choice for effectiveness }}</effectiveness>
36
+ """
37
+ return prompt
38
+
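For context, a hedged sketch of how the per-criterion judge output above might be collected; the judge_output string is a hypothetical model response, and the collection logic is illustrative rather than the repository's own code:

from utils import extract

judge_output = (
    "Your thinking process:\n...\n\nYour choice:\n"
    "<novelty>0</novelty><significance>1</significance><feasibility>2</feasibility>"
    "<clarity>0</clarity><effectiveness>1</effectiveness>"
)
criteria = ["novelty", "significance", "feasibility", "clarity", "effectiveness"]
# 0 -> idea0 preferred, 1 -> idea1 preferred, 2 -> tie
choices = {c: extract(judge_output, c) for c in criteria}
print(choices)  # e.g. {'novelty': '0', 'significance': '1', ...}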
39
+
40
+ def get_judge_experiment_all_prompt(idea0,experiment0,idea1,experiment1):
41
+ prompt = f"""
42
+ You are a judge in a competition. You have to decide which experiment is better.
43
+ The idea of experiment0 is: {idea0}
44
+ The experiment0 is: {experiment0}
45
+
46
+ The idea of experiment1 is: {idea1}
47
+ The experiment1 is: {experiment1}
48
+
49
+ Which experiment do you think is better? Please write a short paragraph to explain your choice.
50
+
51
+ Here are your evaluation criteria:
52
+ 1. Feasibility: Can the experiment be realized with existing technology or methods? Are there any technical difficulties or bottlenecks? Is the experimental plan detailed and feasible? Are the experimental steps clear and logical? Is there any obvious error or unreasonable part in the experiment? Consider the rationality of its steps and the likelihood that the idea can be successfully implemented.
53
+ 2. Quality: Is there a clear rationale for each step of the experimental design? Are the baseline and evaluation metrics chosen appropriately? Has the design taken into account the potential advantages and limitations of the methods used? Can this experimental design effectively support the claims made in the idea?
54
+ 3. Clarity: Is the experimental plan clearly written? Does it provide enough information for the expert reader to understand the experiment? Is it well organized? Does it adequately inform the reader?
55
+
56
+ Note: Avoid any position biases and ensure that the order in which the responses were presented does not influence your decision. DO NOT allow the LENGTH of the responses to influence your evaluation, choose the one that is straight-to-the-point instead of unnecessarily verbose. Be as objective as possible. (very important!!!)
57
+
58
+ If you think experiment0 is better than experiment1, you should output 0. If you think experiment1 is better than experiment0, you should output 1. If you think experiment0 and experiment1 are equally good, you should output 2.
59
+
60
+ Your output should be strictly in the following format:
61
+ Your thinking process:
62
+ ...
63
+
64
+ Your choice:
65
+ <feasibility>{{Your choice for feasibility}}</feasibility>
66
+ <quality>{{Your choice for quality}}</quality>
67
+ <clarity>{{Your choice for clarity}}</clarity>
68
+ """
69
+ return prompt
prompts/review_agent_prompts.py ADDED
@@ -0,0 +1,66 @@
1
+ def get_review_search_related_paper_prompt(idea,topic):
2
+ prompt = f"""
3
+ You are a paper reviewer with expertise in the field.
4
+
5
+ The paper presents the idea: {idea}. Your task is to conduct a thorough literature review in the relevant field to assess the feasibility and originality of this idea, and to determine whether it has already been explored by others.
6
+
7
+ Please provide the literature search queries (no more than 3) you would use to search for papers related to the paper's idea.
8
+ Each query should be a string and should be enclosed in double quotes.
9
+
10
+ Your output should be strictly in the following format:
11
+ <queries> ["query1", "query2", ...] </queries>
12
+
13
+ For example:
14
+ <queries>["Reducing reliance on large-scale annotated data and closed-source models for planning in QA tasks","Automatic agent learning for QA","QA task planning with minimal human intervention"]</queries>
15
+ """
16
+ return prompt
17
+
18
+ def get_review_suggestions_from_papers_prompt(idea,topic,paper):
19
+ prompt = f"""
20
+ You are a manuscript review expert.
21
+ Here is some relevant literature knowledge you have: {paper}.
22
+
23
+ Currently you are assessing a paper on the topic: {topic}.
24
+ The idea presented in the paper is: {idea}.
25
+
26
+ Please analyze the feasibility and novelty of the paper's idea and provide suggestions for improvement, if any. (If there are no suggestions, please do not include any output.) (Please include the title of the paper you are referencing in the suggestions section.)
27
+ You should also pay attention to whether the idea is related to the topic we are studying ({topic}), and analyze whether it can help us solve topic-related problems.
28
+
29
+ There are some suggestions for you to consider:
30
+ 1. Point out any confusion you had while reading the idea and suggest changes.
31
+ 2. Based on relevant knowledge, think about the feasibility of the idea, whether the design of each step is reasonable, whether the statement is clear, and put forward your relevant suggestions.
32
+ 3. Think about how the method can be improved to increase its novelty and feasibility, while trying not to increase the complexity of the method.
33
+
34
+ Your output should be strictly in the following format:
35
+ <suggestions> {{your suggestions to modify the idea}} </suggestions>
36
+
37
+ if you have no suggestions, please provide:
38
+ <suggestions></suggestions>
39
+ """
40
+
41
+ return prompt
42
+
43
+
44
+
45
+ def get_review_experiment_design_suggestions_prompt(idea, experiment,entities):
46
+ prompt = f"""
47
+ You are an expert in paper review. Your task is to analyze whether a given experiment can effectively verify a specific idea, as well as assess the detail and feasibility of the experiment.
48
+
49
+ Here are the relevant entities to consider: {entities}.
50
+
51
+ The idea presented is: {idea}.
52
+
53
+ The corresponding experiment designed for this idea is: {experiment}.
54
+
55
+ Please conduct your analysis based on the following criteria:
56
+ 1. Can the experiment validate the idea? If not, identify the issues and suggest improvements to enhance its verification capability and feasibility.
57
+ 2. Are there specific experimental procedures that are confusing or poorly designed? Discuss any methods that may not be feasible, uncertainties in constructing the dataset, or a lack of explanation regarding the implementation of certain methods.
58
+ 3. Evaluate the clarity, detail, reasonableness, and feasibility of the experimental design.
59
+ 4. Provide suggestions for improving the experiment based on the shortcomings identified in your analysis.
60
+ 5. Focus solely on the experiment design; please refrain from altering the original idea.
61
+ 6. Ensure that your suggestions are constructive, concise, and specific.
62
+
63
+ Please strictly follow the following format for output:
64
+ <suggestion>{{Suggestions for improving the experiment}}</suggestion>
65
+ """
66
+ return prompt
requirements.txt ADDED
@@ -0,0 +1,23 @@
1
+ aiohttp
2
+ beautifulsoup4
3
+ delft
4
+ httpx
5
+ lmdb
6
+ lxml
7
+ numpy
8
+ openai
9
+ pandas
10
+ protobuf
11
+ PyYAML
12
+ Requests
13
+ setuptools
14
+ spacy
15
+ tenacity
16
+ textstat
17
+ tqdm
18
+ fastapi
19
+ uvicorn[standard]
20
+ jinja2
21
+ markdown
22
+ apscheduler
23
+ Pillow
searcher/.DS_Store ADDED
Binary file (6.15 kB).
 
searcher/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ # from .arxiv_reader import Arxiv_Reader
2
+ # from .google_crawl import GoogleCrawler
3
+ from .sementic_search import SementicSearcher,Result
4
+ # from .ResearchAgentSearch import ResearchSearcher
searcher/sementic_search.py ADDED
@@ -0,0 +1,397 @@
1
+ import requests
2
+ import json
3
+ import yaml
4
+ import scipdf
5
+ import os
6
+ import time
7
+ import aiohttp
8
+ import asyncio
9
+ import numpy as np
10
+
11
+
12
+ def get_content_between_a_b(start_tag, end_tag, text):
13
+ extracted_text = ""
14
+ start_index = text.find(start_tag)
15
+ while start_index != -1:
16
+ end_index = text.find(end_tag, start_index + len(start_tag))
17
+ if end_index != -1:
18
+ extracted_text += text[start_index + len(start_tag) : end_index] + " "
19
+ start_index = text.find(start_tag, end_index + len(end_tag))
20
+ else:
21
+ break
22
+ return extracted_text.strip()
23
+
24
+
25
+ def extract(text, type):
26
+ if text:
27
+ target_str = get_content_between_a_b(f"<{type}>", f"</{type}>", text)
28
+ if target_str:
29
+ return target_str
30
+ else:
31
+ return text
32
+ else:
33
+ return ""
34
+
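A quick, hedged illustration of how the two tag-parsing helpers above behave, assuming the functions defined above are in scope; the input strings are made-up examples:

print(get_content_between_a_b("<q>", "</q>", "a <q>one</q> b <q>two</q>"))  # "one two" (occurrences joined by spaces)
print(extract("<info>useful detail</info>", "info"))                        # "useful detail"
print(extract("no tags at all", "info"))                                    # falls back to the full input text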
35
+
36
+ async def fetch(url):
37
+ await asyncio.sleep(1)
38
+ try:
39
+ timeout = aiohttp.ClientTimeout(total=120)
40
+ async with aiohttp.ClientSession(timeout=timeout) as session:
41
+ async with session.get(url) as response:
42
+ if response.status == 200:
43
+ content = await response.read() # Read the response content as bytes
44
+ return content
45
+ else:
46
+ await asyncio.sleep(0.01)
47
+ print(f"Failed to fetch the URL: {url} with status code: {response.status}")
48
+ return None
49
+ except aiohttp.ClientError as e: # catch the more specific client errors first
50
+ await asyncio.sleep(0.01)
51
+ print(f"An error occurred while fetching the URL: {url}")
52
+ print(e)
53
+ return None
54
+ except Exception as e:
55
+ await asyncio.sleep(0.01)
56
+ print(f"An unexpected error occurred while fetching the URL: {url}")
57
+ print(e)
58
+ return None
59
+
60
+ class Result:
61
+ def __init__(self,title="",abstract="",article = "",citations_conut = 0,year = None) -> None:
62
+ self.title = title
63
+ self.abstract = abstract
64
+ self.article = article
65
+ self.citations_conut = citations_conut
66
+ self.year = year
67
+
68
+ # Define the API endpoint URL
69
+
70
+ semantic_fields = ["title", "abstract", "year", "authors.name", "authors.paperCount", "authors.citationCount","authors.hIndex","url","referenceCount","citationCount","influentialCitationCount","isOpenAccess","openAccessPdf","fieldsOfStudy","s2FieldsOfStudy","embedding.specter_v1","embedding.specter_v2","publicationDate","citations"]
71
+
72
+
73
+ fieldsOfStudy = ["Computer Science","Medicine","Chemistry","Biology","Materials Science","Physics","Geology","Art","History","Geography","Sociology","Business","Political Science","Philosophy","Literature","Music","Economics","Mathematics","Engineering","Environmental Science","Agricultural and Food Sciences","Education","Law","Linguistics"]
74
+
75
+ # citations.paperId, citations.title, citations.year, citations.authors.name, citations.authors.paperCount, citations.authors.citationCount, citations.authors.hIndex, citations.url, citations.referenceCount, citations.citationCount, citations.influentialCitationCount, citations.isOpenAccess, citations.openAccessPdf, citations.fieldsOfStudy, citations.s2FieldsOfStudy, citations.publicationDate
76
+
77
+ # publicationDateOrYear: 2019-03-05 ; 2019-03 ; 2019 ; 2016-03-05:2020-06-06 ; 1981-08-25: ; :2020-06-06 ; 1981:2020
78
+
79
+ # publicationTypes: Review ; JournalArticle CaseReport ; ClinicalTrial ; Dataset ; Editorial ; LettersAndComments ; MetaAnalysis ; News ; Study ; Book ; BookSection
80
+
81
+
82
+
83
+ def process_fields(fields):
84
+ return ",".join(fields)
85
+
86
+
87
+ class SementicSearcher:
88
+ def __init__(self, ban_paper = []) -> None:
89
+ self.ban_paper = ban_paper
90
+
91
+ async def search_papers_async(self, query, limit=5, offset=0, fields=["title", "paperId", "abstract", "isOpenAccess", 'openAccessPdf', "year","publicationDate","citations.title","citations.abstract","citations.isOpenAccess","citations.openAccessPdf","citations.citationCount","citationCount","citations.year"],
92
+ publicationDate=None, minCitationCount=0, year=None,
93
+ publicationTypes=None, fieldsOfStudy=None):
94
+ url = 'https://api.semanticscholar.org/graph/v1/paper/search'
95
+ fields = process_fields(fields) if isinstance(fields, list) else fields
96
+
97
+ # More specific query parameter
98
+ query_params = {
99
+ 'query': query,
100
+ "limit": limit,
101
+ "offset": offset,
102
+ 'fields': fields,
103
+ 'publicationDateOrYear': publicationDate,
104
+ 'minCitationCount': minCitationCount,
105
+ 'year': year,
106
+ 'publicationTypes': publicationTypes,
107
+ 'fieldsOfStudy': fieldsOfStudy
108
+ }
109
+ # Load the API key from the configuration file
110
+ api_key = os.environ.get('SEMENTIC_SEARCH_API_KEY',None)
111
+ headers = {'x-api-key': api_key} if api_key else None
112
+ await asyncio.sleep(0.5)
113
+ try:
114
+ filtered_query_params = {key: value for key, value in query_params.items() if value is not None}
115
+ response = requests.get(url, params=filtered_query_params, headers=headers)  # note: a blocking (synchronous) request inside an async method
116
+
117
+ if response.status_code == 200:
118
+ response_data = response.json()
119
+ return response_data
120
+ elif response.status_code == 429:
121
+ time.sleep(1)
122
+ print(f"Request failed with status code {response.status_code}: begin to retry")
123
+ return await self.search_papers_async(query, limit, offset, fields, publicationDate, minCitationCount, year, publicationTypes, fieldsOfStudy)
124
+ else:
125
+ print(f"Request failed with status code {response.status_code}: {response.text}")
126
+ return None
127
+ except requests.RequestException as e:
128
+ print(f"An error occurred: {e}")
129
+ return None
130
+
131
+ def cal_cosine_similarity(self, vec1, vec2):
132
+ return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
133
+
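As a sanity check, a tiny worked example of the cosine-similarity formula used above (standalone NumPy, independent of the class):

import numpy as np

v1 = np.array([1.0, 0.0])
v2 = np.array([1.0, 1.0])
# dot = 1, ||v1|| = 1, ||v2|| = sqrt(2)  =>  1 / sqrt(2) ~ 0.7071
print(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))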
134
+ def read_arxiv_from_path(self, pdf_path):
135
+ article_dict = scipdf.parse_pdf_to_dict(pdf_path)
136
+ return article_dict
137
+
138
+ async def get_paper_embbeding_and_score_async(self,query_embedding, paper,llm):
139
+ paper_content = f"""
140
+ Title: {paper['title']}
141
+ Abstract: {paper['abstract']}
142
+ """
143
+ paper_embbeding = await llm.get_embbeding_async(paper_content)
144
+ paper_embbeding = np.array(paper_embbeding)
145
+ score = self.cal_cosine_similarity(query_embedding,paper_embbeding)
146
+ return [paper,score]
147
+
148
+
149
+ async def rerank_papers_async(self, query_embedding, paper_list,llm):
150
+ if len(paper_list) >= 50:
151
+ paper_list = paper_list[:50]
152
+ results = await asyncio.gather(*[self.get_paper_embbeding_and_score_async(query_embedding, paper,llm) for paper in paper_list if paper])
153
+ reranked_papers = sorted(results,key = lambda x: x[1],reverse = True)
154
+ return reranked_papers
155
+
156
+ async def get_embbeding_and_score_async(self,query_embedding, text,llm):
157
+ text_embbeding = await llm.get_embbeding_async(text)
158
+ text_embbeding = np.array(text_embbeding)
159
+ score = self.cal_cosine_similarity(query_embedding,text_embbeding)
160
+ return score
161
+
162
+ async def get_embbeding_and_score_from_texts_async(self,query_embedding, texts,llm):
163
+ results = await asyncio.gather(*[self.get_embbeding_and_score_async(query_embedding, text,llm) for text in texts])
164
+ return results
165
+
166
+ async def get_paper_details_async(self, paper_id, fields = ["title", "abstract", "year","citationCount","isOpenAccess","openAccessPdf"]):
167
+ url = f'https://api.semanticscholar.org/graph/v1/paper/{paper_id}'
168
+ fields = process_fields(fields)
169
+ paper_data_query_params = {'fields': fields}
170
+ try:
171
+ async with aiohttp.ClientSession() as session:
172
+ filtered_query_params = {key: value for key, value in paper_data_query_params.items() if value is not None}
173
+ headers = {'x-api-key': os.environ.get('SEMENTIC_SEARCH_API_KEY',None)}
174
+ async with session.get(url, params=filtered_query_params, headers=headers) as response:
175
+ if response.status == 200:
176
+ response_data = await response.json()
177
+ return response_data
178
+ else:
179
+ await asyncio.sleep(0.01)
180
+ print(f"Request failed with status code {response.status}: {await response.text()}")
181
+ return None
182
+ except Exception as e:
183
+ print(f"Failed to get paper details for paper ID: {paper_id}")
184
+ return None
185
+
186
+ async def batch_retrieve_papers_async(self, paper_ids, fields = semantic_fields):
187
+ url = 'https://api.semanticscholar.org/graph/v1/paper/batch'
188
+ paper_data_query_params = {'fields': process_fields(fields)}
189
+ paper_ids_json = {"ids": paper_ids}
190
+ try:
191
+ async with aiohttp.ClientSession() as session:
192
+ filtered_query_params = {key: value for key, value in paper_data_query_params.items() if value is not None}
193
+ headers = {'x-api-key': os.environ.get('SEMENTIC_SEARCH_API_KEY',None)}
194
+ async with session.post(url, json=paper_ids_json, params=filtered_query_params, headers=headers) as response:
195
+ if response.status == 200:
196
+ response_data = await response.json()
197
+ return response_data
198
+ else:
199
+ await asyncio.sleep(0.01)
200
+ print(f"Request failed with status code {response.status}: {await response.text()}")
201
+ return None
202
+ except Exception as e:
203
+ print(f"Failed to batch retrieve papers for paper IDs: {paper_ids}")
204
+ return None
205
+
206
+ async def search_paper_from_title_async(self, query,fields = ["title","paperId"]):
207
+ url = 'https://api.semanticscholar.org/graph/v1/paper/search/match'
208
+ fields = process_fields(fields)
209
+ query_params = {'query': query, 'fields': fields}
210
+ try:
211
+ async with aiohttp.ClientSession() as session:
212
+ filtered_query_params = {key: value for key, value in query_params.items() if value is not None}
213
+ headers = {'x-api-key': os.environ.get('SEMENTIC_SEARCH_API_KEY',None)}
214
+ async with session.get(url, params=filtered_query_params, headers=headers) as response:
215
+ if response.status == 200:
216
+ response_data = await response.json()
217
+ return response_data
218
+ else:
219
+ await asyncio.sleep(0.01)
220
+ print(f"Request failed with status code {response.status}: {await response.text()}")
221
+ return None
222
+ except Exception as e:
223
+ await asyncio.sleep(0.01)
224
+ print(f"Failed to search paper from title: {query}")
225
+ return None
226
+
227
+
228
+ async def search_async(self,query,max_results = 5 ,paper_list = None ,rerank_query = None,llm = None,year = None,publicationDate = None,need_download = True,fields = ["title", "paperId", "abstract", "isOpenAccess", 'openAccessPdf', "year","publicationDate","citationCount"]):
229
+ if rerank_query:
230
+ rerank_query_embbeding = llm.get_embbeding(rerank_query)
231
+ rerank_query_embbeding = np.array(rerank_query_embbeding)
232
+
233
+ readed_papers = []
234
+ if paper_list:
235
+ if isinstance(paper_list,set):
236
+ paper_list = list(paper_list)
237
+ if len(paper_list) == 0 :
238
+ pass
239
+ elif isinstance(paper_list[0], str):
240
+ readed_papers = paper_list
241
+ elif isinstance(paper_list[0], Result):
242
+ readed_papers = [paper.title for paper in paper_list]
243
+
244
+ print(f"Searching for papers related to the query: <{query}>")
245
+ results = await self.search_papers_async(query,limit = 10 * max_results,year=year,publicationDate = publicationDate,fields = fields)
246
+ if not results or "data" not in results:
247
+ return []
248
+
249
+ new_results = []
250
+ for result in results['data']:
251
+ if result['title'] in self.ban_paper:
252
+ continue
253
+ new_results.append(result)
254
+ results = new_results
255
+
256
+ final_results = []
257
+ if need_download:
258
+ paper_candidates = []
259
+ for result in results:
260
+ if not result['isOpenAccess'] or not result['openAccessPdf'] or result['title'] in readed_papers:
261
+ continue
262
+ else:
263
+ paper_candidates.append(result)
264
+ else:
265
+ paper_candidates = results
266
+
267
+ if llm and rerank_query:
268
+ paper_candidates = await self.rerank_papers_async(rerank_query_embbeding, paper_candidates,llm)
269
+ paper_candidates = [paper[0] for paper in paper_candidates if paper]
270
+
271
+ if need_download:
272
+ for result in paper_candidates:
273
+ pdf_link = result['openAccessPdf']["url"]
274
+ try:
275
+ content = await self.download_pdf_async(pdf_link)
276
+ if not content:
277
+ continue
278
+ except Exception as e:
279
+ continue
280
+ title = result['title']
281
+ abstract = result['abstract']
282
+ citationCount = result['citationCount']
283
+ year = result['year']
284
+ article = scipdf.parse_pdf_to_dict(content)
285
+ if not article:
286
+ continue
287
+ final_results.append(Result(title,abstract,article,citationCount,year))
288
+ if len(final_results) >= max_results:
289
+ break
290
+ else:
291
+ for result in paper_candidates:
292
+ title = result['title']
293
+ abstract = result['abstract']
294
+ citationCount = result['citationCount']
295
+ year = result['year']
296
+ final_results.append(Result(title,abstract,None,citationCount,year))
297
+ if len(final_results) >= max_results:
298
+ break
299
+ return final_results
300
+
301
+ async def search_related_paper_async(self,title,need_citation = True,need_reference = True,rerank_query = None,llm = None,paper_list = []):
302
+ print(f"Searching for the related papers of <{title}>")
303
+ fileds = ["title","abstract","citations.title","citations.abstract","citations.citationCount","references.title","references.abstract","references.citationCount","citations.isOpenAccess","citations.openAccessPdf","references.isOpenAccess","references.openAccessPdf","citations.year","references.year"]
304
+ results = await self.search_papers_async(title,limit = 3,fields=fileds)
305
+ related_papers = []
306
+ related_papers_title = []
307
+ if not results or "data" not in results:
308
+ return None
309
+ for result in results["data"]:
310
+ if not result:
311
+ continue
312
+ if need_citation:
313
+ for citation in result["citations"]:
314
+ if "openAccessPdf" not in citation or not citation["openAccessPdf"]:
315
+ continue
316
+ elif citation["title"] in related_papers_title or citation["title"] in self.ban_paper or citation["title"] in paper_list:
317
+ continue
318
+ elif citation["isOpenAccess"] == False or citation["openAccessPdf"] == None:
319
+ continue
320
+ else:
321
+ related_papers.append(citation)
322
+ related_papers_title.append(citation["title"])
323
+ if need_reference:
324
+ for reference in result["references"]:
325
+ if "openAccessPdf" not in reference or not reference["openAccessPdf"]:
326
+ continue
327
+ elif reference["title"] in related_papers_title or reference["title"] in self.ban_paper or reference["title"] in paper_list:
328
+ continue
329
+ elif reference["isOpenAccess"] == False or reference["openAccessPdf"] == None:
330
+ continue
331
+ else:
332
+ related_papers.append(reference)
333
+ related_papers_title.append(reference["title"])
334
+ if result:
335
+ break
336
+ if len(related_papers) >= 200:
337
+ related_papers = related_papers[:200]
338
+
339
+ if rerank_query and llm:
340
+ rerank_query_embbeding = llm.get_embbeding(rerank_query)
341
+ rerank_query_embbeding = np.array(rerank_query_embbeding)
342
+ related_papers = await self.rerank_papers_async(rerank_query_embbeding, related_papers,llm)
343
+ related_papers = [paper[0] for paper in related_papers]
344
+ related_papers = [[paper["title"],paper["abstract"],paper["openAccessPdf"]["url"],paper["citationCount"],paper['year']] for paper in related_papers]
345
+ else:
346
+ related_papers = [[paper["title"],paper["abstract"],paper["openAccessPdf"]["url"],paper["citationCount"],paper['year']] for paper in related_papers]
347
+ related_papers = sorted(related_papers,key = lambda x: x[3],reverse = True)
348
+ print(f"Found {len(related_papers)} related papers")
349
+ for paper in related_papers:
350
+ url = paper[2]
351
+ content = await self.download_pdf_async(url)
352
+ if content:
353
+ article = scipdf.parse_pdf_to_dict(content)
354
+ if not article:
355
+ continue
356
+ result = Result(paper[0],paper[1],article,paper[3],paper[4])
357
+ return result
358
+ return None
359
+
360
+
361
+ async def download_pdf_async(self, pdf_link):
362
+ content = await fetch(pdf_link)
363
+ if not content:
364
+ return None
365
+ else:
366
+ return content
367
+
368
+ def read_paper_title_abstract(self,article):
369
+ title = article["title"]
370
+ abstract = article["abstract"]
371
+ paper_content = f"""
372
+ Title: {title}
373
+ Abstract: {abstract}
374
+ """
375
+ return paper_content
376
+
377
+ def read_paper_content(self,article):
378
+ paper_content = self.read_paper_title_abstract(article)
379
+ for section in article["sections"]:
380
+ paper_content += f"section: {section['heading']}\n content: {section['text']}\n ref_ids: {section['publication_ref']}\n"
381
+ return paper_content
382
+
383
+ def read_paper_content_with_ref(self,article):
384
+ paper_content = self.read_paper_content(article)
385
+ paper_content += "<References>\n"
386
+ i = 1
387
+ for refer in article["references"]:
388
+ ref_id = refer["ref_id"]
389
+ title = refer["title"]
390
+ year = refer["year"]
391
+ paper_content += f"Ref_id:{ref_id} Title: {title} Year: ({year})\n"
392
+ i += 1
393
+ paper_content += "</References>\n"
394
+ return paper_content
395
+
396
+
397
+
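A minimal, hedged usage sketch for the searcher above, doing a metadata-only search so that no PDF download or LLM reranking is needed; the query string is illustrative, and SEMENTIC_SEARCH_API_KEY may need to be set in the environment:

import asyncio
from searcher.sementic_search import SementicSearcher

async def main():
    searcher = SementicSearcher()
    # need_download=False returns Result objects with title/abstract only (no parsed PDF)
    results = await searcher.search_async("large language model research agents",
                                          max_results=3, need_download=False)
    for r in results:
        print(r.year, r.title)

asyncio.run(main())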
start.sh ADDED
@@ -0,0 +1,9 @@
1
+ #!/bin/sh
2
+
3
+ # Start the gradlew (grobid) service and run it in the background
4
+ cd grobid
5
+ ./gradlew run --console=plain &
6
+
7
+ cd ..
8
+ # Start the uvicorn service
9
+ uvicorn app:app --host 0.0.0.0 --port 7860
style.css ADDED
@@ -0,0 +1,4 @@
1
+ /* styles.css */
2
+ .same-height {
3
+ height: 100%;
4
+ }
supervisord.conf ADDED
@@ -0,0 +1,22 @@
1
+ [supervisord]
2
+ nodaemon=true
3
+ pidfile=/dev/null
4
+ logfile=/dev/null
5
+ logfile_maxbytes=0
6
+ logfile_backups=0
7
+ loglevel=info
8
+
9
+ [program:gradle_service]
10
+ command=/app/grobid/gradlew run
11
+ environment=JAVA_HOME="/opt/jdk-11.0.2",PATH="/opt/jdk-11.0.2/bin:/home/user/.local/bin:/usr/local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin"
12
+ directory=/app
13
+ autostart=true
14
+ autorestart=true
15
+ startsecs=10
16
+ stdout_logfile=/dev/stdout
17
+ stderr_logfile=/dev/stderr
18
+
19
+ [program:uvicorn_service]
20
+ command=uvicorn app:app --host "0.0.0.0" --port 7860
21
+ directory=/app
22
+
utils.py ADDED
@@ -0,0 +1,36 @@
1
+ def get_content_between_a_b(start_tag, end_tag, text):
2
+ extracted_text = ""
3
+ start_index = text.find(start_tag)
4
+ while start_index != -1:
5
+ end_index = text.find(end_tag, start_index + len(start_tag))
6
+ if end_index != -1:
7
+ extracted_text += text[start_index + len(start_tag) : end_index] + " "
8
+ start_index = text.find(start_tag, end_index + len(end_tag))
9
+ else:
10
+ break
11
+
12
+ return extracted_text.strip()
13
+
14
+
15
+ def extract(text, type,hard = True):
16
+ if text:
17
+ target_str = get_content_between_a_b(f"<{type}>", f"</{type}>", text)
18
+ if target_str:
19
+ return target_str
20
+ elif hard:
21
+ return text
22
+ else:
23
+ return ""
24
+ else:
25
+ return ""
26
+
27
+
28
+ def extract_json(text):
29
+ if "```json" in text:
30
+ target_str = get_content_between_a_b("```json", "```", text)
31
+ return target_str
32
+ else:
33
+ return text
34
+
35
+
36
+
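Finally, a small hedged example of extract_json handling a fenced JSON reply; the raw_reply string is hypothetical, and the helpers above are assumed importable from utils:

import json
from utils import extract_json

raw_reply = 'Here are the queries:\n```json\n{"queries": ["query one", "query two"]}\n```'
print(json.loads(extract_json(raw_reply)))  # {'queries': ['query one', 'query two']}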