import json
import os
import numpy as np
import pymupdf
import pymupdf4llm
from pypdf import PdfReader
from strictjson import strict_json
from ai_scientist.llm import (
    extract_json_between_markers,
    get_batch_responses_from_llm,
    get_response_from_llm,
    llm_json_auto_correct,
)
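# This module implements the LLM-based paper review pipeline: reviewer and
# meta-reviewer prompt templates, PDF text extraction, few-shot review
# examples, review ensembling, and a review-driven paper improvement helper.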
# Format the content in JSON
def format_llm_review_json(text):
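    """Coerce free-form LLM review text into the structured review JSON schema."""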
    res = strict_json(
        system_prompt="You are a JSON formatter",
        user_prompt=text,
        return_as_json=True,
        output_format={
            "Summary": "A summary of the paper content and its contributions.",
            "Strengths": "A list of strengths of the paper, type: list",
            "Weaknesses": "A list of weaknesses of the paper, type: list",
            "Originality": "A rating from 1 to 4 (low, medium, high, very high), type: int",
            "Quality": "A rating from 1 to 4 (low, medium, high, very high), type: int",
            "Clarity": "A rating from 1 to 4 (low, medium, high, very high), type: int",
            "Significance": "A rating from 1 to 4 (low, medium, high, very high), type: int",
            "Questions": "A set of clarifying questions to be answered by the paper authors, type: list",
            "Limitations": "A set of limitations and potential negative societal impacts of the work, type: str",
            "Ethical Concerns": "A boolean value indicating whether there are ethical concerns, type: bool",
            "Soundness": "A rating from 1 to 4 (poor, fair, good, excellent), type: int",
            "Presentation": "A rating from 1 to 4 (poor, fair, good, excellent), type: int",
            "Contribution": "A rating from 1 to 4 (poor, fair, good, excellent), type: int",
            "Overall": "A rating from 1 to 10 (very strong reject to award quality), type: int",
            "Confidence": "A rating from 1 to 5 (low, medium, high, very high, absolute), type: int",
            "Decision": "A decision that has to be Accept or Reject, type: str",
        },
        llm=llm_json_auto_correct,
    )
    text = json.loads(res)
    return text
reviewer_system_prompt_base = (
    "You are an AI researcher who is reviewing a paper that was submitted to a prestigious ML venue. "
    "Be critical and cautious in your decision."
)
reviewer_system_prompt_neg = (
    reviewer_system_prompt_base
    + " If a paper is bad or you are unsure, give it bad scores and reject it."
)
reviewer_system_prompt_pos = (
    reviewer_system_prompt_base
    + " If a paper is good or you are unsure, give it good scores and accept it."
)
template_instructions = """
Respond in the following format:
THOUGHT:
<THOUGHT>
REVIEW JSON:
```json
<JSON>
```
In <THOUGHT>, first briefly discuss your intuitions and reasoning for the evaluation.
Detail your high-level arguments, necessary choices and desired outcomes of the review.
Do not make generic comments here, but be specific to your current paper.
Treat this as the note-taking phase of your review.
In <JSON>, provide the review in JSON format with the following fields in the order:
- "Summary": A summary of the paper content and its contributions.
- "Strengths": A list of strengths of the paper.
- "Weaknesses": A list of weaknesses of the paper.
- "Originality": A rating from 1 to 4 (low, medium, high, very high).
- "Quality": A rating from 1 to 4 (low, medium, high, very high).
- "Clarity": A rating from 1 to 4 (low, medium, high, very high).
- "Significance": A rating from 1 to 4 (low, medium, high, very high).
- "Questions": A set of clarifying questions to be answered by the paper authors.
- "Limitations": A set of limitations and potential negative societal impacts of the work.
- "Ethical Concerns": A boolean value indicating whether there are ethical concerns.
- "Soundness": A rating from 1 to 4 (poor, fair, good, excellent).
- "Presentation": A rating from 1 to 4 (poor, fair, good, excellent).
- "Contribution": A rating from 1 to 4 (poor, fair, good, excellent).
- "Overall": A rating from 1 to 10 (very strong reject to award quality).
- "Confidence": A rating from 1 to 5 (low, medium, high, very high, absolute).
- "Decision": A decision that has to be one of the following: Accept, Reject.
For the "Decision" field, don't use Weak Accept, Borderline Accept, Borderline Reject, or Strong Reject. Instead, only use Accept or Reject.
This JSON will be automatically parsed, so ensure the format is precise.
"""
neurips_form = (
"""
## Review Form
Below is a description of the questions you will be asked on the review form for each paper and some guidelines on what to consider when answering these questions.
When writing your review, please keep in mind that after decisions have been made, reviews and meta-reviews of accepted papers and opted-in rejected papers will be made public.
1. Summary: Briefly summarize the paper and its contributions. This is not the place to critique the paper; the authors should generally agree with a well-written summary.
2. Strengths and Weaknesses: Please provide a thorough assessment of the strengths and weaknesses of the paper, touching on each of the following dimensions:
- Originality: Are the tasks or methods new? Is the work a novel combination of well-known techniques? (This can be valuable!) Is it clear how this work differs from previous contributions? Is related work adequately cited?
- Quality: Is the submission technically sound? Are claims well supported (e.g., by theoretical analysis or experimental results)? Are the methods used appropriate? Is this a complete piece of work or work in progress? Are the authors careful and honest about evaluating both the strengths and weaknesses of their work?
- Clarity: Is the submission clearly written? Is it well organized? (If not, please make constructive suggestions for improving its clarity.) Does it adequately inform the reader? (Note that a superbly written paper provides enough information for an expert reader to reproduce its results.)
- Significance: Are the results important? Are others (researchers or practitioners) likely to use the ideas or build on them? Does the submission address a difficult task in a better way than previous work? Does it advance the state of the art in a demonstrable way? Does it provide unique data, unique conclusions about existing data, or a unique theoretical or experimental approach?
3. Questions: Please list up and carefully describe any questions and suggestions for the authors. Think of the things where a response from the author can change your opinion, clarify a confusion or address a limitation. This can be very important for a productive rebuttal and discussion phase with the authors.
4. Limitations: Have the authors adequately addressed the limitations and potential negative societal impact of their work? If not, please include constructive suggestions for improvement.
In general, authors should be rewarded rather than punished for being up front about the limitations of their work and any potential negative societal impact. You are encouraged to think through whether any critical points are missing and provide these as feedback for the authors.
5. Ethical concerns: If there are ethical issues with this paper, please flag the paper for an ethics review. For guidance on when this is appropriate, please review the NeurIPS ethics guidelines.
6. Soundness: Please assign the paper a numerical rating on the following scale to indicate the soundness of the technical claims, experimental and research methodology and on whether the central claims of the paper are adequately supported with evidence.
4: excellent
3: good
2: fair
1: poor
7. Presentation: Please assign the paper a numerical rating on the following scale to indicate the quality of the presentation. This should take into account the writing style and clarity, as well as contextualization relative to prior work.
4: excellent
3: good
2: fair
1: poor
8. Contribution: Please assign the paper a numerical rating on the following scale to indicate the quality of the overall contribution this paper makes to the research area being studied. Are the questions being asked important? Does the paper bring a significant originality of ideas and/or execution? Are the results valuable to share with the broader NeurIPS community?
4: excellent
3: good
2: fair
1: poor
9. Overall: Please provide an "overall score" for this submission. Choices:
10: Award quality: Technically flawless paper with groundbreaking impact on one or more areas of AI, with exceptionally strong evaluation, reproducibility, and resources, and no unaddressed ethical considerations.
9: Very Strong Accept: Technically flawless paper with groundbreaking impact on at least one area of AI and excellent impact on multiple areas of AI, with flawless evaluation, resources, and reproducibility, and no unaddressed ethical considerations.
8: Strong Accept: Technically strong paper, with novel ideas, excellent impact on at least one area of AI or high-to-excellent impact on multiple areas of AI, with excellent evaluation, resources, and reproducibility, and no unaddressed ethical considerations.
7: Accept: Technically solid paper, with high impact on at least one sub-area of AI or moderate-to-high impact on more than one area of AI, with good-to-excellent evaluation, resources, reproducibility, and no unaddressed ethical considerations.
6: Weak Accept: Technically solid, moderate-to-high impact paper, with no major concerns with respect to evaluation, resources, reproducibility, ethical considerations.
5: Borderline accept: Technically solid paper where reasons to accept outweigh reasons to reject, e.g., limited evaluation. Please use sparingly.
4: Borderline reject: Technically solid paper where reasons to reject, e.g., limited evaluation, outweigh reasons to accept, e.g., good evaluation. Please use sparingly.
3: Reject: For instance, a paper with technical flaws, weak evaluation, inadequate reproducibility and incompletely addressed ethical considerations.
2: Strong Reject: For instance, a paper with major technical flaws, and/or poor evaluation, limited impact, poor reproducibility and mostly unaddressed ethical considerations.
1: Very Strong Reject: For instance, a paper with trivial results or unaddressed ethical considerations.
10. Confidence: Please provide a "confidence score" for your assessment of this submission to indicate how confident you are in your evaluation. Choices:
5: You are absolutely certain about your assessment. You are very familiar with the related work and checked the math/other details carefully.
4: You are confident in your assessment, but not absolutely certain. It is unlikely, but not impossible, that you did not understand some parts of the submission or that you are unfamiliar with some pieces of related work.
3: You are fairly confident in your assessment. It is possible that you did not understand some parts of the submission or that you are unfamiliar with some pieces of related work. Math/other details were not carefully checked.
2: You are willing to defend your assessment, but it is quite likely that you did not understand the central parts of the submission or that you are unfamiliar with some pieces of related work. Math/other details were not carefully checked.
1: Your assessment is an educated guess. The submission is not in your area or the submission was difficult to understand. Math/other details were not carefully checked.
"""
+ template_instructions
)
def perform_review(
    text,
    model,
    client,
    num_reflections=1,
    num_fs_examples=1,
    num_reviews_ensemble=1,
    temperature=0.75,
    msg_history=None,
    return_msg_history=False,
    reviewer_system_prompt=reviewer_system_prompt_neg,
    review_instruction_form=neurips_form,
):
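    """Review the paper `text` with an LLM and return the parsed review dict.

    Optionally prepends few-shot example reviews, samples an ensemble of
    reviews that is aggregated by a meta-reviewer, and runs additional
    reflection rounds to refine the result. If `return_msg_history` is True,
    the message history is returned alongside the review.
    """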
    if num_fs_examples > 0:
        fs_prompt = get_review_fewshot_examples(num_fs_examples)
        base_prompt = review_instruction_form + fs_prompt
    else:
        base_prompt = review_instruction_form
    base_prompt += f"""
Here is the paper you are asked to review:
```
{text}
```"""
    if num_reviews_ensemble > 1:
        llm_review, msg_histories = get_batch_responses_from_llm(
            base_prompt,
            model=model,
            client=client,
            system_message=reviewer_system_prompt,
            print_debug=False,
            msg_history=msg_history,
            # Higher temperature to encourage diversity.
            temperature=0.75,
            n_responses=num_reviews_ensemble,
        )
        parsed_reviews = []
        for idx, rev in enumerate(llm_review):
            try:
                parsed_reviews.append(format_llm_review_json(rev))
            except Exception as e:
                print(f"Ensemble review {idx} failed: {e}")
        parsed_reviews = [r for r in parsed_reviews if r is not None]
        review = get_meta_review(model, client, temperature, parsed_reviews)
        ## Format the content in JSON
        review = format_llm_review_json(review)
        # take first valid in case meta-reviewer fails
        if review is None:
            review = parsed_reviews[0]
        # print(parsed_reviews, "\n\n\n", review) # debug
        # Replace numerical scores with the average of the ensemble.
        for score, limits in [
            ("Originality", (1, 4)),
            ("Quality", (1, 4)),
            ("Clarity", (1, 4)),
            ("Significance", (1, 4)),
            ("Soundness", (1, 4)),
            ("Presentation", (1, 4)),
            ("Contribution", (1, 4)),
            ("Overall", (1, 10)),
            ("Confidence", (1, 5)),
        ]:
            scores = []
            for r in parsed_reviews:
                if score in r and limits[1] >= r[score] >= limits[0]:
                    scores.append(r[score])
            # Skip the score if no ensemble member produced a valid value,
            # otherwise np.mean of an empty list yields NaN and int() fails.
            if scores:
                review[score] = int(round(np.mean(scores)))
        # Rewrite the message history with the valid one and new aggregated review.
        msg_history = msg_histories[0][:-1]
        msg_history += [
            {
                "role": "assistant",
                "content": f"""
THOUGHT:
I will start by aggregating the opinions of {num_reviews_ensemble} reviewers that I previously obtained.
REVIEW JSON:
```json
{json.dumps(review)}
```
""",
            }
        ]
    else:
        llm_review, msg_history = get_response_from_llm(
            base_prompt,
            model=model,
            client=client,
            system_message=reviewer_system_prompt,
            print_debug=False,
            msg_history=msg_history,
            temperature=temperature,
        )
        review = format_llm_review_json(llm_review)
    if num_reflections > 1:
        for j in range(num_reflections - 1):
            # print(f"Reflection: {j + 2}/{num_reflections}")
            # Fill in the round counters expected by reviewer_reflection_prompt.
            text, msg_history = get_response_from_llm(
                reviewer_reflection_prompt.format(
                    current_round=j + 2, num_reflections=num_reflections
                ),
                client=client,
                model=model,
                system_message=reviewer_system_prompt,
                msg_history=msg_history,
                temperature=temperature,
            )
            review = format_llm_review_json(text)
            assert review is not None, "Failed to extract JSON from LLM output"
            if "I am done" in text:
                # print(f"Review generation converged after {j + 2} iterations.")
                break
    if return_msg_history:
        return review, msg_history
    else:
        return review
reviewer_reflection_prompt = """Round {current_round}/{num_reflections}.
In your thoughts, first carefully consider the accuracy and soundness of the review you just created.
Include any other factors that you think are important in evaluating the paper.
Ensure the review is clear and concise, and the JSON is in the correct format.
Do not make things overly complicated.
In the next attempt, try and refine and improve your review.
Stick to the spirit of the original review unless there are glaring issues.
Respond in the same format as before:
THOUGHT:
<THOUGHT>
REVIEW JSON:
```json
<JSON>
```
If there is nothing to improve, simply repeat the previous JSON EXACTLY after the thought and include "I am done" at the end of the thoughts but before the JSON.
ONLY INCLUDE "I am done" IF YOU ARE MAKING NO MORE CHANGES."""
def load_paper(pdf_path, num_pages=None, min_size=100):
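    """Extract the paper text from a PDF, trying pymupdf4llm, then pymupdf, then pypdf."""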
    try:
        if num_pages is None:
            text = pymupdf4llm.to_markdown(pdf_path)
        else:
            reader = PdfReader(pdf_path)
            min_pages = min(len(reader.pages), num_pages)
            text = pymupdf4llm.to_markdown(pdf_path, pages=list(range(min_pages)))
        if len(text) < min_size:
            raise Exception("Text too short")
    except Exception as e:
        print(f"Error with pymupdf4llm, falling back to pymupdf: {e}")
        try:
            doc = pymupdf.open(pdf_path)  # open a document
            if num_pages:
                doc = doc[:num_pages]
            text = ""
            for page in doc:  # iterate the document pages
                text = text + page.get_text()  # get plain text encoded as UTF-8
            if len(text) < min_size:
                raise Exception("Text too short")
        except Exception as e:
            print(f"Error with pymupdf, falling back to pypdf: {e}")
            reader = PdfReader(pdf_path)
            if num_pages is None:
                text = "".join(page.extract_text() for page in reader.pages)
            else:
                text = "".join(
                    page.extract_text() for page in reader.pages[:num_pages]
                )
            if len(text) < min_size:
                raise Exception("Text too short")
    return text
def load_review(path):
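    """Return the "review" field of a review stored as JSON at `path`."""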
    with open(path, "r") as json_file:
        loaded = json.load(json_file)
    return loaded["review"]
# get directory of this file
dir_path = os.path.dirname(os.path.realpath(__file__))
fewshot_papers = [
    os.path.join(dir_path, "fewshot_examples/132_automated_relational.pdf"),
    os.path.join(dir_path, "fewshot_examples/attention.pdf"),
    os.path.join(dir_path, "fewshot_examples/2_carpe_diem.pdf"),
]
fewshot_reviews = [
    os.path.join(dir_path, "fewshot_examples/132_automated_relational.json"),
    os.path.join(dir_path, "fewshot_examples/attention.json"),
    os.path.join(dir_path, "fewshot_examples/2_carpe_diem.json"),
]
def get_review_fewshot_examples(num_fs_examples=1):
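    """Build a few-shot prompt from the bundled example papers and their reviews."""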
    fewshot_prompt = """
Below are some sample reviews, copied from previous machine learning conferences.
Note that while each review is formatted differently according to each reviewer's style, the reviews are well-structured and therefore easy to navigate.
"""
    for paper, review in zip(
        fewshot_papers[:num_fs_examples], fewshot_reviews[:num_fs_examples]
    ):
        txt_path = paper.replace(".pdf", ".txt")
        if os.path.exists(txt_path):
            with open(txt_path, "r") as f:
                paper_text = f.read()
        else:
            paper_text = load_paper(paper)
        review_text = load_review(review)
        fewshot_prompt += f"""
Paper:
```
{paper_text}
```
Review:
```
{review_text}
```
"""
    return fewshot_prompt
meta_reviewer_system_prompt = """You are an Area Chair at a machine learning conference.
You are in charge of meta-reviewing a paper that was reviewed by {reviewer_count} reviewers.
Your job is to aggregate the reviews into a single meta-review in the same format.
Be critical and cautious in your decision, find consensus, and respect the opinion of all the reviewers."""
def get_meta_review(model, client, temperature, reviews):
    # Write a meta-review from a set of individual reviews
    review_text = ""
    for i, r in enumerate(reviews):
        review_text += f"""
Review {i + 1}/{len(reviews)}:
```
{json.dumps(r)}
```
"""
    base_prompt = neurips_form + review_text
    llm_review, msg_history = get_response_from_llm(
        base_prompt,
        model=model,
        client=client,
        system_message=meta_reviewer_system_prompt.format(reviewer_count=len(reviews)),
        print_debug=False,
        msg_history=None,
        temperature=temperature,
    )
    meta_review = format_llm_review_json(llm_review)
    return meta_review
def perform_improvement(review, coder):
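    """Ask the `coder` agent to improve the paper based on the given review."""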
    improvement_prompt = '''The following review has been created for your research paper:
"""
{review}
"""
Improve the text using the review.'''.format(
        review=json.dumps(review)
    )
    coder_out = coder.run(improvement_prompt)
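
# Example usage (a minimal sketch, not part of this module): `client` and `model`
# are assumed to be whatever client / model-name pair the helpers in
# ai_scientist.llm expect, constructed elsewhere in the pipeline.
#
#   paper_text = load_paper("path/to/paper.pdf")
#   review = perform_review(
#       paper_text,
#       model=model,
#       client=client,
#       num_reviews_ensemble=3,
#       num_reflections=2,
#   )
#   print(review["Overall"], review["Decision"])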