# This .py file stores constants for the AV-Odyssey Bench leaderboard.

MODEL_INFO = ["Model"]

TASK_INFO = [
    "Avg. All",
    "Avg. Timbre",
    "Avg. Tone",
    "Avg. Melody",
    "Avg. Space",
    "Avg. Time",
    "Avg. Hallucination",
    "Avg. Intricacy",
    "Instrument Recognition",
    "Singer Recognition",
    "Gunshot Recognition",
    "Bird Recognition",
    "Animal Recognition",
    "Transportation Recognition",
    "Material Recognition",
    "Scene Recognition",
    "Hazard Recognition",
    "Action Recognition",
    "Eating Sound Recognition",
    "Speech Sentiment Analysis",
    "Meme Understanding",
    "Music Sentiment Analysis",
    "Music Genre Classification",
    "Dance and Music Matching",
    "Film and Music Matching",
    "Music Score Matching",
    "Audio 3D Angle Estimation",
    "Audio Distance Estimation",
    "Audio Time Estimation",
    "Audio-Visual Synchronization",
    "Action Sequencing",
    "Hallucination Evaluation",
    "Action Prediction",
    "Action Tracing",
]

AVG_INFO = [
    "Avg. All", "Avg. Timbre", "Avg. Tone", "Avg. Melody",
    "Avg. Space", "Avg. Time", "Avg. Hallucination", "Avg. Intricacy",
]

DATA_TITILE_TYPE = ["markdown"] * len(MODEL_INFO) + ["number"] * len(TASK_INFO)

CSV_DIR = "./file/AV-Odyssey_performance.csv"

COLUMN_NAMES = MODEL_INFO + TASK_INFO

# Number of questions per task, aligned with the 26 task entries of TASK_INFO.
DATA_NUM = [
    200, 200, 200, 200, 200, 200, 200, 200, 108, 196, 200, 200, 20,
    97, 200, 200, 200, 200, 20, 20, 200, 200, 200, 200, 199, 195,
]

LEADERBORAD_INTRODUCTION = """# AV-Odyssey Bench Leaderboard

Welcome to the leaderboard of the AV-Odyssey Bench! 🏆

AV-Odyssey Bench is a comprehensive audio-visual benchmark designed to assess whether MLLMs can truly understand audio-visual information. The benchmark encompasses 4,555 carefully crafted problems, each incorporating text, visual, and audio components. To successfully infer answers, models must effectively leverage clues from both visual and audio inputs.

Please refer to the [AV-Odyssey paper](https://arxiv.org/abs/2307.16125) for more details.
"""

SUBMIT_INTRODUCTION = """# Submit on AV-Odyssey Bench

Note: each line of the submitted JSON file is a dict with two keys: question_id and prediction. For example:

```json
{"question_id": "5_0", "prediction": "B"}
{"question_id": "3_0", "prediction": "B"}
```

## Submit Example

If you have any questions, please contact [libohao1998@gmail.com](mailto:libohao1998@gmail.com).
"""

TABLE_INTRODUCTION = """In the table below, we summarize the per-task performance of all models. We use accuracy (%) as the primary evaluation metric for each task.
The "All Average" performance type computes the overall accuracy by dividing the total number of correctly answered QA questions by the total number of QA questions.
If you have any questions, please feel free to contact us.
"""

LEADERBORAD_INFO = """
Recently, multimodal large language models (MLLMs), such as GPT-4o, Gemini 1.5 Pro, and Reka Core, have expanded their capabilities to include vision and audio modalities. While these models demonstrate impressive performance across a wide range of audio-visual applications, our proposed DeafTest reveals that MLLMs often struggle with simple tasks humans find trivial: 1) determining which of two sounds is louder, and 2) determining which of two sounds has a higher pitch. Motivated by these observations, we introduce AV-Odyssey Bench. This benchmark encompasses 26 different tasks and 4,555 carefully crafted problems, each incorporating text, visual, and audio components. All data are newly collected and annotated by humans, not drawn from any existing audio-visual dataset. AV-Odyssey Bench demonstrates three major features: 1. Comprehensive Audio Attributes; 2. Extensive Domains; 3. Interleaved Text, Audio, and Visual components.
"""

CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""{
}"""
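

# A minimal sketch (not part of the leaderboard code itself) of producing a
# submission file in the format described in SUBMIT_INTRODUCTION: one JSON
# dict per line with "question_id" and "prediction" keys. The `predictions`
# mapping and the output path below are hypothetical illustrations.
import json


def write_submission(predictions, path="submission.json"):
    """Write {question_id: prediction} pairs, one JSON object per line."""
    with open(path, "w") as f:
        for question_id, prediction in predictions.items():
            f.write(json.dumps({"question_id": question_id, "prediction": prediction}) + "\n")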
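

# A minimal sketch of the "All Average" computation described in
# TABLE_INTRODUCTION, assuming `correct_per_task` holds the number of
# correctly answered questions for each task, aligned with DATA_NUM (a
# hypothetical input; the real evaluation code lives elsewhere).
def compute_all_average(correct_per_task):
    """Overall accuracy (%): total correct answers / total questions."""
    assert len(correct_per_task) == len(DATA_NUM)
    return 100.0 * sum(correct_per_task) / sum(DATA_NUM)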