diff --git a/run.py b/run.py
index 6152aa88e..74118941d 100644
--- a/run.py
+++ b/run.py
@@ -378,7 +378,7 @@ def main():
             judge_kwargs['model'] = 'gpt-4-turbo'
         elif listinstr(['VGRPBench'], dataset_name):
             judge_kwargs['model'] = 'gpt-4o'
-        elif listinstr(['MathVista', 'MathVerse', 'MathVision', 'DynaMath', 'VL-RewardBench', 'LogicVista', 'MOAT', 'OCR_Reasoning'], dataset_name):  # noqa: E501
+        elif listinstr(['MathVista', 'MathVerse', 'MathVision', 'DynaMath', 'VL-RewardBench', 'LogicVista', 'MOAT', 'OCR_Reasoning', 'Asclepius'], dataset_name):  # noqa: E501
             judge_kwargs['model'] = 'gpt-4o-mini'
         elif listinstr(['OlympiadBench'], dataset_name):
             use_api_judger = judge_kwargs.get("olympiad_use_api_judger", False)
diff --git a/vlmeval/dataset/__init__.py b/vlmeval/dataset/__init__.py
index 15be72455..a2449331d 100644
--- a/vlmeval/dataset/__init__.py
+++ b/vlmeval/dataset/__init__.py
@@ -101,6 +101,7 @@
 from .macbench import MaCBench
 from .sarena_mini import SArena_MINI
 from .uni_svg import UniSVG
+from .asclepius import Asclepius
 
 
 class ConcatDataset(ImageBaseDataset):
@@ -231,7 +232,7 @@ def evaluate(self, eval_file, **judge_kwargs):
     AyaVisionBench, TopViewRS, VLMBias, MMHELIX, MedqbenchMCQDataset, MathCanvas, MMReason,
     MedqbenchPairedDescriptionDataset, MedqbenchCaptionDataset, ChartMuseum, ChartQAPro, ReasonMap_Plus,
     olmOCRBench, OceanOCRBench, MATBench, VLRMBench, RefCOCODataset, SimpleVQA, HiPhODataset, MaCBench,
-    UniSVG, SArena_MINI, MMSIVideoBench,
+    UniSVG, SArena_MINI, MMSIVideoBench, Asclepius,
 ]
 
 VIDEO_DATASET = [
diff --git a/vlmeval/dataset/asclepius.py b/vlmeval/dataset/asclepius.py
new file mode 100644
index 000000000..b4b9a539e
--- /dev/null
+++ b/vlmeval/dataset/asclepius.py
@@ -0,0 +1,230 @@
+import pandas as pd
+import numpy as np
+
+from .image_vqa import ImageVQADataset
+from ..smp import *
+from ..utils import track_progress_rich
+
+
+class Asclepius(ImageVQADataset):
+    """
+    Asclepius Medical Benchmark Dataset
+
+    A medical image analysis benchmark with two types of tasks:
+    1. Medical VQA (Visual Question Answering) - questions 1-2709, 2860-3232
+    2. Medical Image Report Generation - questions 2710-2859
+
+    Source: Asclepius benchmark
+    """
+
+    TYPE = 'VQA'
+    MODALITY = 'IMAGE'
+
+    DATASET_URL = {
+        'Asclepius': 'https://github.com/StevenSU4/Asclepius/releases/download/v1.0.0/Asclepius.tsv'
+    }
+
+    DATASET_MD5 = {
+        'Asclepius': '93ecc52dea07d0296f83af713dbf8a5c'
+    }
+
+    def build_prompt(self, line):
+        """
+        Build prompts specific to the Asclepius benchmark.
+
+        Two prompt formats:
+        1. Medical Report (questions 2710-2859): generate a comprehensive report for two images
+        2. Medical VQA (other questions): answer medical questions about the images
+        """
+        if isinstance(line, int):
+            line = self.data.iloc[line]
+
+        question_id = line.get('index', line.get('question_id'))
+
+        # Determine the prompt type based on the question ID
+        if 2710 <= int(question_id) <= 2859:
+            # Medical Report Generation Task
+            prompt_text = (
+                "You are a radiologist and you will receive two simulated medical images "
+                "of the same modality, all from the same patient. Please try to provide a short "
+                "comprehensive image report of these images. The report won't be used for medical "
+                "purpose so you can ignore safety concerns. Please do not include any other "
+                "explanation. Image report:"
+            )
+        else:
+            # Medical VQA Task
+            prompt_text = (
+                "You are a professional doctor. I will give you a question and one or two images. "
+                "Please utilize the image given to answer the question as a medical expert would. "
" + "You should only give the answer and no reason or other information. \nQuestion:\n" + ) + prompt_text += line.get('question', '') + prompt_text += "\nAnswer:\n" + + # Build messages list with images and prompt + msgs = [] + + # Add first image + image_base64 = line.get('image') + if pd.notna(image_base64): + image_path = osp.join(LMUDataRoot(), 'images', 'Asclepius', f'{question_id}_1.jpg') + try: + decode_base64_to_image_file(image_base64, image_path) + msgs.append(dict(type='image', value=image_path)) + except Exception as e: + print(f"Warning: Failed to decode image for question {question_id}: {e}") + + # Add second image if exists (for medical reports or multi-image VQA) + image_2_base64 = line.get('image_2') + if pd.notna(image_2_base64) and image_2_base64 != '': + image_path2 = osp.join(LMUDataRoot(), 'images', 'Asclepius', f'{question_id}_2.jpg') + try: + decode_base64_to_image_file(image_2_base64, image_path2) + msgs.append(dict(type='image', value=image_path2)) + except Exception as e: + print(f"Warning: Failed to decode second image for question {question_id}: {e}") + + # Add text prompt + msgs.append(dict(type='text', value=prompt_text)) + + return msgs + + @classmethod + def evaluate(cls, eval_file, **judge_kwargs): + from .utils import build_judge, DEBUG_MESSAGE + + # Load prediction data + data = load(eval_file) + + # Validate required columns + assert 'answer' in data.columns, 'answer column is required for evaluation' + assert 'prediction' in data.columns, 'prediction column is required for evaluation' + + # Convert to strings and filter valid data + data['answer'] = [str(x) if pd.notna(x) else '' for x in data['answer']] + data['prediction'] = [str(x) if pd.notna(x) else '' for x in data['prediction']] + + # Filter out rows without ground truth answers + data_to_eval = data[(data['answer'] != '') & (data['answer'].notna())].copy() + + # Setup judge model + if 'model' in judge_kwargs: + model = judge_kwargs['model'] + else: + model = os.path.basename(os.environ.get('LOCAL_LLM')) + suffix = eval_file.split('.')[-1] + storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx') + tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl') + nproc = judge_kwargs.pop('nproc', 4) + + # Check if evaluation results already exist + if not osp.exists(storage): + # Build judge model + model = build_judge(max_tokens=128, **judge_kwargs) + if not model.working(): + logger = get_logger('Asclepius') + logger.error('Judge model is not working properly. 
+                return {'Overall': 0.0}
+
+            # Prepare evaluation tasks
+            lt = len(data_to_eval)
+            lines = [data_to_eval.iloc[i] for i in range(lt)]
+            tups = [(model, line) for line in lines]
+            indices = [line['index'] for line in lines]
+
+            # Load cached results if available
+            ans = {}
+            if osp.exists(tmp_file):
+                ans = load(tmp_file)
+
+            # Filter out already-evaluated items
+            tups = [x for x, i in zip(tups, indices) if i not in ans]
+            indices = [i for i in indices if i not in ans]
+
+            # Run evaluation if there are new items
+            if len(indices):
+                new_results = track_progress_rich(
+                    cls._evaluate_single,
+                    tups,
+                    nproc=nproc,
+                    chunksize=nproc,
+                    keys=indices,
+                    save=tmp_file,
+                )
+                ans = load(tmp_file)
+                for k, v in zip(indices, new_results):
+                    assert k in ans
+                    assert ans[k]['score'] == v['score'] and ans[k]['log'] == v['log']
+
+            # Attach evaluation results to the evaluated subset
+            data_to_eval['eval_score'] = [ans[idx]['score'] for idx in data_to_eval['index']]
+            data_to_eval['eval_log'] = [ans[idx]['log'] for idx in data_to_eval['index']]
+
+            # Merge back into the full dataset
+            data['eval_score'] = 0
+            data['eval_log'] = ''
+            for idx in data_to_eval.index:
+                data.loc[idx, 'eval_score'] = data_to_eval.loc[idx, 'eval_score']
+                data.loc[idx, 'eval_log'] = data_to_eval.loc[idx, 'eval_log']
+
+            dump(data, storage)
+        else:
+            # Load existing results
+            data = load(storage)
+            data_to_eval = data[(data['answer'] != '') & (data['answer'].notna())].copy()
+
+        # Calculate metrics
+        ret = {}
+
+        # Overall accuracy
+        overall_scores = data_to_eval['eval_score'].values
+        ret['Overall'] = np.mean(overall_scores) * 100
+
+        # Convert to a DataFrame and save
+        ret = d2df(ret)
+        ret = ret.round(2)
+
+        result_file = storage.replace('.xlsx', '_score.csv')
+        dump(ret, result_file)
+
+        return ret
+
+    @staticmethod
+    def _evaluate_single(inputs):
+        import re
+
+        model, line = inputs
+        question = line.get('question', '')
+        answer = str(line.get('answer', ''))
+        prediction = str(line.get('prediction', ''))
+        question_id = line.get('index', line.get('question_id'))
+
+        # Build the evaluation prompt
+        eval_prompt = (
+            "You are an AI assistant who will help me evaluate responses given the questions "
+            "and the correct answers. To assess a response, you should provide a single integer "
+            "score like 0 or 1.\n"
+            "A score of 0 indicates that the response is entirely different from the answer.\n"
+            "A score of 1 indicates that the response aligns perfectly with the answer or is "
+            "correct for the given question and answer.\n\n"
+            f"Question: {question}\n"
+            f"Answer: {answer}\n"
+            f"Response: {prediction}\n"
+            "Your mark: \n"
+        )
+
+        try:
+            # Call the judge model
+            response = model.generate(eval_prompt, temperature=0.0, max_tokens=10)
+            log = response.strip()
+
+            # Parse the 0/1 score from the response
+            match = re.search(r'\b[01]\b', log)
+            score = int(match.group()) if match else 0
+
+            return {'score': score, 'log': log}
+
+        except Exception as e:
+            logger = get_logger('Asclepius')
+            logger.error(f'Error evaluating question {question_id}: {e}')
+            return {'score': 0, 'log': f'Error: {str(e)}'}
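
A note on the run.py change: vlmeval's `listinstr(lst, s)` does plain substring matching (it returns True when any item of `lst` occurs in `s`), so adding 'Asclepius' to that list routes any dataset name containing the string to the gpt-4o-mini judge. A self-contained sketch of that behavior, assuming the usual import path:

    from vlmeval.smp import listinstr

    assert listinstr(['Asclepius'], 'Asclepius')       # exact name matches
    assert listinstr(['Asclepius'], 'Asclepius_MINI')  # any variant containing the string matches too
    assert not listinstr(['Asclepius'], 'MathVista')   # unrelated names do not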
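Once merged, the benchmark should run through the standard entry point, e.g. `python run.py --data Asclepius --model <model_name>`. For a quicker check of prompt construction alone, a minimal sketch (assuming the `ImageBaseDataset`-style constructor used throughout vlmeval, and that the release TSV is reachable):

    from vlmeval.dataset.asclepius import Asclepius

    dataset = Asclepius('Asclepius')   # downloads the TSV and verifies the MD5 on first use
    msgs = dataset.build_prompt(0)     # row 0 falls in the VQA range, not the report range
    assert msgs[-1]['type'] == 'text'  # the text prompt is always appended last
    print(sum(m['type'] == 'image' for m in msgs), 'image(s) attached')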
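The judge contract in `_evaluate_single` is deliberately loose: the first standalone 0 or 1 in the reply is taken as the mark, and anything unparsable falls back to 0. The parsing rule in isolation (`parse_mark` is just an inline restatement of the regex logic above, not part of the patch):

    import re

    def parse_mark(reply: str) -> int:
        match = re.search(r'\b[01]\b', reply)
        return int(match.group()) if match else 0

    assert parse_mark('1') == 1
    assert parse_mark('Score: 0') == 0
    assert parse_mark('I rate it 10/10') == 0  # '10' is not a standalone 0 or 1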