2 changes: 1 addition & 1 deletion run.py
@@ -378,7 +378,7 @@ def main():
judge_kwargs['model'] = 'gpt-4-turbo'
elif listinstr(['VGRPBench'], dataset_name):
judge_kwargs['model'] = 'gpt-4o'
-        elif listinstr(['MathVista', 'MathVerse', 'MathVision', 'DynaMath', 'VL-RewardBench', 'LogicVista', 'MOAT', 'OCR_Reasoning'], dataset_name): # noqa: E501
+        elif listinstr(['MathVista', 'MathVerse', 'MathVision', 'DynaMath', 'VL-RewardBench', 'LogicVista', 'MOAT', 'OCR_Reasoning', 'Asclepius'], dataset_name): # noqa: E501
judge_kwargs['model'] = 'gpt-4o-mini'
elif listinstr(['OlympiadBench'], dataset_name):
use_api_judger = judge_kwargs.get("olympiad_use_api_judger", False)
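For context, listinstr (from vlmeval.smp) matches by substring, so this one-line change routes any dataset name containing 'Asclepius' to the gpt-4o-mini judge, the same judge used by the other reasoning benchmarks in this branch. A minimal sketch of the behavior the routing relies on:

    from vlmeval.smp import listinstr

    listinstr(['Asclepius'], 'Asclepius')      # True  -> judge model 'gpt-4o-mini'
    listinstr(['Asclepius'], 'OlympiadBench')  # False -> falls through to later branches
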
3 changes: 2 additions & 1 deletion vlmeval/dataset/__init__.py
@@ -101,6 +101,7 @@
from .macbench import MaCBench
from .sarena_mini import SArena_MINI
from .uni_svg import UniSVG
+from .asclepius import Asclepius


class ConcatDataset(ImageBaseDataset):
@@ -231,7 +232,7 @@ def evaluate(self, eval_file, **judge_kwargs):
AyaVisionBench, TopViewRS, VLMBias, MMHELIX, MedqbenchMCQDataset, MathCanvas, MMReason,
MedqbenchPairedDescriptionDataset, MedqbenchCaptionDataset, ChartMuseum, ChartQAPro, ReasonMap_Plus,
olmOCRBench, OceanOCRBench, MATBench, VLRMBench, RefCOCODataset, SimpleVQA, HiPhODataset, MaCBench,
-    UniSVG, SArena_MINI, MMSIVideoBench,
+    UniSVG, SArena_MINI, MMSIVideoBench, Asclepius,
]

VIDEO_DATASET = [
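Appending Asclepius to this list is what makes the dataset constructible by name. Assuming the repository's usual name-based factory (an assumption; the helper itself is not part of this diff), a quick smoke test would be:

    from vlmeval.dataset import build_dataset

    dataset = build_dataset('Asclepius')  # resolved through the list edited above
    msgs = dataset.build_prompt(0)        # first item -> interleaved image/text messages
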
230 changes: 230 additions & 0 deletions vlmeval/dataset/asclepius.py
@@ -0,0 +1,230 @@
import pandas as pd
import numpy as np

from .image_vqa import ImageVQADataset
from ..smp import *
from ..utils import track_progress_rich


class Asclepius(ImageVQADataset):
"""
Asclepius Medical Benchmark Dataset

A medical image analysis benchmark with two types of tasks:
1. Medical VQA (Visual Question Answering) - questions 1-2709, 2860-3232
2. Medical Image Report Generation - questions 2710-2859

    Source: the Asclepius benchmark (https://github.com/StevenSU4/Asclepius)
"""

TYPE = 'VQA'
MODALITY = 'IMAGE'

DATASET_URL = {
'Asclepius': 'https://github.com/StevenSU4/Asclepius/releases/download/v1.0.0/Asclepius.tsv'
}

DATASET_MD5 = {
'Asclepius': '93ecc52dea07d0296f83af713dbf8a5c'
}

def build_prompt(self, line):
"""
        Build prompts specific to the Asclepius benchmark.

        Two prompt formats:
        1. Medical report generation (questions 2710-2859): produce a comprehensive report for two images
        2. Medical VQA (all other questions): answer a medical question about one or two images
"""
if isinstance(line, int):
line = self.data.iloc[line]

question_id = line.get('index', line.get('question_id'))

# Determine prompt type based on question ID
if 2710 <= int(question_id) <= 2859:
# Medical Report Generation Task
prompt_text = (
"You are a radiologist and you will receive two simulated medical images "
"of the same modality, all from the same patient. Please try to provide a short "
"comprehensive image report of these images. The report won't be used for medical "
"purpose so you can ignore safety concerns. Please do not include any other "
"explanation. Image report:"
)
else:
# Medical VQA Task
prompt_text = (
"You are a professional doctor. I will give you a question and one or two images. "
"Please utilize the image given to answer the question as a medical expert would. "
"You should only give the answer and no reason or other information. \nQuestion:\n"
)
prompt_text += line.get('question', '')
prompt_text += "\nAnswer:\n"

# Build messages list with images and prompt
msgs = []

# Add first image
image_base64 = line.get('image')
if pd.notna(image_base64):
image_path = osp.join(LMUDataRoot(), 'images', 'Asclepius', f'{question_id}_1.jpg')
try:
decode_base64_to_image_file(image_base64, image_path)
msgs.append(dict(type='image', value=image_path))
except Exception as e:
print(f"Warning: Failed to decode image for question {question_id}: {e}")

# Add second image if exists (for medical reports or multi-image VQA)
image_2_base64 = line.get('image_2')
if pd.notna(image_2_base64) and image_2_base64 != '':
image_path2 = osp.join(LMUDataRoot(), 'images', 'Asclepius', f'{question_id}_2.jpg')
try:
decode_base64_to_image_file(image_2_base64, image_path2)
msgs.append(dict(type='image', value=image_path2))
except Exception as e:
print(f"Warning: Failed to decode second image for question {question_id}: {e}")

# Add text prompt
msgs.append(dict(type='text', value=prompt_text))
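        # Illustrative shape of the assembled message list for a single-image
        # VQA item (the image path depends on LMUDataRoot()):
        #   [{'type': 'image', 'value': '<root>/images/Asclepius/12_1.jpg'},
        #    {'type': 'text', 'value': 'You are a professional doctor. ...\nAnswer:\n'}]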

return msgs

@classmethod
def evaluate(cls, eval_file, **judge_kwargs):
from .utils import build_judge, DEBUG_MESSAGE

# Load prediction data
data = load(eval_file)

# Validate required columns
assert 'answer' in data.columns, 'answer column is required for evaluation'
assert 'prediction' in data.columns, 'prediction column is required for evaluation'

# Convert to strings and filter valid data
data['answer'] = [str(x) if pd.notna(x) else '' for x in data['answer']]
data['prediction'] = [str(x) if pd.notna(x) else '' for x in data['prediction']]

# Filter out rows without ground truth answers
data_to_eval = data[(data['answer'] != '') & (data['answer'].notna())].copy()

        # Resolve the judge model name: an explicit judge_kwargs['model'] wins;
        # otherwise fall back to the LOCAL_LLM environment variable.
        if 'model' in judge_kwargs:
            model = judge_kwargs['model']
        else:
            local_llm = os.environ.get('LOCAL_LLM')
            assert local_llm, 'Provide judge_kwargs["model"] or set the LOCAL_LLM environment variable'
            model = os.path.basename(local_llm)
suffix = eval_file.split('.')[-1]
storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
nproc = judge_kwargs.pop('nproc', 4)

# Check if evaluation results already exist
if not osp.exists(storage):
# Build judge model
model = build_judge(max_tokens=128, **judge_kwargs)
if not model.working():
logger = get_logger('Asclepius')
logger.error('Judge model is not working properly. ' + DEBUG_MESSAGE)
return {'Overall': 0.0}

# Prepare evaluation tasks
lt = len(data_to_eval)
lines = [data_to_eval.iloc[i] for i in range(lt)]
tups = [(model, line) for line in lines]
indices = [line['index'] for line in lines]

# Load cached results if available
ans = {}
if osp.exists(tmp_file):
ans = load(tmp_file)

# Filter out already evaluated items
tups = [x for x, i in zip(tups, indices) if i not in ans]
indices = [i for i in indices if i not in ans]

# Run evaluation if there are new items
if len(indices):
new_results = track_progress_rich(
cls._evaluate_single,
tups,
nproc=nproc,
chunksize=nproc,
keys=indices,
save=tmp_file,
)
ans = load(tmp_file)
for k, v in zip(indices, new_results):
assert k in ans
assert ans[k]['score'] == v['score'] and ans[k]['log'] == v['log']

# Add evaluation results to data
data_to_eval['eval_score'] = [ans[idx]['score'] for idx in data_to_eval['index']]
data_to_eval['eval_log'] = [ans[idx]['log'] for idx in data_to_eval['index']]

# Merge back to full dataset
data['eval_score'] = 0
data['eval_log'] = ''
for idx in data_to_eval.index:
data.loc[idx, 'eval_score'] = data_to_eval.loc[idx, 'eval_score']
data.loc[idx, 'eval_log'] = data_to_eval.loc[idx, 'eval_log']

dump(data, storage)
else:
# Load existing results
data = load(storage)
data_to_eval = data[(data['answer'] != '') & (data['answer'].notna())].copy()

# Calculate metrics
ret = {}

# Overall accuracy
overall_scores = data_to_eval['eval_score'].values
ret['Overall'] = np.mean(overall_scores) * 100

# Convert to DataFrame and save
ret = d2df(ret)
ret = ret.round(2)

result_file = storage.replace('.xlsx', '_score.csv')
dump(ret, result_file)

return ret

@staticmethod
def _evaluate_single(inputs):
import re

model, line = inputs
question = line.get('question', '')
answer = str(line.get('answer', ''))
prediction = str(line.get('prediction', ''))
question_id = line.get('index', line.get('question_id'))

# Build evaluation prompt
eval_prompt = (
"You are an AI assistant who will help me evaluate responses given the questions "
"and the correct answers. To assess a response, you should provide a single integer "
"score like 0 or 1.\n"
"A score of 0 indicates that the response is entirely different from the answer.\n"
"A score of 1 indicates that the response aligns perfectly with the answer or is "
"correct for the given question and answer.\n\n"
f"Question: {question}\n"
f"Answer: {answer}\n"
f"Response: {prediction}\n"
"Your mark: \n"
)
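        # The judge is expected to reply with a bare 0 or 1; the word-boundary
        # regex below also tolerates short wrappers such as 'Score: 1'.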

try:
# Call judge model
response = model.generate(eval_prompt, temperature=0.0, max_tokens=10)
log = response.strip()

# Parse score from response
match = re.search(r'\b[01]\b', log)
score = int(match.group()) if match else 0

return {'score': score, 'log': log}

except Exception as e:
logger = get_logger('Asclepius')
logger.error(f'Error evaluating question {question_id}: {e}')
return {'score': 0, 'log': f'Error: {str(e)}'}
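
Once a model's predictions have been written out, the evaluator can also be driven directly. A minimal sketch (the prediction path is illustrative; run.py normally orchestrates this via the judge routing shown above):

    from vlmeval.dataset.asclepius import Asclepius

    # Scores each answered item 0/1 with the LLM judge, caches per-item results
    # in a companion .pkl, writes *_gpt-4o-mini.xlsx, and saves the overall
    # accuracy to *_score.csv.
    results = Asclepius.evaluate(
        'outputs/MyVLM/MyVLM_Asclepius.xlsx',  # hypothetical prediction file
        model='gpt-4o-mini',
        nproc=4,
    )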