# flake8: noqa

from copy import deepcopy

from mmengine.config import read_base

from opencompass.models import OpenAISDKStreaming
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks import OpenICLEvalTask, OpenICLInferTask
from opencompass.utils.text_postprocessors import extract_non_reasoning_content

#######################################################################
#                      PART 0  Essential Configs                      #
#######################################################################
with read_base():
    # Datasets

    from opencompass.configs.datasets.mmlu_pro.mmlu_pro_0shot_nocot_genericllmeval_gen_08c1de import (
        mmlu_pro_datasets,
    )
    from opencompass.configs.datasets.gpqa.gpqa_cascade_eval_gen_772ea0 import (
        gpqa_datasets,
    )
    from opencompass.configs.datasets.aime2025.aime2025_cascade_eval_gen_5e9f4f import (
        aime2025_datasets,
    )
    from opencompass.configs.chatml_datasets.IMO_Bench_AnswerBench.IMO_Bench_AnswerBench_gen import (
        datasets as IMO_Bench_AnswerBench_chatml,
    )
    from opencompass.configs.datasets.IFBench.IFBench_gen import (
        ifbench_datasets,
    )
    from opencompass.configs.datasets.livecodebench.livecodebench_gen_a4f90b import (
        LCBCodeGeneration_dataset,
    )
    from opencompass.configs.datasets.SmolInstruct.smolinstruct_0shot_instruct_gen import (
        smolinstruct_datasets_0shot_instruct as smolinstruct_datasets,
    )
    from opencompass.configs.datasets.matbench.matbench_llm_judge_gen_0e9276 import (
        matbench_datasets,
    )
    from opencompass.configs.datasets.biodata.biodata_task_gen import (
        biodata_task_datasets,
    )
    from opencompass.configs.datasets.MolInstructions_chem.mol_instructions_chem_gen import (
        mol_gen_selfies_datasets,
    )

    # Summary Groups
    from opencompass.configs.summarizers.groups.mmlu_pro import (
        mmlu_pro_summary_groups,
    )
    from opencompass.configs.summarizers.groups.biodata import (
        biodata_summary_groups,
    )

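# Clone the LiveCodeBench code-generation dataset and pin both the dataset
# and its evaluator to the v6 release, so inference and scoring use the same
# split.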
LCBCodeGeneration_v6_datasets = deepcopy(LCBCodeGeneration_dataset)
LCBCodeGeneration_v6_datasets['abbr'] = 'lcb_code_generation_v6'
LCBCodeGeneration_v6_datasets['release_version'] = 'v6'
LCBCodeGeneration_v6_datasets['eval_cfg']['evaluator']['release_version'] = 'v6'
LCBCodeGeneration_v6_datasets = [LCBCodeGeneration_v6_datasets]

#######################################################################
#                        PART 1  Datasets List                        #
#######################################################################
# Datasets list for evaluation.

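# Each (dataset list, n) pair below is sampled n times per question; setting
# k = n makes the evaluator report the n-run average (matching the
# "(8/32 runs average)" metrics referenced in the summarizer further down).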
repeated_info = [
    (gpqa_datasets, 8),
    (aime2025_datasets, 32),
]

for datasets_, num in repeated_info:
    for dataset_ in datasets_:
        dataset_['n'] = num
        dataset_['k'] = num

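# Gather every `*_datasets` list defined above (including the LiveCodeBench v6
# clone) into one flat list; `*_chatml` lists are collected separately.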
datasets = sum(
    (v for k, v in locals().items() if k.endswith('_datasets')),
    [],
)

chatml_datasets = sum(
    (v for k, v in locals().items() if k.endswith('_chatml')),
    [],
)

# LLM judge config: use an LLM to grade predictions.
judge_cfg = dict(
    abbr='YOUR_JUDGE_MODEL',
    type=OpenAISDKStreaming,
    path='YOUR_JUDGE_MODEL',
    key='YOUR_JUDGE_KEY',
    openai_api_base='YOUR_JUDGE_URL',
    mode='mid',  # truncate the middle of over-long prompts
    meta_template=dict(
        round=[
            dict(role='HUMAN', api_role='HUMAN'),
            dict(role='BOT', api_role='BOT', generate=True),
        ]
    ),
    query_per_second=16,
    batch_size=64,
    temperature=0.001,  # near-greedy decoding for stable judgments
    max_out_len=8192,
    max_seq_len=32768,
)

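# Propagate the judge model into every evaluator that expects one: either a
# direct `judge_cfg` on the evaluator itself, or the inner `llm_evaluator` of
# a cascade evaluator.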
for item in datasets:
    evaluator = item['eval_cfg']['evaluator']
    if 'judge_cfg' in evaluator:
        evaluator['judge_cfg'] = judge_cfg
    if 'judge_cfg' in evaluator.get('llm_evaluator', {}):
        evaluator['llm_evaluator']['judge_cfg'] = judge_cfg

for item in chatml_datasets:
    if item['evaluator']['type'] == 'llm_evaluator':
        item['evaluator']['judge_cfg'] = judge_cfg
    if item['evaluator']['type'] == 'cascade_evaluator':
        item['evaluator']['llm_evaluator']['judge_cfg'] = judge_cfg


#######################################################################
#                      PART 2  Dataset Summarizer                     #
#######################################################################

summary_groups = sum(
    (v for k, v in locals().items() if k.endswith('_summary_groups')),
    [],
)

summarizer = dict(
    dataset_abbrs=[
        ['mmlu_pro', 'accuracy'],
        ['IFBench', 'score'],
        ['GPQA_diamond', 'accuracy (8 runs average)'],
        ['aime2025', 'accuracy (32 runs average)'],
        ['lcb_code_generation_v6', 'pass@1'],
        ['bio_data', 'naive_average'],
        ['IMO-Bench-AnswerBench', 'accuracy'],
        '',
        'Mol_Instruct',
        ['FS-selfies', 'score'],
        ['MC-selfies', 'score'],
        ['MG-selfies', 'score'],
        ['PP-selfies', 'score'],
        ['RP-selfies', 'score'],
        ['RS-selfies', 'score'],
        '',
        'SmolInstruct',
        ['NC-I2F-0shot-instruct', 'score'],
        ['NC-I2S-0shot-instruct', 'score'],
        ['NC-S2F-0shot-instruct', 'score'],
        ['NC-S2I-0shot-instruct', 'score'],
        ['PP-ESOL-0shot-instruct', 'score'],
        ['PP-Lipo-0shot-instruct', 'score'],
        ['PP-BBBP-0shot-instruct', 'accuracy'],
        ['PP-ClinTox-0shot-instruct', 'accuracy'],
        ['PP-HIV-0shot-instruct', 'accuracy'],
        ['PP-SIDER-0shot-instruct', 'accuracy'],
        ['MC-0shot-instruct', 'score'],
        ['MG-0shot-instruct', 'score'],
        ['FS-0shot-instruct', 'score'],
        ['RS-0shot-instruct', 'score'],
        '',
        ['matbench_expt_gap', 'mae'],
        ['matbench_steels', 'mae'],
        ['matbench_expt_is_metal', 'accuracy'],
        ['matbench_glass', 'accuracy'],
    ],
    summary_groups=summary_groups,
)


#######################################################################
#                           PART 3  Models                            #
#######################################################################

api_meta_template = dict(
    round=[
        # The SYSTEM role is only needed when evaluating Bio_data and
        # Mol_Instructions.
        dict(role='SYSTEM', api_role='SYSTEM'),
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ]
)

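# The model under evaluation, served through an OpenAI-compatible streaming
# endpoint; `extract_non_reasoning_content` strips the reasoning trace from
# each prediction before it is scored.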
models = [
    dict(
        abbr='intern-s1-pro',
        type=OpenAISDKStreaming,
        path='intern-s1-pro',
        key='YOUR_API_KEY',
        openai_api_base='YOUR_API_BASE',
        meta_template=api_meta_template,
        query_per_second=16,
        batch_size=8,
        temperature=0.8,
        retry=10,
        max_out_len=65536,
        max_seq_len=65536,
        extra_body={
            # Set to False to disable thinking when evaluating scientific
            # benchmarks.
            'chat_template_kwargs': {'enable_thinking': True}
        },
        pred_postprocessor=dict(
            type=extract_non_reasoning_content,
        ),
    ),
]

#######################################################################
#              PART 4  Inference/Evaluation Configuration             #
#######################################################################

# Inference with the local runner: NumWorkerPartitioner splits each dataset
# into `num_worker` shards so they can be inferred in parallel.
infer = dict(
    partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
    runner=dict(
        type=LocalRunner,
        max_num_workers=16,
        task=dict(type=OpenICLInferTask),
    ),
)

# Evaluation with the local runner: NaivePartitioner groups up to `n` dataset
# evaluations into a single task.
eval = dict(
    partitioner=dict(type=NaivePartitioner, n=10),
    runner=dict(
        type=LocalRunner,
        max_num_workers=16,
        task=dict(type=OpenICLEvalTask),
    ),
)

#######################################################################
#                      PART 5  Utils Configuration                    #
#######################################################################

work_dir = './outputs/oc_intern_s1_pro_eval'
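
# Example launch (assuming the standard OpenCompass CLI entry point):
#   opencompass /path/to/this_config.py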