# This config is used to test all the code benchmarks
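#
# Usage sketch (the exact entry point depends on your OpenCompass install):
#   opencompass path/to/this_config.py
# or, from a source checkout:
#   python run.py path/to/this_config.py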
from mmengine.config import read_base
from opencompass.runners import LocalRunner, VOLCRunner
from opencompass.partitioners import NaivePartitioner, NumWorkerPartitioner
from opencompass.tasks import OpenICLInferTask, OpenICLEvalTask

with read_base():
    # Datasets Part
    # bigcodebench
    from opencompass.configs.datasets.bigcodebench.bigcodebench_full_instruct_gen import (
        bigcodebench_full_instruct_datasets
    )
    from opencompass.configs.datasets.bigcodebench.bigcodebench_hard_instruct_gen import (
        bigcodebench_hard_instruct_datasets
    )
    # livecodebench code generation lite v5
    from opencompass.configs.datasets.livecodebench.livecodebench_time_split_gen_a4f90b import (
        LCB_datasets
    )
    # humaneval series
    from opencompass.configs.datasets.humaneval.humaneval_openai_sample_evals_gen_dcae0e import (
        humaneval_datasets
    )
    from opencompass.configs.datasets.humaneval_pro.humaneval_pro_gen import (
        humanevalpro_datasets
    )
    from opencompass.configs.datasets.humanevalx.humanevalx_gen_620cfa import (
        humanevalx_datasets
    )
    from opencompass.configs.datasets.humaneval_plus.humaneval_plus_gen import (
        humaneval_plus_datasets
    )
    # mbpp series
    from opencompass.configs.datasets.mbpp.mbpp_gen import (
        mbpp_datasets
    )
    from opencompass.configs.datasets.mbpp_pro.mbpp_pro_gen import (
        mbpppro_datasets
    )
    # multipl-e
    from opencompass.configs.datasets.multipl_e.multiple_gen import (
        multiple_datasets
    )
    # ds1000
    from opencompass.configs.datasets.ds1000.ds1000_service_eval_gen_cbc84f import (
        ds1000_datasets
    )

    # Models Part
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_7b_instruct import (
        models as lmdeploy_qwen2_5_7b_instruct_model,
    )

    # Summary Groups
    from opencompass.configs.summarizers.groups.ds1000 import (
        ds1000_summary_groups,
    )
    from opencompass.configs.summarizers.groups.multipl_e import (
        multiple_summary_groups,
    )
    from opencompass.configs.summarizers.groups.humanevalx import (
        humanevalx_summary_groups,
    )

# models config: collect every `*_model` list pulled in above; `read_base()`
# injects those imports into this module, so `locals()` can see them.
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])

# Widen the context window and generation budget so long code completions
# are not truncated.
for model in models:
    model['max_seq_len'] = 16384
    model['max_out_len'] = 8192

# datasets config: same pattern, collecting every `*_datasets` list.
datasets = sum([v for k, v in locals().items() if k.endswith('_datasets')], [])
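# Optional filtering sketch: to run only a subset, trim the merged list here
# instead of editing the imports. This assumes each dataset dict carries an
# 'abbr' key, which OpenCompass dataset configs define, e.g.:
#   datasets = [d for d in datasets if 'humaneval' in d['abbr']]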

# humanevalx and ds1000 are scored by a remote code-evaluation service; point
# their evaluators at it (the port is folded into the URL path, so it stays empty).
for item in humanevalx_datasets:
    item['eval_cfg']['evaluator'][
        'ip_address'
    ] = 'codeeval.opencompass.org.cn/humanevalx'
    item['eval_cfg']['evaluator']['port'] = ''

for item in ds1000_datasets:
    item['eval_cfg']['evaluator'][
        'ip_address'
    ] = 'codeeval.opencompass.org.cn/ds1000'
    item['eval_cfg']['evaluator']['port'] = ''


# Match every dataset's inferencer to the generation budget set on the models.
for dataset in datasets:
    dataset['infer_cfg']['inferencer']['max_out_len'] = 8192
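
# Scheduling sketch: pin inference/evaluation scheduling explicitly rather than
# relying on OpenCompass defaults. The worker counts below are assumptions;
# tune them for your hardware. Swap LocalRunner for the imported VOLCRunner to
# dispatch jobs on Volcano Engine (which additionally needs volcano config paths).
infer = dict(
    partitioner=dict(type=NumWorkerPartitioner, num_worker=8),
    runner=dict(
        type=LocalRunner,
        max_num_workers=16,
        task=dict(type=OpenICLInferTask),
    ),
)
eval = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=LocalRunner,
        max_num_workers=16,
        task=dict(type=OpenICLEvalTask),
    ),
)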


# summary
summary_groups = sum(
    [v for k, v in locals().items() if k.endswith('_summary_groups')], []
)
summary_groups.append({
    'name': 'humanevalx',
    'subsets': ['humanevalx-python', 'humanevalx-cpp', 'humanevalx-java', 'humanevalx-js'],
})
summarizer = dict(
    # '' entries render as blank separator rows in the summary table.
    dataset_abbrs=[
        ['bigcodebench_hard_instruct', 'pass@1'],
        ['bigcodebench_full_instruct', 'pass@1'],
        ['lcb_code_generation', 'pass@1'],
        ['openai_humaneval', 'humaneval_pass@1'],
        ['mbpp', 'score'],
        ['humaneval_pro', 'pass@1'],
        ['mbpp_pro', 'pass@1'],
        ['humaneval_plus', 'humaneval_plus_pass@1'],
        ['multiple', 'naive_average'],
        ['humanevalx', 'naive_average'],
        ['ds1000', 'naive_average'],
        '',
        'humanevalx-python',
        'humanevalx-cpp',
        'humanevalx-java',
        'humanevalx-js',
        '',
        'ds1000_Pandas',
        'ds1000_Numpy',
        'ds1000_Tensorflow',
        'ds1000_Scipy',
        'ds1000_Sklearn',
        'ds1000_Pytorch',
        'ds1000_Matplotlib',
        '',
        'humaneval-multiple-cpp',
        'humaneval-multiple-cs',
        'humaneval-multiple-go',
        'humaneval-multiple-java',
        'humaneval-multiple-rb',
        'humaneval-multiple-js',
        'humaneval-multiple-php',
        'humaneval-multiple-r',
        'humaneval-multiple-rs',
        'humaneval-multiple-sh',
        '',
        'mbpp-multiple-cpp',
        'mbpp-multiple-cs',
        'mbpp-multiple-go',
        'mbpp-multiple-java',
        'mbpp-multiple-rb',
        'mbpp-multiple-js',
        'mbpp-multiple-php',
        'mbpp-multiple-r',
        'mbpp-multiple-rs',
        'mbpp-multiple-sh',
    ],
    summary_groups=summary_groups,
)

work_dir = 'outputs/code'