Skip to content

Commit 5bcb664

Browse files
authored
Merge pull request #21 from modelscope/dev/shuchang_newjudge
feat: integrate FinWorldJudge with OpenJudge support & add project blogs
2 parents 9881a5e + 88abfc1 commit 5bcb664

26 files changed

Lines changed: 1945 additions & 3569 deletions

ajet/utils/metric_helper/reward_metric_helper.py

Lines changed: 50 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,16 @@
22
deep_finance Reward Metrics Helper
33
44
Provides standalone utility functions for reward_stats extraction and SwanLab metrics formatting.
5-
Decouples deep_finance-specific logic from core code, reducing intrusion into native_compat_trainer.
5+
6+
Data sources:
7+
1. Finance Evaluator (finance_raw, finance_contribution)
8+
2. OpenJudge Graders (openjudge_xxx_raw, openjudge_xxx_contribution)
69
710
SwanLab metrics directory structure:
811
- rewards/ Top-level aggregated scores
9-
- rewards/dimensions/ Raw scores (unweighted)
10-
- rewards/contribution/ Weighted contributions
12+
- rewards/dimensions/ Raw scores (unweighted): finance_raw, openjudge_*_raw
13+
- rewards/contribution/ Weighted contributions: finance_contribution, openjudge_*_contribution
14+
- rewards/openjudge/ OpenJudge grader-specific metrics
1115
- judge_time/ Judge time consumption statistics
1216
"""
1317

@@ -41,9 +45,9 @@ def compute_reward_metrics(reward_stats_list: List[Dict[str, Any]], prefix: str
4145
"""
4246
Compute SwanLab metrics from reward_stats list.
4347
44-
Supports two data sources:
45-
1. RM Gallery RewardStats fields (rm_raw, etc.)
46-
2. OpenJudge fields (openjudge_xxx_raw, openjudge_xxx_contribution, etc.)
48+
Data sources:
49+
1. Finance Evaluator (finance_raw, finance_contribution)
50+
2. OpenJudge Graders (openjudge_xxx_raw, openjudge_xxx_contribution)
4751
4852
Args:
4953
reward_stats_list: List of reward_stats dictionaries
@@ -72,61 +76,46 @@ def compute_reward_metrics(reward_stats_list: List[Dict[str, Any]], prefix: str
7276
metrics[f"{prefix}rewards/fused_reward_mean"] = float(np.mean(fused_reward_list))
7377
metrics[f"{prefix}rewards/penalty_mean"] = float(np.mean(penalty_list))
7478
metrics[f"{prefix}rewards/step_reward_mean"] = float(np.mean(step_reward_list))
75-
metrics[f"{prefix}rewards/penalty_count"] = len(non_zero_penalties)
76-
metrics[f"{prefix}rewards/penalty_rate"] = len(non_zero_penalties) / n * 100 if n > 0 else 0.0
77-
78-
# ========== OpenJudge Metrics (PresentationQualityGrader, GroundingGrader) ==========
79-
openjudge_enabled_count = sum(1 for rs in reward_stats_list if rs.get('openjudge_enabled', False))
80-
81-
if openjudge_enabled_count > 0:
82-
# OpenJudge graders: presentation_quality, grounding
83-
openjudge_graders = [
84-
"presentation_quality",
85-
"grounding",
86-
"planning",
87-
"audit",
88-
"traceability",
89-
"cgcv"
90-
]
91-
92-
for grader_name in openjudge_graders:
93-
raw_key = f"openjudge_{grader_name}_raw"
94-
contrib_key = f"openjudge_{grader_name}_contribution"
95-
96-
raw_list = [rs.get(raw_key, 0.0) for rs in reward_stats_list]
97-
contrib_list = [rs.get(contrib_key, 0.0) for rs in reward_stats_list]
98-
99-
# Only report when non-zero values exist
100-
if any(v != 0.0 for v in raw_list):
101-
metrics[f"{prefix}rewards/openjudge/{grader_name}_raw_mean"] = float(np.mean(raw_list))
102-
if any(v != 0.0 for v in contrib_list):
103-
metrics[f"{prefix}rewards/openjudge/{grader_name}_contribution_mean"] = float(np.mean(contrib_list))
104-
105-
# OpenJudge time consumption statistics
106-
grading_time_list = [rs.get('grading_time', 0.0) for rs in reward_stats_list]
107-
if any(v != 0.0 for v in grading_time_list):
108-
metrics[f"{prefix}judge_time/openjudge_grading_time_mean"] = float(np.mean(grading_time_list))
109-
metrics[f"{prefix}judge_time/openjudge_grading_time_max"] = float(np.max(grading_time_list))
110-
111-
# ========== RM Gallery Metrics ==========
112-
113-
# RM Gallery
114-
rm_raw_list = [rs.get('rm_raw', 0.0) for rs in reward_stats_list]
115-
rm_contribution_list = [rs.get('rm_contribution', 0.0) for rs in reward_stats_list]
116-
117-
# dimensions/ raw scores
118-
metrics[f"{prefix}rewards/dimensions/rm_raw_mean"] = float(np.mean(rm_raw_list))
119-
120-
# contribution/ weighted contributions
121-
metrics[f"{prefix}rewards/contribution/rm_contribution_mean"] = float(np.mean(rm_contribution_list))
122-
123-
124-
# Time consumption statistics
125-
rm_time_list = [rs.get('rm_time', 0.0) for rs in reward_stats_list]
126-
metrics[f"{prefix}judge_time/rm_time_mean"] = float(np.mean(rm_time_list))
127-
128-
if rm_time_list:
129-
metrics[f"{prefix}judge_time/rm_time_max"] = float(np.max(rm_time_list))
79+
metrics[f"{prefix}rewards/penalty_count"] = float(len(non_zero_penalties))
80+
metrics[f"{prefix}rewards/penalty_rate"] = float(len(non_zero_penalties) / n * 100) if n > 0 else 0.0
81+
82+
# ========== OpenJudge Metrics ==========
83+
# OpenJudge graders: presentation_quality, grounding, planning, audit
84+
openjudge_graders = [
85+
"presentation_quality",
86+
"grounding",
87+
"planning",
88+
"audit",
89+
]
90+
91+
for grader_name in openjudge_graders:
92+
raw_key = f"openjudge_{grader_name}_raw"
93+
contrib_key = f"openjudge_{grader_name}_contribution"
94+
95+
raw_list = [rs.get(raw_key, 0.0) for rs in reward_stats_list]
96+
contrib_list = [rs.get(contrib_key, 0.0) for rs in reward_stats_list]
97+
98+
# Only report when non-zero values exist
99+
if any(v != 0.0 for v in raw_list):
100+
metrics[f"{prefix}rewards/openjudge/{grader_name}_raw_mean"] = float(np.mean(raw_list))
101+
if any(v != 0.0 for v in contrib_list):
102+
metrics[f"{prefix}rewards/openjudge/{grader_name}_contribution_mean"] = float(np.mean(contrib_list))
103+
104+
# OpenJudge time consumption statistics
105+
grading_time_list = [rs.get('grading_time', 0.0) for rs in reward_stats_list]
106+
if any(v != 0.0 for v in grading_time_list):
107+
metrics[f"{prefix}judge_time/openjudge_grading_time_mean"] = float(np.mean(grading_time_list))
108+
metrics[f"{prefix}judge_time/openjudge_grading_time_max"] = float(np.max(grading_time_list))
109+
110+
# ========== Finance Evaluator Metrics ==========
111+
finance_raw_list = [rs.get('finance_raw', 0.0) for rs in reward_stats_list]
112+
finance_contribution_list = [rs.get('finance_contribution', 0.0) for rs in reward_stats_list]
113+
114+
if any(v != 0.0 for v in finance_raw_list):
115+
metrics[f"{prefix}rewards/dimensions/finance_raw_mean"] = float(np.mean(finance_raw_list))
116+
117+
if any(v != 0.0 for v in finance_contribution_list):
118+
metrics[f"{prefix}rewards/contribution/finance_contribution_mean"] = float(np.mean(finance_contribution_list))
130119

131120
# ========== General Time Consumption Statistics ==========
132121
judge_total_time_list = [rs.get('judge_total_time', 0.0) for rs in reward_stats_list]
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
# API keys
2+
OPENAI_API_KEY="sk-xxx"
3+
OPENAI_BASE_URL="https://dashscope.aliyuncs.com/compatible-mode/v1"
4+
RM_BASE_URL="https://dashscope.aliyuncs.com/compatible-mode/v1"
5+
RM_API_KEY="sk-xxx"
6+
OPENJUDGE_BASE_URL="https://dashscope.aliyuncs.com/compatible-mode/v1"
7+
OPENJUDGE_API_KEY="sk-xxx"
8+
STRONG_MODEL_API_KEY="sk-xxx"
9+
10+
SWANLAB_API_KEY="xxx"
11+
12+
# data path, save path
13+
ENV_SERVICE_ROOT="/path/to/env_service"
14+
CONDA_PATH="/path/to/conda/conda.sh"
15+
MODEL_PATH="/path/to/base_model"
16+
CKPT_SAVE_PATH="/path/to/ckpt_path"
17+
# New: data file path configuration
18+
TRAIN_DATA_PATH="/path/to/train_data"
19+
VAL_DATA_PATH="/path/to/val_data"
20+
21+
22+
TRAIN_REF_ANS_PATH="/path/to/train_reference_answer"
23+
VAL_REF_ANS_PATH="/path/to/val_reference_answer"
24+
25+
26+
# Port
27+
ADDR=""
28+
MCP_PORT=""

0 commit comments

Comments
 (0)