forked from NVIDIA-NeMo/Gym
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmultichallenge_nrl.yaml
More file actions
131 lines (115 loc) · 5.25 KB
/
multichallenge_nrl.yaml
File metadata and controls
131 lines (115 loc) · 5.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
# ============================================================================
# NeMo-Gym Configuration for MultiChallenge Environment
# ============================================================================
# This config defines the MultiChallenge environment and LLM judge setup.
#
# LLM Judge Configuration:
# By default, the judge uses the same vLLM server as the policy model.
# This is efficient as no additional GPU resources are needed.
#
# To use a separate judge endpoint, uncomment the judge_model section below
# and change judge_model_server.name to "judge_model".
# ============================================================================
# ============================================================================
# MultiChallenge Resources Server
# ============================================================================
multichallenge:
  resources_servers:
    multichallenge:
      entrypoint: app.py
      # Judge model reference - uses the same vLLM server as the policy model
      # This means generation and judging share the same model instance.
      # To use a separate judge, change 'name' to 'judge_model' and configure below.
      judge_model_server:
        type: responses_api_models
        name: policy_model  # Uses the same model as policy/generation
      # Parameters for judge requests
      # NOTE: temperature and top_p must match policy.generation settings in grpo config
      # because NeMo-RL enforces consistent sampling params for on-policy inference.
      judge_responses_create_params:
        input: []
        max_output_tokens: 512
        temperature: 1.0
        top_p: 1.0
      # Aggregation mode for rubric scores: mean | min | max | all | any | weighted
      aggregation_mode: mean
      parallel_evaluation: true
      # Judge system message
      judge_system_message: >-
        You are a precise evaluator. Assess responses objectively based on the given criteria.
        Analyze the response carefully against the evaluation question.
      # Judge prompt template with placeholders
      judge_prompt_template: |-
        You are evaluating whether a model's response meets a specific criterion.
        CONVERSATION CONTEXT:
        {context}
        MODEL'S FINAL RESPONSE:
        {response}
        EVALUATION QUESTION:
        {question}
        EXPECTED ANSWER: {pass_criteria}
        Does the model's response satisfy the criterion described in the evaluation question?
        Think step by step, then respond with exactly [[YES]] or [[NO]] on the last line.
      # Verdict labels
      yes_label: "[[YES]]"
      no_label: "[[NO]]"
      domain: knowledge
      description: Targets inference memory, instruction retention, version editing, and self-coherence.
      value: Improve complex multi-turn conversational capability
      verified: false
# ============================================================================
# MultiChallenge Agent Configuration
# ============================================================================
multichallenge_simple_agent:
  responses_api_agents:
    simple_agent:
      entrypoint: app.py
      resources_server:
        type: resources_servers
        name: multichallenge
      model_server:
        type: responses_api_models
        name: policy_model
      datasets:
        - name: multichallenge_example
          type: example
          license: "Apache 2.0"
          jsonl_fpath: resources_servers/multichallenge/data/example.jsonl
        - name: multichallenge_advanced
          type: train
          license: "TBD"
          jsonl_fpath: resources_servers/multichallenge/data/advanced.jsonl
        - name: multichallenge_vanilla
          type: train
          license: "TBD"
          jsonl_fpath: resources_servers/multichallenge/data/vanilla.jsonl
        - name: train
          type: train
          license: "Creative Commons Attribution 4.0 International"
          jsonl_fpath: resources_servers/multichallenge/data/Nemotron-RL-Instruction-Following-MultiTurnChat-v1_train.jsonl
          # NOTE(review): huggingface_identifier placed inside this dataset entry
          # (it names the same repo/artifact) — confirm against the schema.
          huggingface_identifier:
            repo_id: nvidia/Nemotron-RL-Instruction-Following-MultiTurnChat-v1
            artifact_fpath: train.jsonl
      # NOTE(review): placed at the agent level, parallel to the resources
      # server's verified flag — confirm intended nesting.
      verified: false
# ============================================================================
# Judge Model Server Configuration (Optional)
# ============================================================================
# By default, the judge uses 'policy_model' (configured above).
# This means the same vLLM server handles both generation and judging.
#
# If you want a SEPARATE judge model, uncomment ONE of the options below
# and change judge_model_server.name from 'policy_model' to 'judge_model'.
# ============================================================================
# ----------------------------------------------------------------------------
# OPTION: External OpenAI-compatible API for Judge
# ----------------------------------------------------------------------------
# Use this to connect to an external endpoint (NVIDIA NIM, OpenAI, etc.)
# Configure credentials in env.yaml or via command-line overrides.
#
# judge_model:
#   responses_api_models:
#     openai_model:
#       entrypoint: app.py
#       openai_base_url: ${judge_base_url}
#       openai_api_key: ${judge_api_key}
#       openai_model: ${judge_model_name}