forked from NVIDIA-NeMo/Gym
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmultichallenge_nrl.yaml
More file actions
131 lines (115 loc) · 5.25 KB
/
multichallenge_nrl.yaml
File metadata and controls
131 lines (115 loc) · 5.25 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
# ============================================================================
# NeMo-Gym Configuration for MultiChallenge Environment
# ============================================================================
# This config defines the MultiChallenge environment and LLM judge setup.
#
# LLM Judge Configuration:
# By default, the judge uses the same vLLM server as the policy model.
# This is efficient as no additional GPU resources are needed.
#
# To use a separate judge endpoint, uncomment the judge_model section below
# and change judge_model_server.name to "judge_model".
# ============================================================================
# ============================================================================
# MultiChallenge Resources Server
# ============================================================================
multichallenge:
  resources_servers:
    multichallenge:
      entrypoint: app.py
      # Judge model reference - uses the same vLLM server as the policy model
      # This means generation and judging share the same model instance.
      # To use a separate judge, change 'name' to 'judge_model' and configure below.
      judge_model_server:
        type: responses_api_models
        name: policy_model  # Uses the same model as policy/generation
      # Parameters for judge requests
      # NOTE: temperature and top_p must match policy.generation settings in grpo config
      # because NeMo-RL enforces consistent sampling params for on-policy inference.
      judge_responses_create_params:
        input: []
        max_output_tokens: 512
        temperature: 1.0
        top_p: 1.0
      # Aggregation mode for rubric scores: mean | min | max | all | any | weighted
      aggregation_mode: mean
      parallel_evaluation: true
      # Judge system message
      judge_system_message: >-
        You are a precise evaluator. Assess responses objectively based on the given criteria.
        Analyze the response carefully against the evaluation question.
      # Judge prompt template with placeholders
      judge_prompt_template: |-
        You are evaluating whether a model's response meets a specific criterion.
        CONVERSATION CONTEXT:
        {context}
        MODEL'S FINAL RESPONSE:
        {response}
        EVALUATION QUESTION:
        {question}
        EXPECTED ANSWER: {pass_criteria}
        Does the model's response satisfy the criterion described in the evaluation question?
        Think step by step, then respond with exactly [[YES]] or [[NO]] on the last line.
      # Verdict labels
      yes_label: "[[YES]]"
      no_label: "[[NO]]"
      domain: knowledge
      description: Targets inference memory, instruction retention, version editing, and self-coherence.
      value: Improve complex multi-turn conversational capability
      verified: false
# ============================================================================
# MultiChallenge Agent Configuration
# ============================================================================
multichallenge_simple_agent:
  responses_api_agents:
    simple_agent:
      entrypoint: app.py
      resources_server:
        type: resources_servers
        name: multichallenge
      model_server:
        type: responses_api_models
        name: policy_model
      datasets:
        - name: multichallenge_example
          type: example
          license: "Apache 2.0"
          jsonl_fpath: resources_servers/multichallenge/data/example.jsonl
        - name: multichallenge_advanced
          type: train
          license: "TBD"
          jsonl_fpath: resources_servers/multichallenge/data/advanced.jsonl
        - name: multichallenge_vanilla
          type: train
          license: "TBD"
          jsonl_fpath: resources_servers/multichallenge/data/vanilla.jsonl
        - name: train
          type: train
          license: "Creative Commons Attribution 4.0 International"
          jsonl_fpath: resources_servers/multichallenge/data/Nemotron-RL-Instruction-Following-MultiTurnChat-v1_train.jsonl
          # NOTE(review): huggingface_identifier placed inside this dataset entry
          # (it names the same repo/artifact) — confirm against the schema.
          huggingface_identifier:
            repo_id: nvidia/Nemotron-RL-Instruction-Following-MultiTurnChat-v1
            artifact_fpath: train.jsonl
      # NOTE(review): placed at the agent level, parallel to the resources
      # server's verified flag — confirm intended nesting.
      verified: false
# ============================================================================
# Judge Model Server Configuration (Optional)
# ============================================================================
# By default, the judge uses 'policy_model' (configured above).
# This means the same vLLM server handles both generation and judging.
#
# If you want a SEPARATE judge model, uncomment ONE of the options below
# and change judge_model_server.name from 'policy_model' to 'judge_model'.
# ============================================================================
# ----------------------------------------------------------------------------
# OPTION: External OpenAI-compatible API for Judge
# ----------------------------------------------------------------------------
# Use this to connect to an external endpoint (NVIDIA NIM, OpenAI, etc.)
# Configure credentials in env.yaml or via command-line overrides.
#
# judge_model:
#   responses_api_models:
#     openai_model:
#       entrypoint: app.py
#       openai_base_url: ${judge_base_url}
#       openai_api_key: ${judge_api_key}
#       openai_model: ${judge_model_name}