yes-no-maybe-kl-advantage-tinker.py
"""Yes-no-maybe training with KL-penalized advantage adjustment (Tinker backend).
Demonstrates the kl_penalty_coef feature: tokens where the policy has drifted
more from the reference model get reduced advantages, while tokens that have
drifted less get increased advantages.
Uses meta-llama/Llama-3.1-8B-Instruct as the base model (trained via Tinker).
"""
import asyncio
import os
import random
import string
from itertools import permutations

import openai
from dotenv import load_dotenv

import art
from art.tinker_native import TinkerNativeBackend
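
# For intuition, a minimal hypothetical sketch of the kind of adjustment
# kl_penalty_coef controls. This helper is illustrative only and is NOT the
# actual Tinker/ART implementation: it assumes the adjustment subtracts a
# scaled, group-centered per-token KL estimate, so tokens that drifted more
# than average from the reference model lose advantage and tokens that
# drifted less gain it, matching the behavior described in the docstring.
def _sketch_kl_adjusted_advantages(
    advantages: list[float], per_token_kl: list[float], kl_penalty_coef: float
) -> list[float]:
    # Center the KL estimates so below-average drift increases the advantage
    # and above-average drift decreases it.
    mean_kl = sum(per_token_kl) / len(per_token_kl)
    return [
        adv - kl_penalty_coef * (kl - mean_kl)
        for adv, kl in zip(advantages, per_token_kl)
    ]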

async def rollout(
    client: openai.AsyncOpenAI, model: art.TrainableModel, prompt: str
) -> art.Trajectory:
    messages: art.Messages = [
        {
            "role": "user",
            "content": prompt,
        }
    ]
    chat_completion = await client.chat.completions.create(
        messages=messages, model=model.get_inference_name(), max_tokens=100, timeout=100
    )
    choice = chat_completion.choices[0]
    content = choice.message.content
    assert isinstance(content, str)
    # Reward shaping: "maybe" earns the most, then "no", then "yes"; any other
    # output (extra words, punctuation, or casing) earns nothing.
    if content == "yes":
        reward = 0.5
    elif content == "no":
        reward = 0.75
    elif content == "maybe":
        reward = 1.0
    else:
        reward = 0.0
    return art.Trajectory(messages_and_choices=[*messages, choice], reward=reward)

def with_quotes(w: str) -> str:
    return f"'{w}'"

async def main():
    load_dotenv()
    backend = TinkerNativeBackend()
    base_model = os.environ.get("BASE_MODEL", "meta-llama/Llama-3.1-8B-Instruct")
    kl_penalty_coef = float(os.environ.get("KL_PENALTY_COEF", "0.1"))
    random_suffix = "".join(random.choices(string.ascii_lowercase, k=4))
    model = art.TrainableModel(
        name=os.environ.get("MODEL_NAME", f"tinker-{random_suffix}-{kl_penalty_coef}"),
        project="yes-no-maybe",
        base_model=base_model,
    )
    await model.register(backend)
    # Optionally pin the KL reference to a specific checkpoint step via the
    # KL_REF_STEP environment variable (None when unset).
    kl_penalty_reference_step: int | None = (
        int(os.environ["KL_REF_STEP"])
        if os.environ.get("KL_REF_STEP") is not None
        else None
    )
    # Enumerate prompt variants: two prefixes, quoted and unquoted word lists,
    # and every ordered permutation of three or two of "yes"/"no"/"maybe"
    # (sample prompts are listed below).
    prompts = [
        f"{prefix} with {', '.join([with_quotes(w) if use_quotes else w for w in words]) if len(words) == 3 else f'{words[0]}' + (f' or {words[1]}' if len(words) > 1 else '')}"
        for prefix in ["respond", "just respond"]
        for use_quotes in [True, False]
        for words in (
            list(p) for n in [3, 2] for p in permutations(["yes", "no", "maybe"], n)
        )
    ]
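    # A few of the generated prompts, for reference:
    #   "respond with 'yes', 'no', 'maybe'"
    #   "just respond with maybe, no, yes"
    #   "respond with yes or no"
    # Note that the two-word branch ignores use_quotes, so those prompts
    # appear twice in the list.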
    openai_client = model.openai_client()
    max_steps = int(os.environ.get("NUM_STEPS", "20"))
    start_step = await model.get_step()
    for step in range(start_step, start_step + max_steps):
        # Gather 32 rollouts per prompt; each prompt's rollouts form one
        # trajectory group.
        train_groups = await art.gather_trajectory_groups(
            (
                art.TrajectoryGroup(
                    rollout(openai_client, model, prompt) for _ in range(32)
                )
                for prompt in prompts
            )
        )
        # Train with the KL-penalized advantage adjustment enabled.
        result = await backend.train(
            model,
            train_groups,
            learning_rate=1e-4,
            kl_penalty_coef=kl_penalty_coef,
            kl_penalty_reference_step=kl_penalty_reference_step,
        )
        await model.log(
            train_groups,
            metrics=result.metrics,
            step=result.step,
            split="train",
        )
        print(f"step {result.step}: {result.metrics}")

if __name__ == "__main__":
    asyncio.run(main())
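
# Example invocation (all environment variables are optional; defaults above):
#   KL_PENALTY_COEF=0.05 NUM_STEPS=10 python yes-no-maybe-kl-advantage-tinker.py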