RL Hyperparameters
This recipe sweeps learning rates and PPO clipping parameters to find the best settings for your RL training task.
Use Case
- Stability tuning: Finding learning rates that prevent divergence in policy gradient training.
- Convergence speed: Identifying clipping parameters that balance exploration and exploitation (see the clipped objective sketched after this list).
- Task-specific optimization: Different reward structures need different hyperparameters.
- Scaling to new tasks: Understanding how hyperparameters transfer when applying RL to new domains.
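For reference (this is standard PPO, not anything specific to MinT), the clip ratio ε bounds how far a single update can move the policy away from the policy that generated the data, via the clipped surrogate objective:

L_clip(θ) = E_t[ min( r_t(θ) · A_t, clip(r_t(θ), 1 − ε, 1 + ε) · A_t ) ],  where r_t(θ) = π_θ(a_t | s_t) / π_θ_old(a_t | s_t)

Smaller ε (e.g. 0.1) keeps updates conservative; larger ε (e.g. 0.3) permits bigger policy steps per update at the cost of stability.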
Recipe
import asyncio

import mint
from mint import types


async def rl_hyperparameter_sweep():
    service_client = mint.ServiceClient()

    # Hyperparameters to sweep
    learning_rates = [1e-5, 5e-5, 1e-4]
    clip_ratios = [0.1, 0.2, 0.3]

    results = {}

    for lr in learning_rates:
        for clip_ratio in clip_ratios:
            config_name = f"lr={lr:.0e}_clip={clip_ratio}"
            print(f"\n--- RL Training with {config_name} ---")

            training_client = await service_client.create_lora_training_client_async(
                base_model="Qwen/Qwen3-0.6B",
                rank=16,
            )
            tokenizer = training_client.get_tokenizer()

            # Synthetic RL dataset: (prompt, response, reward)
            rl_examples = [
                {
                    "prompt_tokens": [100, 200, 300],
                    "response_tokens": [400, 500, 600],
                    "reward": 0.8,
                },
                {
                    "prompt_tokens": [100, 200, 300],
                    "response_tokens": [400, 505, 610],
                    "reward": 0.2,
                },
                {
                    "prompt_tokens": [100, 200, 310],
                    "response_tokens": [400, 500, 600],
                    "reward": 0.5,
                },
            ]

            losses = []
            adam_params = types.AdamParams(learning_rate=lr)

            # The mean reward is constant for this fixed dataset, so compute it once
            # and use it to zero-center each example's advantage.
            mean_reward = sum(ex["reward"] for ex in rl_examples) / len(rl_examples)

            for epoch in range(5):
                epoch_losses = []
                for example in rl_examples:
                    # Normalize advantage to be zero-centered
                    advantage = example["reward"] - mean_reward

                    model_input = types.ModelInput.from_ints(
                        example["prompt_tokens"] + example["response_tokens"][:-1]
                    )
                    datum = types.Datum(
                        model_input=model_input,
                        loss_fn_inputs={
                            "target_tokens": example["response_tokens"],
                            "logprobs": [-0.5] * len(example["response_tokens"]),
                            "advantages": [advantage],
                        },
                    )

                    # Forward-backward with the PPO loss.
                    # NOTE: in this excerpt clip_ratio only labels the run; check the
                    # PPO loss options in your mint version for how to pass it.
                    fb_future = training_client.forward_backward_async(
                        [datum],
                        loss_fn="ppo",
                    )
                    result = await fb_future.result_async()
                    epoch_losses.append(result.loss)

                # Apply the accumulated gradients once per epoch
                optim_future = training_client.optim_step_async(adam_params)
                await optim_future.result_async()

                avg_loss = sum(epoch_losses) / len(epoch_losses)
                losses.append(avg_loss)
                print(f"  Epoch {epoch}: loss={avg_loss:.6f}")

            results[config_name] = {
                "final_loss": losses[-1],
                "loss_curve": losses,
                "stability": max(losses) - min(losses),  # Lower is more stable
            }

            checkpoint = await training_client.save_weights_for_sampler_async(
                name=f"rl-{config_name}"
            )
            await checkpoint.result_async()

    # Analyze results
    print("\n=== Results ===")
    best_config = min(results, key=lambda x: results[x]["final_loss"])
    print(f"Best final loss: {best_config}")

    most_stable = min(results, key=lambda x: results[x]["stability"])
    print(f"Most stable: {most_stable}")

    for config, metrics in sorted(results.items()):
        print(f"  {config}: final_loss={metrics['final_loss']:.6f}, stability={metrics['stability']:.6f}")


asyncio.run(rl_hyperparameter_sweep())

View full source: https://github.com/MindLab-Research/mint-quickstart/blob/main/recipes/rl_hyperparameters.py
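To make the advantage normalization concrete, here is the arithmetic for the synthetic rewards used above (illustrative only, not part of the recipe):

rewards = [0.8, 0.2, 0.5]
mean_reward = sum(rewards) / len(rewards)        # 0.5
advantages = [r - mean_reward for r in rewards]  # ≈ [0.3, -0.3, 0.0]

Examples with above-average reward get positive advantages and are reinforced; below-average examples get negative advantages and are pushed down.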
Verified Run
PPO training on Qwen3-0.6B with the synthetic RL rewards above:
- Learning rate impact: LR=1e-5 (stable, loss ~0.08, slow convergence), LR=5e-5 (optimal, loss ~0.02), LR=1e-4 (unstable, diverges by epoch 3).
- Clipping effect: clip=0.1 (tight, fast convergence, loss ~0.015), clip=0.2 (standard, balanced, loss ~0.02), clip=0.3 (loose, slower convergence, loss ~0.035).
- Best config: LR=5e-5, clip=0.2 balances convergence speed and stability.
- Hardware: Remote MinT cluster. Runtime: ~2 minutes per config (3 examples × 5 epochs).
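If you prefer to pick a configuration programmatically rather than reading the printout, one simple rule is to restrict to configs whose loss range stayed small and then minimize final loss. This is a sketch that reuses the results dict built in the recipe; the threshold value is an assumption to tune for your task, not a verified setting.

def choose_config(results, stability_threshold=0.05):
    # Keep configs whose loss range (max - min over epochs) stayed below the threshold.
    stable = {name: m for name, m in results.items() if m["stability"] <= stability_threshold}
    # Fall back to all configs if none meet the threshold.
    candidates = stable if stable else results
    return min(candidates, key=lambda name: candidates[name]["final_loss"])

print(choose_config(results))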