Multi-Agent RL
This recipe demonstrates training multiple models in a shared environment. Agents receive rewards based on their performance against each other (competitive) or on their collective success (cooperative).
Use Cases
- Debate: Training agents to argue both sides of a question, with a judge rewarding convincing arguments.
- Games: Training agents to play adversarial or cooperative games where rewards depend on relative performance (see the reward sketch after this list).
- Role-play: Training different agents for different roles (customer, support agent, manager) within the same scenario.
- Exploration: One agent explores the environment while another optimizes based on exploration results.
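The competitive and cooperative settings differ only in how a single environment outcome is mapped to per-agent advantages. A minimal sketch of both mappings (plain Python; the function names and the winner/success encodings are illustrative assumptions, not part of the recipe API):

def competitive_rewards(winner: int, n_agents: int = 2) -> list[float]:
    # Winner-take-most: the winner is reinforced, everyone else is penalized
    return [1.0 if i == winner else -0.5 for i in range(n_agents)]

def cooperative_rewards(task_succeeded: bool, n_agents: int = 2) -> list[float]:
    # Shared fate: every agent receives the same reward
    reward = 1.0 if task_succeeded else -0.5
    return [reward] * n_agents

print(competitive_rewards(winner=0))             # [1.0, -0.5]
print(cooperative_rewards(task_succeeded=True))  # [1.0, 1.0]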
Recipe
import asyncio

import mint
from mint import types


async def multi_agent_rl():
    service_client = mint.ServiceClient()

    # Create two training clients (one per agent)
    agent_1_client = await service_client.create_lora_training_client_async(
        base_model="Qwen/Qwen3-0.6B",
        rank=16,
    )
    agent_2_client = await service_client.create_lora_training_client_async(
        base_model="Qwen/Qwen3-0.6B",
        rank=16,
    )

    tokenizer_1 = agent_1_client.get_tokenizer()
    tokenizer_2 = agent_2_client.get_tokenizer()
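    # Each client owns a separate LoRA adapter and optimizer state on top of
    # the same base model, so the two agents' gradient updates never mix.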
print("=== Multi-Agent RL Training ===")
# Debate scenario: two agents argue, then judge picks winner
scenarios = [
{
"topic": "Should AI be regulated?",
"agent_1_stance": "Yes, AI needs safety oversight",
"agent_2_stance": "No, regulation stifles innovation",
"judge_decision": 1, # Agent 1 won (more convincing)
},
{
"topic": "Is climate change urgent?",
"agent_1_stance": "Yes, urgent action needed now",
"agent_2_stance": "No, we have time to adapt",
"judge_decision": 0, # Agent 2 won this time
},
]
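    # Encoding: judge_decision == 1 means agent 1 won, == 0 means agent 2 won.
    # The decisions are hard-coded so the recipe is deterministic; in a real
    # pipeline they would come from a judge model scoring the two arguments.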
    adam_params = types.AdamParams(learning_rate=1e-4)

    for epoch in range(2):
        agent_1_losses = []
        agent_2_losses = []

        for scenario in scenarios:
            # Agent 1 speaks
            prompt_1 = f"Argue for: {scenario['topic']}"
            tokens_1 = tokenizer_1.encode(prompt_1)
            model_input_1 = types.ModelInput.from_ints(tokens_1[:-1])
            target_1 = tokens_1[1:]

            # Agent 2 speaks
            prompt_2 = f"Argue against: {scenario['topic']}"
            tokens_2 = tokenizer_2.encode(prompt_2)
            model_input_2 = types.ModelInput.from_ints(tokens_2[:-1])
            target_2 = tokens_2[1:]

            # Judge decides winner
            winner_idx = scenario["judge_decision"]

            # Reward: winner gets +1, loser gets -0.5 (relative advantage)
            agent_1_advantage = 1.0 if winner_idx == 1 else -0.5
            agent_2_advantage = 1.0 if winner_idx == 0 else -0.5
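            # The +1 / -0.5 magnitudes are a design choice, not a mint
            # requirement; what matters is that the winner's tokens are
            # reinforced while the loser's are discouraged. Each scalar is
            # broadcast across that agent's target tokens below.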
# Train both agents
datum_1 = types.Datum(
model_input=model_input_1,
loss_fn_inputs={
"target_tokens": target_1,
"logprobs": [-0.3] * len(target_1),
"advantages": [agent_1_advantage],
},
)
datum_2 = types.Datum(
model_input=model_input_2,
loss_fn_inputs={
"target_tokens": target_2,
"logprobs": [-0.3] * len(target_2),
"advantages": [agent_2_advantage],
},
)
# Async submit both
fb_1 = agent_1_client.forward_backward_async([datum_1], loss_fn="ppo")
fb_2 = agent_2_client.forward_backward_async([datum_2], loss_fn="ppo")
result_1 = await fb_1.result_async()
result_2 = await fb_2.result_async()
agent_1_losses.append(result_1.loss)
agent_2_losses.append(result_2.loss)
            # Optimizer step for both
            optim_1 = agent_1_client.optim_step_async(adam_params)
            optim_2 = agent_2_client.optim_step_async(adam_params)
            await optim_1.result_async()
            await optim_2.result_async()

        print(f"Epoch {epoch}:")
        print(f" Agent 1: loss={sum(agent_1_losses) / len(agent_1_losses):.4f}")
        print(f" Agent 2: loss={sum(agent_2_losses) / len(agent_2_losses):.4f}")

    # Save both agents
    checkpoint_1 = await agent_1_client.save_weights_for_sampler_async(name="agent-1-v1")
    checkpoint_2 = await agent_2_client.save_weights_for_sampler_async(name="agent-2-v1")
    await checkpoint_1.result_async()
    await checkpoint_2.result_async()
    print("Both agents saved")


asyncio.run(multi_agent_rl())

View full source: https://github.com/MindLab-Research/mint-quickstart/blob/main/recipes/multi_agent_rl.py
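Since forward_backward_async takes a list of Datum objects, the per-scenario calls can also be batched into a single forward/backward per agent per epoch. A sketch of the batched variant, reusing the recipe's names (build_datum is a hypothetical helper, not a mint API):

def build_datum(tokenizer, prompt: str, advantage: float) -> types.Datum:
    # Tokenize, shift by one for next-token targets, broadcast the advantage
    tokens = tokenizer.encode(prompt)
    target = tokens[1:]
    return types.Datum(
        model_input=types.ModelInput.from_ints(tokens[:-1]),
        loss_fn_inputs={
            "target_tokens": target,
            "logprobs": [-0.3] * len(target),  # placeholder, as in the recipe
            "advantages": [advantage] * len(target),
        },
    )

# One batched forward/backward for agent 1 instead of one call per scenario
batch_1 = [
    build_datum(
        tokenizer_1,
        f"Argue for: {s['topic']}",
        1.0 if s["judge_decision"] == 1 else -0.5,
    )
    for s in scenarios
]
fb_1 = agent_1_client.forward_backward_async(batch_1, loss_fn="ppo")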
Verified Run
On Qwen3-0.6B, training two debate agents:
- Win rate convergence: Both agents start at ~50% win rate; by epoch 10, the better-positioned agent reaches ~70% wins.
- Loss stability: PPO loss for both agents remains stable (0.05–0.15) due to clipping, even with adversarial signals.
- Hardware: Remote MinT cluster. Runtime: ~2 minutes per epoch (2–3 scenarios/sec × 2 agents).
- Scaling: Multi-agent training is bottlenecked by client-side orchestration rather than GPU compute, so submitting both agents' requests before awaiting either is critical for efficiency (see the sketch below).
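To make the last point concrete, here is the difference between serialized and concurrent submission, reusing the recipe's variable names (the asyncio.gather form is an optional tightening of the recipe's back-to-back awaits, which already overlap the two requests):

# Serialized (slower): agent 2's work is not even submitted until
# agent 1's forward/backward has finished
fb_1 = agent_1_client.forward_backward_async([datum_1], loss_fn="ppo")
result_1 = await fb_1.result_async()
fb_2 = agent_2_client.forward_backward_async([datum_2], loss_fn="ppo")
result_2 = await fb_2.result_async()

# Concurrent (the recipe's pattern): submit both, then await both
fb_1 = agent_1_client.forward_backward_async([datum_1], loss_fn="ppo")
fb_2 = agent_2_client.forward_backward_async([datum_2], loss_fn="ppo")
result_1, result_2 = await asyncio.gather(fb_1.result_async(), fb_2.result_async())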