JuliaRL_PPO_Pendulum

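This experiment trains a PPO agent with a Gaussian policy on the classic Pendulum swing-up task, collecting experience from 8 environments in parallel.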

using ReinforcementLearning
using StableRNGs
using Flux
using Flux.Losses
using Distributions

function RL.Experiment(
    ::Val{:JuliaRL},
    ::Val{:PPO},
    ::Val{:Pendulum},
    ::Nothing;
    save_dir = nothing,
    seed = 123,
)
    rng = StableRNG(seed)
    inner_env = PendulumEnv(T = Float32, rng = rng)
    A = action_space(inner_env)   # a closed interval of valid torques
    low = A.left
    high = A.right
    ns = length(state(inner_env)) # number of state features

    N_ENV = 8          # number of parallel environments
    UPDATE_FREQ = 2048 # steps collected between PPO updates (also the trajectory capacity)
    env = MultiThreadEnv([
        PendulumEnv(T = Float32, rng = StableRNG(hash(seed + i))) |>
        env -> ActionTransformedEnv(env, action_mapping = x -> clamp(x * 2, low, high))
        for i in 1:N_ENV
    ])
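    # Note on the action mapping above: the Gaussian actor's tanh head keeps μ in
    # (-1, 1), so sampled actions are rescaled by 2 to span the torque bounds of
    # the default PendulumEnv and clamped in case a Gaussian sample overshoots.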

    init = glorot_uniform(rng) # shared weight initializer (a closure over the experiment RNG)

    agent = Agent(
        policy = PPOPolicy(
            approximator = ActorCritic(
                actor = GaussianNetwork(
                    pre = Chain(
                        Dense(ns, 64, relu; init = init),
                        Dense(64, 64, relu; init = init),
                    ),
                    μ = Chain(Dense(64, 1, tanh; init = init), vec),
                    logσ = Chain(Dense(64, 1; init = init), vec),
                ),
                critic = Chain(
                    Dense(ns, 64, relu; init = init),
                    Dense(64, 64, relu; init = init),
                    Dense(64, 1; init = init),
                ),
                optimizer = ADAM(3e-4),
            ) |> gpu,
            γ = 0.99f0,
            λ = 0.95f0,
            clip_range = 0.2f0,
            max_grad_norm = 0.5f0,
            n_epochs = 10,
            n_microbatches = 32,
            actor_loss_weight = 1.0f0,
            critic_loss_weight = 0.5f0,
            entropy_loss_weight = 0.00f0,
            dist = Normal,
            rng = rng,
            update_freq = UPDATE_FREQ,
        ),
        trajectory = PPOTrajectory(;
            capacity = UPDATE_FREQ,
            state = Matrix{Float32} => (ns, N_ENV),
            action = Vector{Float32} => (N_ENV,),
            action_log_prob = Vector{Float32} => (N_ENV,),
            reward = Vector{Float32} => (N_ENV,),
            terminal = Vector{Bool} => (N_ENV,),
        ),
    )

    stop_condition = StopAfterStep(50_000, is_show_progress = !haskey(ENV, "CI"))
    hook = TotalBatchRewardPerEpisode(N_ENV) # records each environment's episode returns
    Experiment(agent, env, stop_condition, hook, "# Play Pendulum with PPO")
end
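For reference, the hyperparameters above map onto the standard PPO formulation (the formulas below are the textbook definitions from Schulman et al., not an excerpt of the package source). With γ = 0.99 and λ = 0.95, advantages are generalized advantage estimates,

    Â_t = Σ_{l ≥ 0} (γλ)^l δ_{t+l},    δ_t = r_t + γ V(s_{t+1}) - V(s_t),

and each of the n_epochs × n_microbatches gradient steps optimizes the clipped surrogate objective with ε = clip_range = 0.2,

    L^CLIP(θ) = Ê_t[ min(r_t(θ) Â_t, clip(r_t(θ), 1 - ε, 1 + ε) Â_t) ],    r_t(θ) = π_θ(a_t | s_t) / π_{θ_old}(a_t | s_t),

weighted against the critic's value loss (critic_loss_weight = 0.5) and an entropy bonus that is disabled here (entropy_loss_weight = 0).

The experiment can then be instantiated through the E`JuliaRL_PPO_Pendulum` string macro and run, after which the hook's per-environment episode returns are averaged for plotting: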
using Plots
using Statistics
ex = E`JuliaRL_PPO_Pendulum`
run(ex)
n = minimum(map(length, ex.hook.rewards))          # episodes completed by every environment
m = mean([@view(x[1:n]) for x in ex.hook.rewards]) # elementwise mean across the 8 envs
s = std([@view(x[1:n]) for x in ex.hook.rewards])  # elementwise std across the 8 envs
plot(m, ribbon = s)
[Plot: "Avg total reward per episode". Score on the y-axis (-2000 to 0), Episode on the x-axis (0 to 300); the mean return climbs toward 0 over training, with a ribbon of one standard deviation across the environments.]

