From b6f790589ea31b697eebdb4178086f1066e6d94c Mon Sep 17 00:00:00 2001
From: baggepinnen <cont-frb@ulund.org>
Date: Fri, 8 Mar 2019 11:44:06 +0100
Subject: [PATCH] update tdlambda

---
 tdlambda.jl       | 20 ++++++------
 tdlambda_setup.jl | 79 ++++++++++++++++++++++++++---------------------
 2 files changed, 53 insertions(+), 46 deletions(-)

diff --git a/tdlambda.jl b/tdlambda.jl
index c0d8a87..6f33e8a 100644
--- a/tdlambda.jl
+++ b/tdlambda.jl
@@ -1,6 +1,7 @@
+cd(@__DIR__)
 @everywhere begin
-    num_episodes = 40
+    num_episodes = 150
     α                = 1    # Initial learning rate
     const ϵ          = 0.5  # Initial chance of choosing random action
     const decay_rate = 0.99 # decay rate for learning rate and ϵ
@@ -13,11 +14,11 @@ end
 functions = [TDλlearning, TDOλlearning]

 for fun in functions
-    mc_runs = 3
-    λvec    = linspace(0,1,10)
+    mc_runs = 1
+    λvec    = linspace(0,1,5)
     λvecMC  = repmat(λvec',mc_runs)[:]
     n       = length(λvec)
-    res = pmap(λvecMC) do λ
+    @time res = pmap(λvecMC) do λ
         fun(num_episodes,α,λ)
     end
     rewards = getindex.(res,1)
@@ -28,11 +29,10 @@ for fun in functions
         mean(data,1)[:], std(data,1)[:]
     end
-    average_reward,ae = [mean(r.values[end-20:end]) for r in rewards] |> average
-    average_eval,aee  = [mean(r.values) for r in rewards] |> average
-    max_eval,me       = [maximum(r.values) for r in evals] |> average
-
-    scatter(λvec.+0.1rand(n,3)-0.05,[average_reward average_eval max_eval], xlabel="λ", lab=["Average reward" "Average eval" "Max eval"], title=string(fun),
-    yerror=[ae aee me])
+    # @show average_reward,ae = [mean(r.values[end-3:end]) for r in rewards] |> average
+    # average_eval,aee  = [mean(r.values) for r in evals] |> average
+    # max_eval,me       = [maximum(r.values) for r in evals] |> average
+    #
+    # scatter(λvec.+0.1rand(n,3)-0.05,[average_reward average_eval max_eval], xlabel="λ", lab=["Average reward" "Average eval" "Max eval"], title=string(fun), yerror=[ae aee me])
 end

diff --git a/tdlambda_setup.jl b/tdlambda_setup.jl
index 2c65bd9..14dc99f 100644
--- a/tdlambda_setup.jl
+++ b/tdlambda_setup.jl
@@ -4,7 +4,7 @@ const env = GymEnv("CartPole-v0")
 typealias SARS Tuple{Vector{Float64},Int,Float64,Vector{Float64}}
 typealias V64 Vector{Float64}

-const ϕ = MultiUniformRBFE([linspace(-0.3,0.3) linspace(-2,2) linspace(-0.2,0.2) linspace(-3.2,3.2) linspace(0,1)], [4,4,4,4,2])
+const ϕ = MultiUniformRBFE([linspace(-0.3,0.3) linspace(-2,2) linspace(-0.2,0.2) linspace(-3.2,3.2) linspace(0,1)], [5,5,5,5,2])
 const P = length(ϕ(zeros(5)))

 type Qfun
@@ -44,6 +44,7 @@ const ϵpolicy = ϵGreedyPolicy(ϵ, decay_rate)
 const policy = GreedyPolicy()

 function TDλlearning(num_episodes,α,λ)
+    Q.θ .*= 0
     reward_history = ValueHistories.History(Float64)
     eval_history = ValueHistories.History(Float64)
     for i = 1:num_episodes
@@ -52,7 +53,8 @@ function TDλlearning(num_episodes,α,λ)
         # α *= decay_rate # Decay the learning rate
         decay!(ϵpolicy) # Decay greedyness
         t = 0
-        for (t,(s::V64,a::Int,r::Float64,s1::V64)) in enumerate(take(ep,max_timesteps))
+        for (s::V64,a::Int,r::Float64,s1::V64) in take(ep,max_timesteps)
+            t += 1
             δ::Float64 = r + γ*max_a(Q, s1) - Q(s,a)
             E = γ*λ*E + ϕ([s;a])::V64
             Q.θ += α*δ*E
@@ -64,6 +66,7 @@ function TDλlearning(num_episodes,α,λ)
 end

 function TDOλlearning(num_episodes,α,λ)
+    Q.θ .*= 0
     reward_history = ValueHistories.History(Float64)
     eval_history = ValueHistories.History(Float64)
     for i = 1:num_episodes
@@ -72,7 +75,8 @@ function TDOλlearning(num_episodes,α,λ)
         # α *= decay_rate # Decay the learning rate
         decay!(ϵpolicy) # Decay greedyness
         t, qs = 0, 0.
-        for (t,(s::V64,a::Int,r::Float64,s1::V64)) in enumerate(take(ep,max_timesteps))
+        for (s::V64,a::Int,r::Float64,s1::V64) in take(ep,max_timesteps)
+            t += 1
             if t == 1; qs = Q(s,a)::Float64; end
             qs1 = max_a(Q, s1)::Float64
             ϕs = ϕ([s;a])::V64
@@ -97,39 +101,42 @@ function evaluate(i,eval_history)
     end
 end

-#
-# function TDAλlearning(num_episodes,α,λ)
-#     reward_history = ValueHistories.History(Float64)
-#     eval_history = ValueHistories.History(Float64)
-#     sars = Vector{SARS}(max_timesteps)
-#     for i = 1:num_episodes
-#         E = zeros(P) # Eligibility trace
-#         ep = Episode(env, ϵpolicy)
-#         α *= decay_rate # Decay the learning rate
-#         decay!(ϵpolicy) # Decay greedyness
-#         t = 0
-#         for (t,sarst::SARS) in enumerate(take(ep,max_timesteps))
-#             sars[t] = sarst
-#         end
-#         rewards = [sars[3] for sars in sars]
-#         Return = discounted_return(rewards,γ)
-#         for (t,(s,a,r,s1)::SARS) in enumerate(sars)
-#             δ::Float64 = r + γ*max_a(Q, s1) - Q(s,a)
-#             E = γ*λ*E + ϕ([s;a])::Vector{Float64}
-#             Q.θ += α*δ*E
-#         end
-#         push!(reward_history, i, ep.total_reward)
-#         if i % 10 == 0 # Evaluate without noise
-#             ep = Episode(env, policy)
-#             for sars in take(ep,max_timesteps)
-#             end
-#             println("Episode: $i, reward: $(ep.total_reward)")
-#             push!(eval_history, i, ep.total_reward)
-#         end
-#     end
-#     reward_history,eval_history
-# end
-#
+
+function TDAλlearning(num_episodes,α,λ)
+    reward_history = ValueHistories.History(Float64)
+    eval_history = ValueHistories.History(Float64)
+    sars = Vector{SARS}(max_timesteps)
+    for i = 1:num_episodes
+        E = zeros(P) # Eligibility trace
+        ep = Episode(env, ϵpolicy)
+        α *= decay_rate # Decay the learning rate
+        decay!(ϵpolicy) # Decay greedyness
+        t = 0
+        for sarst in take(ep,max_timesteps)
+            t += 1
+            sars[t] = sarst
+        end
+        rewards = [sars[3] for sars in sars]
+        Return = discounted_return(rewards,γ)
+        t = 0
+        for (s,a,r,s1) in sars
+            t += 1
+            δ::Float64 = r + γ*max_a(Q, s1) - Q(s,a)
+            E = γ*λ*E + ϕ([s;a])::Vector{Float64}
+            Q.θ += α*δ*E
+        end
+        push!(reward_history, i, ep.total_reward)
+        if i % 10 == 0 # Evaluate without noise
+            ep = Episode(env, policy)
+            for sars in take(ep,max_timesteps)
+            end
+            println("Episode: $i, reward: $(ep.total_reward)")
+            push!(eval_history, i, ep.total_reward)
+        end
+    end
+    reward_history,eval_history
+end
+
 #
 # function discounted_return(r,γ)
 #     l = length(r)
--
GitLab
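
For reference, the per-timestep update that TDλlearning, TDOλlearning and TDAλlearning in this patch all build on is the accumulating-trace TD(λ) rule: δ = r + γ*max_a(Q,s1) - Q(s,a), E = γ*λ*E + ϕ([s;a]), θ = θ + α*δ*E. Below is a minimal, self-contained sketch of that rule in isolation, not part of the patch; the names td_lambda_step!, theta, E, phi_sa, delta, alpha, gamma and lambda are illustrative stand-ins for Q.θ, the eligibility trace, the RBF features ϕ and the constants defined in tdlambda.jl.

# Sketch only (not part of the patch): accumulating-trace TD(λ) update
# for a linear value function Q(s,a) = dot(theta, ϕ(s,a)).
function td_lambda_step!(theta, E, phi_sa, delta, alpha, gamma, lambda)
    # E ← γλE + ϕ(s,a): decay the trace and mark the current features as eligible
    E .= gamma .* lambda .* E .+ phi_sa
    # θ ← θ + αδE: move all recently visited features toward the TD target
    theta .+= alpha .* delta .* E
    return theta, E
end

# Example with a 3-dimensional feature vector (values are made up)
theta  = zeros(3)          # weights, corresponds to Q.θ above
E      = zeros(3)          # eligibility trace, reset at the start of each episode
phi_sa = [1.0, 0.0, 0.5]   # features of the current state-action pair, ϕ([s;a])
delta  = 0.8               # TD error δ = r + γ*max_a(Q,s1) - Q(s,a)
td_lambda_step!(theta, E, phi_sa, delta, 0.1, 0.99, 0.9)

The λ parameter swept by λvec interpolates between one-step Q-learning (λ = 0, only the current features are updated) and a Monte-Carlo-like update (λ = 1, the whole trace of visited features is credited).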