Commit b6f79058 authored by Fredrik Bagge Carlson

update tdlambda

parent 224417a8
+cd(@__DIR__)
 @everywhere begin
-num_episodes = 40
+num_episodes = 150
 α = 1 # Initial learning rate
 const ϵ = 0.5 # Initial chance of choosing random action
 const decay_rate = 0.99 # decay rate for learning rate and ϵ
@@ -13,11 +14,11 @@ end
 functions = [TDλlearning, TDOλlearning]
 for fun in functions
-    mc_runs = 3
-    λvec = linspace(0,1,10)
+    mc_runs = 1
+    λvec = linspace(0,1,5)
     λvecMC = repmat(λvec',mc_runs)[:]
     n = length(λvec)
-    res = pmap(λvecMC) do λ
+    @time res = pmap(λvecMC) do λ
         fun(num_episodes,α,λ)
     end
     rewards = getindex.(res,1)
@@ -28,11 +29,10 @@ for fun in functions
         mean(data,1)[:], std(data,1)[:]
     end
-    average_reward,ae = [mean(r.values[end-20:end]) for r in rewards] |> average
-    average_eval,aee = [mean(r.values) for r in rewards] |> average
-    max_eval,me = [maximum(r.values) for r in evals] |> average
-    scatter(λvec.+0.1rand(n,3)-0.05,[average_reward average_eval max_eval], xlabel="λ", lab=["Average reward" "Average eval" "Max eval"], title=string(fun),
-            yerror=[ae aee me])
+    # @show average_reward,ae = [mean(r.values[end-3:end]) for r in rewards] |> average
+    # average_eval,aee = [mean(r.values) for r in evals] |> average
+    # max_eval,me = [maximum(r.values) for r in evals] |> average
+    #
+    # scatter(λvec.+0.1rand(n,3)-0.05,[average_reward average_eval max_eval], xlabel="λ", lab=["Average reward" "Average eval" "Max eval"], title=string(fun), yerror=[ae aee me])
 end
@@ -4,7 +4,7 @@ const env = GymEnv("CartPole-v0")
 typealias SARS Tuple{Vector{Float64},Int,Float64,Vector{Float64}}
 typealias V64 Vector{Float64}
-const ϕ = MultiUniformRBFE([linspace(-0.3,0.3) linspace(-2,2) linspace(-0.2,0.2) linspace(-3.2,3.2) linspace(0,1)], [4,4,4,4,2])
+const ϕ = MultiUniformRBFE([linspace(-0.3,0.3) linspace(-2,2) linspace(-0.2,0.2) linspace(-3.2,3.2) linspace(0,1)], [5,5,5,5,2])
 const P = length(ϕ(zeros(5)))
 type Qfun
@@ -44,6 +44,7 @@ const ϵpolicy = ϵGreedyPolicy(ϵ, decay_rate)
 const policy = GreedyPolicy()
 function TDλlearning(num_episodes,α,λ)
+    Q.θ .*= 0
     reward_history = ValueHistories.History(Float64)
     eval_history = ValueHistories.History(Float64)
     for i = 1:num_episodes
@@ -52,7 +53,8 @@ function TDλlearning(num_episodes,α,λ)
         # α *= decay_rate # Decay the learning rate
         decay!(ϵpolicy) # Decay greedyness
         t = 0
-        for (t,(s::V64,a::Int,r::Float64,s1::V64)) in enumerate(take(ep,max_timesteps))
+        for (s::V64,a::Int,r::Float64,s1::V64) in take(ep,max_timesteps)
+            t += 1
             δ::Float64 = r + γ*max_a(Q, s1) - Q(s,a)
             E = γ*λ*E + ϕ([s;a])::V64
             Q.θ += α*δ*E
@@ -64,6 +66,7 @@ function TDλlearning(num_episodes,α,λ)
 end
 function TDOλlearning(num_episodes,α,λ)
+    Q.θ .*= 0
     reward_history = ValueHistories.History(Float64)
     eval_history = ValueHistories.History(Float64)
     for i = 1:num_episodes
@@ -72,7 +75,8 @@ function TDOλlearning(num_episodes,α,λ)
         # α *= decay_rate # Decay the learning rate
         decay!(ϵpolicy) # Decay greedyness
         t, qs = 0, 0.
-        for (t,(s::V64,a::Int,r::Float64,s1::V64)) in enumerate(take(ep,max_timesteps))
+        for (s::V64,a::Int,r::Float64,s1::V64) in take(ep,max_timesteps)
+            t += 1
             if t == 1; qs = Q(s,a)::Float64; end
             qs1 = max_a(Q, s1)::Float64
             ϕs = ϕ([s;a])::V64
@@ -97,39 +101,42 @@ function evaluate(i,eval_history)
     end
 end
 #
-# function TDAλlearning(num_episodes,α,λ)
-#     reward_history = ValueHistories.History(Float64)
-#     eval_history = ValueHistories.History(Float64)
-#     sars = Vector{SARS}(max_timesteps)
-#     for i = 1:num_episodes
-#         E = zeros(P) # Eligibility trace
-#         ep = Episode(env, ϵpolicy)
-#         α *= decay_rate # Decay the learning rate
-#         decay!(ϵpolicy) # Decay greedyness
-#         t = 0
-#         for (t,sarst::SARS) in enumerate(take(ep,max_timesteps))
-#             sars[t] = sarst
-#         end
-#         rewards = [sars[3] for sars in sars]
-#         Return = discounted_return(rewards,γ)
-#         for (t,(s,a,r,s1)::SARS) in enumerate(sars)
-#             δ::Float64 = r + γ*max_a(Q, s1) - Q(s,a)
-#             E = γ*λ*E + ϕ([s;a])::Vector{Float64}
-#             Q.θ += α*δ*E
-#         end
-#         push!(reward_history, i, ep.total_reward)
-#         if i % 10 == 0 # Evaluate without noise
-#             ep = Episode(env, policy)
-#             for sars in take(ep,max_timesteps)
-#             end
-#             println("Episode: $i, reward: $(ep.total_reward)")
-#             push!(eval_history, i, ep.total_reward)
-#         end
-#     end
-#     reward_history,eval_history
-# end
+function TDAλlearning(num_episodes,α,λ)
+    reward_history = ValueHistories.History(Float64)
+    eval_history = ValueHistories.History(Float64)
+    sars = Vector{SARS}(max_timesteps)
+    for i = 1:num_episodes
+        E = zeros(P) # Eligibility trace
+        ep = Episode(env, ϵpolicy)
+        α *= decay_rate # Decay the learning rate
+        decay!(ϵpolicy) # Decay greedyness
+        t = 0
+        for sarst in take(ep,max_timesteps)
+            t += 1
+            sars[t] = sarst
+        end
+        rewards = [sars[3] for sars in sars]
+        Return = discounted_return(rewards,γ)
+        t = 0
+        for (s,a,r,s1) in sars
+            t += 1
+            δ::Float64 = r + γ*max_a(Q, s1) - Q(s,a)
+            E = γ*λ*E + ϕ([s;a])::Vector{Float64}
+            Q.θ += α*δ*E
+        end
+        push!(reward_history, i, ep.total_reward)
+        if i % 10 == 0 # Evaluate without noise
+            ep = Episode(env, policy)
+            for sars in take(ep,max_timesteps)
+            end
+            println("Episode: $i, reward: $(ep.total_reward)")
+            push!(eval_history, i, ep.total_reward)
+        end
+    end
+    reward_history,eval_history
+end
 #
 # function discounted_return(r,γ)
 #     l = length(r)
...
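For reference, the inner loops above implement the standard accumulating-trace TD(λ) update for a linear action-value approximator: δ = r + γ·max_a Q(s′,a) − Q(s,a), E ← γλE + ϕ(s,a), θ ← θ + αδE. The snippet below is a minimal, self-contained sketch of that one-step update on dummy data; the names phi, Qhat, actions and td_lambda_step!, the toy feature map, and the default γ, λ, α values are invented for illustration and are not part of this repository.

using LinearAlgebra

const actions = (0.0, 1.0)            # toy discrete action set (hypothetical)
phi(s, a) = [s; a; s .* a; 1.0]       # toy feature map, length 2*length(s) + 2
Qhat(θ, s, a) = dot(θ, phi(s, a))     # linear action-value estimate

function td_lambda_step!(θ, E, s, a, r, s1; γ = 0.99, λ = 0.9, α = 0.01)
    δ = r + γ * maximum(Qhat(θ, s1, a1) for a1 in actions) - Qhat(θ, s, a)  # TD error
    E .= γ * λ .* E .+ phi(s, a)      # accumulate the eligibility trace
    θ .+= α * δ .* E                  # credit all recently visited features
    return δ
end

# usage on a dummy transition (4-dimensional state, as in CartPole)
θ = zeros(2 * 4 + 2)
E = zero(θ)
td_lambda_step!(θ, E, rand(4), 1.0, 1.0, rand(4))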