From b6f790589ea31b697eebdb4178086f1066e6d94c Mon Sep 17 00:00:00 2001
From: baggepinnen <cont-frb@ulund.org>
Date: Fri, 8 Mar 2019 11:44:06 +0100
Subject: [PATCH] update tdlambda

Tune the λ sweep and tidy the learning functions:

- cd to the script directory, raise num_episodes from 40 to 150, and
  shrink the sweep to a single MC run over five λ values
- time the pmap call and comment out the plotting block for now
- reset Q.θ at the start of TDλlearning and TDOλlearning
- replace the enumerate(take(ep, max_timesteps)) destructuring with an
  explicit step counter
- widen the RBF encoding from 4 to 5 basis functions per state dimension
- re-enable TDAλlearning
---
 tdlambda.jl       | 20 ++++++------
 tdlambda_setup.jl | 79 ++++++++++++++++++++++++++---------------------
 2 files changed, 53 insertions(+), 46 deletions(-)
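
Note (below the ---, so git am ignores it): TDλlearning and the re-enabled
TDAλlearning both implement the accumulating-trace TD(λ) update
    δ = r + γ·max_a Q(s1,a) − Q(s,a);   E ← γλE + ϕ([s;a]);   θ ← θ + αδE.
A minimal, self-contained Julia sketch of that update on a toy chain
environment follows; the chain, the one-hot features, and the random
behaviour policy are illustrative stand-ins, not the GymEnv/RBFE setup this
patch touches.

    function tdlambda_sketch(num_episodes, α, λ; γ = 0.99, nstates = 5)
        nactions = 2
        # one-hot features over (state, action), standing in for ϕ([s;a])
        phi(s, a) = (v = zeros(nstates*nactions); v[(a-1)*nstates + s] = 1.0; v)
        θ = zeros(nstates*nactions)
        Q(s, a) = sum(θ .* phi(s, a))        # linear action-value estimate
        for _ in 1:num_episodes
            E = zeros(length(θ))             # eligibility trace
            s = 1
            while s < nstates                # state nstates is terminal
                a  = rand(1:nactions)        # random behaviour policy
                s1 = a == 1 ? min(s + 1, nstates) : max(s - 1, 1)
                r  = s1 == nstates ? 1.0 : 0.0
                δ  = r + γ*maximum(Q(s1, a1) for a1 in 1:nactions) - Q(s, a)
                E  = γ*λ*E + phi(s, a)       # accumulate the eligibility trace
                θ += α*δ*E                   # TD(λ) weight update
                s  = s1
            end
        end
        θ
    end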

diff --git a/tdlambda.jl b/tdlambda.jl
index c0d8a87..6f33e8a 100644
--- a/tdlambda.jl
+++ b/tdlambda.jl
@@ -1,6 +1,7 @@
 
+cd(@__DIR__)
 @everywhere begin
-    num_episodes        = 40
+    num_episodes        = 150
     α                   = 1 # Initial learning rate
     const ϵ             = 0.5 # Initial chance of choosing random action
     const decay_rate    = 0.99 # decay rate for learning rate and ϵ
@@ -13,11 +14,11 @@ end
 functions = [TDλlearning, TDOλlearning]
 
 for fun in functions
-    mc_runs = 3
-    λvec = linspace(0,1,10)
+    mc_runs = 1
+    λvec = linspace(0,1,5)
     λvecMC = repmat(λvec',mc_runs)[:]
     n = length(λvec)
-    res = pmap(λvecMC) do λ
+    @time res = pmap(λvecMC) do λ
         fun(num_episodes,α,λ)
     end
     rewards = getindex.(res,1)
@@ -28,11 +29,10 @@ for fun in functions
         mean(data,1)[:], std(data,1)[:]
     end
 
-    average_reward,ae = [mean(r.values[end-20:end]) for r in rewards] |> average
-    average_eval,aee = [mean(r.values) for r in rewards] |> average
-    max_eval,me = [maximum(r.values) for r in evals] |> average
-
-    scatter(λvec.+0.1rand(n,3)-0.05,[average_reward average_eval max_eval], xlabel="λ", lab=["Average reward" "Average eval" "Max eval"], title=string(fun),
-    yerror=[ae aee me])
+    # @show average_reward,ae = [mean(r.values[end-3:end]) for r in rewards] |> average
+    # average_eval,aee = [mean(r.values) for r in evals] |> average
+    # max_eval,me = [maximum(r.values) for r in evals] |> average
+    #
+    # scatter(λvec.+0.1rand(n,3)-0.05,[average_reward average_eval max_eval], xlabel="λ", lab=["Average reward" "Average eval" "Max eval"], title=string(fun), yerror=[ae aee me])
 
 end
diff --git a/tdlambda_setup.jl b/tdlambda_setup.jl
index 2c65bd9..14dc99f 100644
--- a/tdlambda_setup.jl
+++ b/tdlambda_setup.jl
@@ -4,7 +4,7 @@ const env = GymEnv("CartPole-v0")
 typealias SARS Tuple{Vector{Float64},Int,Float64,Vector{Float64}}
 typealias V64 Vector{Float64}
 
-const ϕ = MultiUniformRBFE([linspace(-0.3,0.3) linspace(-2,2) linspace(-0.2,0.2) linspace(-3.2,3.2) linspace(0,1)], [4,4,4,4,2])
+const ϕ = MultiUniformRBFE([linspace(-0.3,0.3) linspace(-2,2) linspace(-0.2,0.2) linspace(-3.2,3.2) linspace(0,1)], [5,5,5,5,2])
 const P = length(ϕ(zeros(5)))
 
 type Qfun
@@ -44,6 +44,7 @@ const ϵpolicy       = ϵGreedyPolicy(ϵ, decay_rate)
 const policy        = GreedyPolicy()
 
 function TDλlearning(num_episodes,α,λ)
+    Q.θ .*= 0
     reward_history = ValueHistories.History(Float64)
     eval_history   = ValueHistories.History(Float64)
     for i = 1:num_episodes
@@ -52,7 +53,8 @@ function TDλlearning(num_episodes,α,λ)
         # α *= decay_rate # Decay the learning rate
         decay!(ϵpolicy) # Decay greedyness
         t = 0
-        for (t,(s::V64,a::Int,r::Float64,s1::V64)) in enumerate(take(ep,max_timesteps))
+        for (s::V64,a::Int,r::Float64,s1::V64) in take(ep,max_timesteps)
+            t += 1
             δ::Float64 = r + γ*max_a(Q, s1) - Q(s,a)
             E          = γ*λ*E + ϕ([s;a])::V64
             Q.θ       += α*δ*E
@@ -64,6 +66,7 @@ function TDλlearning(num_episodes,α,λ)
 end
 
 function TDOλlearning(num_episodes,α,λ)
+    Q.θ .*= 0
     reward_history = ValueHistories.History(Float64)
     eval_history   = ValueHistories.History(Float64)
     for i = 1:num_episodes
@@ -72,7 +75,8 @@ function TDOλlearning(num_episodes,α,λ)
         # α *= decay_rate # Decay the learning rate
         decay!(ϵpolicy) # Decay greedyness
         t, qs = 0, 0.
-        for (t,(s::V64,a::Int,r::Float64,s1::V64)) in enumerate(take(ep,max_timesteps))
+        for (s::V64,a::Int,r::Float64,s1::V64) in take(ep,max_timesteps)
+            t += 1
             if t == 1; qs = Q(s,a)::Float64; end
             qs1        = max_a(Q, s1)::Float64
             ϕs         = ϕ([s;a])::V64
@@ -97,39 +101,42 @@ function evaluate(i,eval_history)
     end
 end
 
-#
-# function TDAλlearning(num_episodes,α,λ)
-#     reward_history = ValueHistories.History(Float64)
-#     eval_history   = ValueHistories.History(Float64)
-#     sars           = Vector{SARS}(max_timesteps)
-#     for i = 1:num_episodes
-#         E  = zeros(P)   # Eligibility trace
-#         ep = Episode(env, ϵpolicy)
-#         α *= decay_rate # Decay the learning rate
-#         decay!(ϵpolicy) # Decay greedyness
-#         t = 0
-#         for (t,sarst::SARS) in enumerate(take(ep,max_timesteps))
-#             sars[t] = sarst
-#         end
-#         rewards = [sars[3] for sars in sars]
-#         Return = discounted_return(rewards,γ)
-#         for (t,(s,a,r,s1)::SARS) in enumerate(sars)
-#             δ::Float64 = r + γ*max_a(Q, s1) - Q(s,a)
-#             E          = γ*λ*E + ϕ([s;a])::Vector{Float64}
-#             Q.θ       += α*δ*E
-#         end
-#         push!(reward_history, i, ep.total_reward)
-#         if i % 10 == 0 # Evaluate without noise
-#             ep = Episode(env, policy)
-#             for sars in take(ep,max_timesteps)
-#             end
-#             println("Episode: $i, reward: $(ep.total_reward)")
-#             push!(eval_history, i, ep.total_reward)
-#         end
-#     end
-#     reward_history,eval_history
-# end
-#
+
+function TDAλlearning(num_episodes,α,λ)
+    reward_history = ValueHistories.History(Float64)
+    eval_history   = ValueHistories.History(Float64)
+    sars           = Vector{SARS}(max_timesteps)
+    for i = 1:num_episodes
+        E  = zeros(P)   # Eligibility trace
+        ep = Episode(env, ϵpolicy)
+        α *= decay_rate # Decay the learning rate
+        decay!(ϵpolicy) # Decay greediness
+        t = 0
+        for sarst in take(ep,max_timesteps)
+            t += 1
+            sars[t] = sarst
+        end
+        rewards = [sarst[3] for sarst in sars[1:t]] # only the t transitions actually collected
+        Return = discounted_return(rewards,γ)
+        t = 0
+        for (s,a,r,s1) in sars[1:length(rewards)]
+            t += 1
+            δ::Float64 = r + γ*max_a(Q, s1) - Q(s,a)
+            E          = γ*λ*E + ϕ([s;a])::Vector{Float64}
+            Q.θ       += α*δ*E
+        end
+        push!(reward_history, i, ep.total_reward)
+        if i % 10 == 0 # Evaluate without noise
+            ep = Episode(env, policy)
+            for _ in take(ep,max_timesteps) # step through the greedy episode
+            end
+            println("Episode: $i, reward: $(ep.total_reward)")
+            push!(eval_history, i, ep.total_reward)
+        end
+    end
+    reward_history,eval_history
+end
+
 #
 # function discounted_return(r,γ)
 #     l        = length(r)
-- 
GitLab
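
Note: the re-enabled TDAλlearning calls discounted_return(rewards, γ), but
that helper is still commented out at the bottom of tdlambda_setup.jl (see
the trailing context above), so unless it is defined elsewhere the function
will error at runtime; the computed Return is also not used further yet. A
minimal sketch of such a helper, assuming the usual backwards-accumulated
discounted return Return[t] = r[t] + γ·Return[t+1]:

    # illustrative sketch only; prefer the file's own version once it is
    # uncommented
    function discounted_return(r, γ)
        l      = length(r)
        Return = zeros(l)
        acc    = 0.0
        for t in l:-1:1          # accumulate from the end of the episode
            acc       = r[t] + γ*acc
            Return[t] = acc
        end
        Return
    end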