Fredrik Bagge Carlson / reinforcementlearning / Commits

Commit b6f79058, authored Mar 8, 2019 by Fredrik Bagge Carlson
update tdlambda
parent 224417a8
Showing 2 changed files with 53 additions and 46 deletions:

tdlambda.jl (+10 −10)
tdlambda_setup.jl (+43 −36)
tdlambda.jl (+10 −10)
+cd(@__DIR__)
 @everywhere begin
-    num_episodes = 40
+    num_episodes = 150
     α = 1                   # Initial learning rate
     const ϵ = 0.5           # Initial chance of choosing random action
     const decay_rate = 0.99 # decay rate for learning rate and ϵ
@@ -13,11 +14,11 @@ end
 functions = [TDλlearning, TDOλlearning]
 for fun in functions
-    mc_runs = 3
-    λvec    = linspace(0,1,10)
+    mc_runs = 1
+    λvec    = linspace(0,1,5)
     λvecMC  = repmat(λvec', mc_runs)[:]
     n       = length(λvec)
-    res = pmap(λvecMC) do λ
+    @time res = pmap(λvecMC) do λ
         fun(num_episodes,α,λ)
     end
     rewards = getindex.(res, 1)
@@ -28,11 +29,10 @@ for fun in functions
         mean(data, 1)[:], std(data, 1)[:]
     end
-    average_reward, ae = [mean(r.values[end-20:end]) for r in rewards] |> average
-    average_eval, aee  = [mean(r.values) for r in rewards] |> average
-    max_eval, me       = [maximum(r.values) for r in evals] |> average
-    scatter(λvec .+ 0.1rand(n,3) - 0.05, [average_reward average_eval max_eval], xlabel="λ", lab=["Average reward" "Average eval" "Max eval"], title=string(fun),
-            yerror=[ae aee me])
+    # @show average_reward,ae = [mean(r.values[end-3:end]) for r in rewards] |> average
+    # average_eval,aee = [mean(r.values) for r in evals] |> average
+    # max_eval,me = [maximum(r.values) for r in evals] |> average
+    #
+    # scatter(λvec.+0.1rand(n,3)-0.05,[average_reward average_eval max_eval], xlabel="λ", lab=["Average reward" "Average eval" "Max eval"], title=string(fun), yerror=[ae aee me])
 end
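The script above fans the λ grid out over worker processes with pmap, with the per-λ experiment defined inside an @everywhere block. For reference, a minimal self-contained sketch of that pattern on a recent Julia with the Distributed standard library; run_experiment and the worker count are stand-ins for illustration, not names from this repository:

# Sketch only: modern-Julia equivalent of the λ-sweep pattern used above.
using Distributed
addprocs(2)                                        # assumed number of workers

@everywhere run_experiment(λ) = (10λ + rand(), λ)  # stand-in for fun(num_episodes, α, λ)

mc_runs = 1
λvec    = range(0, 1, length = 5)                  # replaces linspace(0, 1, 5)
λvecMC  = repeat(collect(λvec), inner = mc_runs)   # replaces repmat(λvec', mc_runs)[:]

@time res = pmap(λvecMC) do λ
    run_experiment(λ)
end

rewards = getindex.(res, 1)                        # first element of each result, as in the script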
tdlambda_setup.jl (+43 −36)
@@ -4,7 +4,7 @@ const env = GymEnv("CartPole-v0")
 typealias SARS Tuple{Vector{Float64},Int,Float64,Vector{Float64}}
 typealias V64  Vector{Float64}
-const ϕ = MultiUniformRBFE([linspace(-0.3,0.3) linspace(-2,2) linspace(-0.2,0.2) linspace(-3.2,3.2) linspace(0,1)], [4,4,4,4,2])
+const ϕ = MultiUniformRBFE([linspace(-0.3,0.3) linspace(-2,2) linspace(-0.2,0.2) linspace(-3.2,3.2) linspace(0,1)], [5,5,5,5,2])
 const P = length(ϕ(zeros(5)))

 type Qfun
@@ -44,6 +44,7 @@ const ϵpolicy = ϵGreedyPolicy(ϵ, decay_rate)
 const policy = GreedyPolicy()

 function TDλlearning(num_episodes,α,λ)
+    Q.θ .*= 0
     reward_history = ValueHistories.History(Float64)
     eval_history   = ValueHistories.History(Float64)
     for i = 1:num_episodes
@@ -52,7 +53,8 @@ function TDλlearning(num_episodes,α,λ)
         # α *= decay_rate # Decay the learning rate
         decay!(ϵpolicy) # Decay greedyness
         t = 0
-        for (t,(s::V64,a::Int,r::Float64,s1::V64)) in enumerate(take(ep,max_timesteps))
+        for (s::V64,a::Int,r::Float64,s1::V64) in take(ep,max_timesteps)
+            t += 1
             δ::Float64 = r + γ*max_a(Q,s1) - Q(s,a)
             E = γ*λ*E + ϕ([s;a])::V64
             Q.θ += α*δ*E
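Read as equations, the inner loop of TDλlearning is an accumulating-trace TD(λ)-style update of the linear parameters θ (this is a reading of the code above, not text from the commit):

\[
\delta_t = r_t + \gamma \max_a Q(s_{t+1}, a) - Q(s_t, a_t), \qquad
e_t = \gamma \lambda\, e_{t-1} + \phi(s_t, a_t), \qquad
\theta \leftarrow \theta + \alpha\, \delta_t\, e_t .
\]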
@@ -64,6 +66,7 @@ function TDλlearning(num_episodes,α,λ)
 end

 function TDOλlearning(num_episodes,α,λ)
+    Q.θ .*= 0
     reward_history = ValueHistories.History(Float64)
     eval_history   = ValueHistories.History(Float64)
     for i = 1:num_episodes
@@ -72,7 +75,8 @@ function TDOλlearning(num_episodes,α,λ)
         # α *= decay_rate # Decay the learning rate
         decay!(ϵpolicy) # Decay greedyness
         t, qs = 0, 0.
-        for (t,(s::V64,a::Int,r::Float64,s1::V64)) in enumerate(take(ep,max_timesteps))
+        for (s::V64,a::Int,r::Float64,s1::V64) in take(ep,max_timesteps)
+            t += 1
             if t == 1; qs = Q(s,a)::Float64; end
             qs1 = max_a(Q,s1)::Float64
             ϕs  = ϕ([s;a])::V64
@@ -97,39 +101,42 @@ function evaluate(i,eval_history)
     end
 end

-#
-# function TDAλlearning(num_episodes,α,λ)
-# reward_history = ValueHistories.History(Float64)
-# eval_history = ValueHistories.History(Float64)
-# sars = Vector{SARS}(max_timesteps)
-# for i = 1:num_episodes
-# E = zeros(P) # Eligibility trace
-# ep = Episode(env, ϵpolicy)
-# α *= decay_rate # Decay the learning rate
-# decay!(ϵpolicy) # Decay greedyness
-# t = 0
-# for (t,sarst::SARS) in enumerate(take(ep,max_timesteps))
-# sars[t] = sarst
-# end
-# rewards = [sars[3] for sars in sars]
-# Return = discounted_return(rewards,γ)
-# for (t,(s,a,r,s1)::SARS) in enumerate(sars)
-# δ::Float64 = r + γ*max_a(Q, s1) - Q(s,a)
-# E = γ*λ*E + ϕ([s;a])::Vector{Float64}
-# Q.θ += α*δ*E
-# end
-# push!(reward_history, i, ep.total_reward)
-# if i % 10 == 0 # Evaluate without noise
-# ep = Episode(env, policy)
-# for sars in take(ep,max_timesteps)
-# end
-# println("Episode: $i, reward: $(ep.total_reward)")
-# push!(eval_history, i, ep.total_reward)
-# end
-# end
-# reward_history,eval_history
-# end
-#
+function TDAλlearning(num_episodes,α,λ)
+    reward_history = ValueHistories.History(Float64)
+    eval_history   = ValueHistories.History(Float64)
+    sars = Vector{SARS}(max_timesteps)
+    for i = 1:num_episodes
+        E  = zeros(P) # Eligibility trace
+        ep = Episode(env, ϵpolicy)
+        α *= decay_rate # Decay the learning rate
+        decay!(ϵpolicy) # Decay greedyness
+        t = 0
+        for sarst in take(ep,max_timesteps)
+            t += 1
+            sars[t] = sarst
+        end
+        rewards = [sars[3] for sars in sars]
+        Return  = discounted_return(rewards,γ)
+        t = 0
+        for (s,a,r,s1) in sars
+            t += 1
+            δ::Float64 = r + γ*max_a(Q,s1) - Q(s,a)
+            E = γ*λ*E + ϕ([s;a])::Vector{Float64}
+            Q.θ += α*δ*E
+        end
+        push!(reward_history, i, ep.total_reward)
+        if i % 10 == 0 # Evaluate without noise
+            ep = Episode(env, policy)
+            for sars in take(ep,max_timesteps)
+            end
+            println("Episode: $i, reward: $(ep.total_reward)")
+            push!(eval_history, i, ep.total_reward)
+        end
+    end
+    reward_history, eval_history
+end
 #
 # function discounted_return(r,γ)
 # l = length(r)
 ...
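The new TDAλlearning calls discounted_return(rewards, γ), whose definition is only hinted at in the truncated comments at the end of the diff. A hypothetical, self-contained sketch of such a helper follows; the name, signature, and body here are assumptions for illustration, not the repository's implementation:

# Hypothetical sketch of a discounted-return helper; the repository's own
# discounted_return is truncated in the diff above, so this is an assumption.
function discounted_return_sketch(r::AbstractVector, γ::Real)
    G = zero(float(eltype(r)))
    for t in length(r):-1:1      # accumulate backwards: G ← r[t] + γ*G
        G = r[t] + γ*G
    end
    G
end

discounted_return_sketch([1.0, 1.0, 1.0], 0.9)   # 1 + 0.9 + 0.81 = 2.71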