From 059197856f2a14580214671a684d12f712a1d511 Mon Sep 17 00:00:00 2001 From: GIngesson <gabriel@control.lth.se> Date: Wed, 19 Apr 2017 17:20:16 +0200 Subject: [PATCH] Upload new file --- .../presentation/RL.tex | 1248 +++++++++++++++++ 1 file changed, 1248 insertions(+) create mode 100644 hw7_deep_reinforcement_learning/presentation/RL.tex diff --git a/hw7_deep_reinforcement_learning/presentation/RL.tex b/hw7_deep_reinforcement_learning/presentation/RL.tex new file mode 100644 index 0000000..ff26fff --- /dev/null +++ b/hw7_deep_reinforcement_learning/presentation/RL.tex @@ -0,0 +1,1248 @@ +\documentclass[]{beamer} +\usepackage{mathptmx} +\usepackage{graphicx} +\usepackage{lmodern} +\usepackage{listings} +\usepackage[english]{babel} +\usepackage{algorithm,algpseudocode} +\usepackage{enumerate} +\usepackage[utf8]{inputenc} +\usepackage{float} +\usepackage{tikz} +\usepackage{pgfplots} +\usepackage{flexisym} +\usepackage{caption} +\definecolor{links}{HTML}{2A1B81} +\hypersetup{colorlinks,linkcolor=,urlcolor=links} +\usetikzlibrary{positioning,calc} +\pagestyle{empty} + +\usetikzlibrary{shapes,arrows,shadows} +%\usepgfplotslibrary{external} +%\tikzexternalize +% Define block styles +\tikzstyle{decision} = [diamond, draw, fill=blue!20, + text width=4.5em, text badly centered, node distance=3cm, inner sep=0pt] +\tikzstyle{block} = [rectangle, draw, fill=blue!20, + text width=5em, text centered, rounded corners, minimum height=4em] +\tikzstyle{line} = [draw, -latex'] +\tikzstyle{cloud} = [draw, ellipse,fill=red!20, node distance=3cm, + minimum height=2em] + +\newcommand{\mx}[1]{\mathbf{\bm{#1}}} % Matrix command +\newcommand{\vc}[1]{\mathbf{\bm{#1}}} % Vector command + +% Define the layers to draw the diagram +\pgfdeclarelayer{background} +\pgfdeclarelayer{foreground} +\pgfsetlayers{background,main,foreground} + +% Define block styles used later + +\tikzstyle{sensor}=[draw, fill=blue!20, text width=5em, + text centered, minimum height=2.5em,drop shadow] +\tikzstyle{ann} = [above, text width=5em, text centered] +\tikzstyle{wa} = [sensor, text width=10em, fill=red!20, + minimum height=4em, rounded corners, drop shadow] +\tikzstyle{sc} = [sensor, text width=13em, fill=red!20, + minimum height=10em, rounded corners, drop shadow] + +% Define distances for bordering +\def\blockdist{2.3} + + +\usepackage{color} +\usepackage{booktabs} +%\usepackage{lipsum} +\usepackage{epstopdf} +%\usepackage[linesnumbered]{algorithm2e} +\usepackage{multicol} +\usepackage[absolute,overlay]{textpos} +%\usepackage[linesnumbered]{algorithm2e} +\DeclareMathOperator*{\argmin}{arg\,min} +\DeclareMathOperator*{\spann}{span} +\DeclareMathOperator*{\diag}{diag} +%%%%%%%%%% Regler Beamer themes %%%%%%%%%%%%%%%%% +%\usepackage[lionbackground]{beamerthemeRegler} +%\usepackage[lioncorner]{beamerthemeRegler} +%\usepackage[lionheader]{beamerthemeRegler} +\usetheme[lionheader]{Regler} +%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% +%\usetheme{Malmoe} + +\addtobeamertemplate{navigation symbols}{}{% + \usebeamerfont{footline}% + \usebeamercolor[fg]{footline}% + \hspace{1em}% + \insertframenumber/\inserttotalframenumber +} + +% Title page +\title{Deep-Learning Study Circle: \\ + \medskip + Reinforcement Learning} +\author{Gabriel Ingesson} +\institute[Lund University] +{ +\vspace{14pt} +} + +\makeatletter +\newcommand{\setnextsection}[1]{% + \setcounter{section}{\numexpr#1-1\relax}% + \beamer@tocsectionnumber=\numexpr#1-1\relax\space} +\makeatother + +% Slide numbering +\definecolor{FootGrey}{RGB}{83,121,170} 
+\setbeamercolor{foot}{fg=blue,bg=white}
+\setbeamertemplate{footline}{
+  \begin{beamercolorbox}[right, sep=2.5pt]{foot}
+    \insertframenumber{} / \inserttotalframenumber
+  \end{beamercolorbox}
+}
+
+\tikzstyle{sensor}=[draw, fill=blue!20, text width=5em,
+  text centered, minimum height=2.5em]
+\tikzstyle{ann} = [above, text width=5em]
+\tikzstyle{naveqs} = [sensor, text width=6em, fill=red!20,
+  minimum height=12em, rounded corners]
+\def\blockdist{2.3}
+\def\edgedist{2.5}
+
+% Outline at the start of each section
+\AtBeginSection[]
+{
+
+}
+\setbeamercovered{invisible}
+\begin{document}
+
+{
+\setbeamertemplate{footline}{}
+\begin{frame}[noframenumbering]
+  \titlepage
+\end{frame}
+}
+
+\begin{frame}
+\frametitle{Reinforcement Learning}
+
+\begin{itemize}
+  \item The problem where an agent has to learn a policy (behavior) by taking actions in an environment, with the goal that the policy should maximize a cumulative reward.
+
+  \medskip
+  \item Different from supervised and unsupervised learning:
+  \begin{itemize}
+    \item No labeled training data.
+    \item A reward signal instead of a search for hidden structure.
+  \end{itemize}
+  \medskip
+  \item Reinforcement learning can be combined with deep neural networks, which are used to approximate the policy and cumulative-reward functions.
+\end{itemize}
+
+\end{frame}
+ % Today's presentation is about reinforcement learning, an area of machine learning where an agent (software) has to learn a policy or behavior by taking actions in an environment, with the goal that the policy should maximize a cumulative reward.
+
+ % The interaction part makes it different from supervised and unsupervised learning.
+
+ % The connection with deep learning is that reinforcement learning can be combined with deep neural networks, which are used to approximate policy and cumulative-reward functions.
+
+
+
+\begin{frame}
+  \frametitle{Reinforcement Learning}
+  \begin{textblock*}{15.0cm}(2.0cm,2.0cm) % {block width} (coords)
+    \includegraphics[width=8cm]{RL.png}
+  \end{textblock*}
+
+  \begin{textblock*}{15.0cm}(0.1cm,6.0cm) % {block width} (coords)
+  \begin{itemize}
+    \item Initially, the agent does not have to know anything about the environment.
+    \medskip
+    \item The agent receives a reward signal and the environment state.
+    \medskip
+    \item It adjusts its actions in order to maximize the cumulative reward.
+  \end{itemize}
+  \end{textblock*}
+\end{frame}
+
+% setup looks something like this:
+% agent & environment
+% Initially, the agent does not have to know anything about the environment.
+
+% The agent receives a reward signal and the environment state.
+
+% Adjusts its actions in order to maximize the cumulative reward.
+
+% goal: from experience, by trial and error, learn to choose actions that maximize the reward.
+
+\begin{frame}
+  \frametitle{Examples}
+
+  \begin{itemize}
+%    \item \href{https://www.youtube.com/watch?v=VCdxqn0fcnE}{Helicopter}
+%    \medskip
+    \item \href{https://www.youtube.com/watch?v=W_gxLKSsSIE}{Pancake Robot}
+    \medskip
+    \medskip
+    \item \href{https://www.youtube.com/watch?v=V1eYniJ0Rnk}{Atari Game}
+    %https://sites.google.com/site/visuomotorpolicy/
+  \end{itemize}
+\end{frame}
+% Two examples.
+
+% pancake robot: learns how to flip a pancake.
+% state is the pancake trajectory
+% reward is how good the trajectory was
+% action is a change of controller parameters.
+
+% image; action is left/right position; reward is the score.
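+% Illustrative addition (not from the original study-circle slides): the agent--environment
+% loop above, written against the OpenAI Gym API used in the homework. The environment
+% name 'CartPole-v0' and the random placeholder policy are assumptions for illustration only.
+\begin{frame}[fragile]
+  \frametitle{The Agent--Environment Loop in Code}
+  A minimal sketch, assuming the classic OpenAI Gym API and a random policy as a
+  placeholder for the agent:
+  \begin{lstlisting}[language=Python, basicstyle=\scriptsize\ttfamily]
+import gym
+
+env = gym.make('CartPole-v0')       # example environment (assumption)
+state = env.reset()                 # agent observes the initial state S_0
+episode_return, done = 0.0, False
+
+while not done:
+    action = env.action_space.sample()            # placeholder for the learned policy
+    state, reward, done, info = env.step(action)  # environment emits S_{t+1}, R_{t+1}
+    episode_return += reward                      # undiscounted return G_0
+  \end{lstlisting}
+\end{frame}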
+
+
+\begin{frame}
+  \frametitle{Overview}
+
+  \begin{itemize}
+  \item Based on:
+  \begin{itemize}
+    \item \href{https://webdocs.cs.ualberta.ca/~sutton/book/the-book.html}{Sutton, Richard S., and Andrew G. Barto. Reinforcement Learning: An Introduction. Cambridge: MIT Press, 1998.}
+    \medskip
+    \item \href{http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching.html}{David Silver's course}
+  \end{itemize}
+  \medskip
+  \item Mathematical foundation:
+  \begin{itemize}
+    \item Markov decision processes
+    \item Dynamic programming and the Bellman equation
+  \end{itemize}
+  \medskip
+  \item Different learning algorithms:
+  \begin{itemize}
+    \item Monte Carlo learning
+    \item Temporal-difference learning
+    \begin{itemize}
+      \item SARSA
+      \item Q-learning
+    \end{itemize}
+
+  \end{itemize}
+  \item Relation to deep learning:
+  \begin{itemize}
+    \item Deep ANNs for function approximation, Deep Q-Network (DQN)
+    \item Policy gradients
+  \end{itemize}
+  \item Homework -- OpenAI Gym
+  \end{itemize}
+
+\end{frame}
+% Today's presentation is based on the book Reinforcement Learning: An Introduction and an online course by David Silver, one of the people behind the Atari-playing software.
+
+% first I will talk about the mathematical foundation behind RL algorithms:
+% Markov decision processes, dynamic programming and the Bellman equation,
+% and common algorithms.
+
+% This can be related to deep learning by approximating the policy or cumulative reward with a deep neural network.
+
+% discuss the homework.
+
+
+\begin{frame}
+  \frametitle{Definitions}
+  %Definitions: \\
+  \medskip
+  At each step $t$ the agent:
+  \begin{itemize}
+    \item Receives an observation of the environment state, $S_t$.
+    \item Receives a scalar reward, $R_t$.
+    \item Executes an action, $A_t$.
+  \end{itemize}
+  \medskip
+  \medskip
+  and the environment:
+  \begin{itemize}
+    \item Receives the action, $A_t$.
+    \item Updates the state, $S_{t+1}$.
+    \item Emits a scalar reward, $R_{t+1}$.
+  \end{itemize}
+\end{frame}
+% read from screen
+\begin{frame}
+  \frametitle{Reward}
+  \begin{itemize}
+    \item A reward $R_t$ is a scalar feedback signal. \medskip
+    \item It indicates how well the agent is doing at step $t$. \medskip
+    \item All goals can be described by the maximization of expected cumulative reward. \medskip
+  \end{itemize}
+  The return $G_t$ is the total discounted reward from time-step $t$: \medskip
+  \[
+  G_t = R_{t+1} + \gamma R_{t+2} + \cdots = \sum_{k=0}^{\infty}\gamma^k R_{t+k+1} ,
+  \]
+  where $\gamma \in [0,1)$ is a discount factor that favors immediate rewards.
+\end{frame}
+% Avoids infinite returns in cyclic Markov processes
+% Uncertainty about the future may not be fully represented
+\begin{frame}
+  \frametitle{Policy}
+  A policy $\pi$ is the agent's behaviour, a map from state to action.
+
+
+  \begin{itemize}
+    \item $\text{Deterministic: } a = \pi(s)$.
+    \medskip
+    \item $\text{Stochastic: } \pi(a|s) = \mathbb{P}[A_{t} = a |S_t = s].$
+  \end{itemize}
+
+  The policy should be adjusted in order to maximize the return, $G_t$.
+\end{frame}
+
+\begin{frame}
+  \frametitle{State-Value Function, $v_{\pi}(s)$}
+  Evaluates a state, given a policy $\pi$.
+  \medskip
+
+  The state-value function $v_{\pi}(s)$ is a prediction of the return $G_t$ given a policy $\pi$ and the current state $S_t$:
+
+  \[
+  v_{\pi}(s) = \mathbb{E}_{\pi}[G_t | S_{t} = s] .
+  \]
+
+  It is used to evaluate a state and helps to select actions.
+\end{frame}
+%So how do we evaluate how good a policy is? We define the state-value function, which gives the expected return given s and the policy -> it is used to evaluate a state and helps to select actions.
+
+\begin{frame}
+  \frametitle{Action-Value Function, $q_{\pi}(s,a)$}
+  Evaluates an alternative action, given a policy $\pi$.
+  \medskip
+
+  The action-value function $q_{\pi}(s,a)$ is the expected return starting from state $s$, taking action $a$, and then following policy $\pi$:
+  \[
+  q_{\pi}(s,a) = \mathbb{E}_{\pi}[G_{t} | S_{t} = s, A_t = a] .
+  \]
+  \medskip
+  It is used to evaluate actions and helps to update the policy.
+\end{frame}
+% action-value function, evaluates an alternative action, given a policy
+% what is the expected return if I take an arbitrary action a in this step and then follow my policy?
+
+% the function is later used to improve the policy.
+
+\begin{frame}
+  \frametitle{The Markov Property}
+  We assume that the environment state $S_t$ fulfills the Markov property:
+  \[
+  \mathbb{P}[S_{t+1},R_{t+1} | A_t,S_t,A_{t-1},S_{t-1},\cdots,A_{0},S_{0}] = \mathbb{P}[S_{t+1},R_{t+1} | A_t,S_t]
+  \]
+  \\
+  \medskip
+  The current state captures the relevant information from all past states and actions. \\
+  \medskip
+  The Markov property is important in reinforcement learning because decisions and value functions are assumed to be functions only of the current state.
+\end{frame}
+% We assume that the environment state $S_t$ fulfills the Markov property:
+% The current state captures the information of all past states and actions.
+
+% makes things simpler, especially the analysis.
+
+\begin{frame}
+  \frametitle{Markov Decision Process}
+  A Markov decision process (MDP) is a Markov process with decisions and rewards, a framework for modeling decision making.
+
+  A finite Markov decision process is a tuple $\langle \mathcal{S},\mathcal{A},\mathcal{P},\mathcal{R},\gamma \rangle$ where
+  \begin{itemize}
+    \small
+    \item $\mathcal{S}$ is a finite set of states.
+    \medskip
+    \item $\mathcal{A}$ is a finite set of actions.
+    \medskip
+    \item $\mathcal{P}$ is a transition probability matrix: $\mathcal{P}^a_{ss\textprime} = \mathbb{P}[S_{t+1}= s\textprime|S_t = s, A_t = a]$
+    \medskip
+    \item $\mathcal{R}$ is a reward function: $\mathcal{R}_s^a = \mathbb{E}[R_{t+1}|S_t=s, A_t =a]$
+    \medskip
+    \item $\gamma$ is a discount factor, $\gamma \in [0,1)$.
+    \medskip
+  \end{itemize}
+  \small
+  The core problem of an MDP is to find a policy for the agent that maximizes the return. ``MDPs are $90\,\%$ of modern reinforcement learning.''
+\end{frame}
+% with the assumed Markov property:
+
+% The environment is typically formulated as a Markov decision process (MDP), a nice framework for modeling decision making, as many reinforcement-learning algorithms in this context utilize dynamic-programming techniques.
+
+\begin{frame}
+  \frametitle{Dynamic Programming}
+  \small
+  \begin{itemize}
+    \item Dynamic programming is an optimization method that solves a problem by breaking it down into simpler subproblems, solving each of those subproblems just once, and storing their solutions. It can be applied to Markov decision processes.
+
+    \medskip
+    \medskip
+
+    \item Principle of Optimality: An optimal policy has the property that whatever the initial state and initial decision are, the remaining decisions must constitute an optimal policy with regard to the state resulting from the first decision. (See Bellman, 1957, Chap. III.3.)
+  \end{itemize}
+
+\end{frame}
+% Utilizes something called dynamic programming, which is an optimization method well suited for these kinds of problems; it divides the problem into simpler subproblems.
+
+% the reason we can do this is because of the principle of optimality, which says that (add figure)
+\begin{frame}
+  \frametitle{The Bellman Equation}
+  \small
+  The value functions can be decomposed into the immediate reward plus the discounted value of the successor state. \\
+  \medskip
+  A recursive relationship ($\textprime$ denotes the subsequent state/action):
+  \medskip
+  \begin{align*}
+  v_{\pi}(s) &= \mathbb{E}[R_{t+1} + \gamma v_{\pi}(S_{t+1}) |S_t=s] \\
+  &= \sum_{a \in \mathcal{A}} \pi(a|s) \left( \mathcal{R}_s^a + \gamma \sum_{s\textprime \in \mathcal{S}} \mathcal{P}^a_{ss\textprime} v_{\pi}(s\textprime) \right) \\[3ex]
+  q_{\pi}(s,a) &= \mathbb{E}[R_{t+1} + \gamma q_{\pi}(S_{t+1},A_{t+1}) |S_t=s, A_t =a] \\
+  &= \mathcal{R}_s^a + \gamma \sum_{s \textprime \in \mathcal{S} } \mathcal{P}^a_{ss\textprime} \sum_{a \textprime \in \mathcal{A} }\pi(a\textprime|s\textprime) q_{\pi}(s\textprime,a\textprime)
+  \end{align*}
+  \medskip
+  $v_{\pi}$ and $q_{\pi}$ are the unique solutions to these equations, which can be used to compute, approximate and learn $v_{\pi}$ and $q_{\pi}$.
+\end{frame}
+% We can see what that means on this slide: write the value function as the expected immediate reward plus the value function of the subsequent state.
+
+% recursive relation -> allows for recursive methods.
+
+% they are unique solutions to these equations, which can be used to compute, approximate and learn v and q.
+\begin{frame}
+  \frametitle{Optimal Value Functions}
+  The optimal value functions are the maximum value functions over all policies: \\
+  \medskip
+  \begin{itemize}
+    \item $v_{*}(s) = \max_{\pi}v_{\pi}(s)$ \\
+    \medskip
+    \item $q_{*}(s,a) = \max_{\pi}q_{\pi}(s,a)$
+  \end{itemize}
+  \begin{theorem}
+  For any Markov decision process
+  \begin{itemize}
+    \item There exists an optimal policy $\pi_*$ that is better than or equal to all other policies, $\pi_* \geq \pi,~\forall \pi$.
+    \item All optimal policies achieve the optimal value function, $v_{\pi_*}(s) = v_*(s)$.
+    \item All optimal policies achieve the optimal action-value function, $q_{\pi_*}(s,a) = q_*(s,a)$.
+  \end{itemize}
+  \end{theorem}
+\end{frame}
+
+% what we want to do is to find the maximum value functions over all policies, the optimal value functions.
+
+\begin{frame}
+  \frametitle{Bellman Optimality Equation}
+  The optimal value functions are recursively related by the Bellman optimality equations:
+  \begin{align*}
+  v_*(s) &= \max_a q_*(s,a) = \max_a \left( \mathcal{R}_s^a + \gamma \sum_{s \textprime \in \mathcal{S}} \mathcal{P}_{ss\textprime}^a v_*(s\textprime) \right) \\
+  q_*(s,a) &= \mathcal{R}_s^a + \gamma \sum_{s \textprime \in \mathcal{S}} \mathcal{P}_{ss\textprime}^a \max_{a\textprime} q_*(s\textprime,a\textprime)
+  \end{align*}
+  \begin{itemize}
+    \small
+    \item The Bellman optimality equations form a non-linear system of equations.
+    \item Solving them exactly has an extreme computational cost.
+    \item In reinforcement learning one typically has to settle for approximate solutions:
+    \begin{itemize}
+      \small
+      \item Value iteration
+      \item Policy iteration
+      \item Q-learning
+      \item Sarsa
+    \end{itemize}
+  \end{itemize}
+\end{frame}
+
+% use the Bellman equation w.r.t. this optimal policy
+
+% solve this set of nonlinear equations.
+
+% reinforcement learning: approximate solutions
+% ----------------------------------------------- %
+% the intuition here??
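+% Illustrative addition (not from the original slides): solving the Bellman optimality
+% equation by value iteration on a toy MDP. The matrices P and R, the discount factor,
+% and the two-state setup are made-up assumptions, chosen only to make the backup concrete.
+\begin{frame}[fragile]
+  \frametitle{Sketch: Value Iteration on a Toy MDP}
+  A minimal sketch, assuming a small MDP with known dynamics, of the backup
+  $v_{k+1}(s) = \max_a ( \mathcal{R}_s^a + \gamma \sum_{s\textprime} \mathcal{P}^a_{ss\textprime} v_k(s\textprime) )$:
+  \begin{lstlisting}[language=Python, basicstyle=\scriptsize\ttfamily]
+import numpy as np
+
+# Toy 2-state, 2-action MDP; all numbers are made up for illustration.
+P = np.array([[[0.9, 0.1], [0.2, 0.8]],    # P[a, s, s']
+              [[0.5, 0.5], [0.0, 1.0]]])
+R = np.array([[1.0, 0.0], [0.0, 2.0]])     # R[s, a]
+gamma, v = 0.9, np.zeros(2)
+
+for _ in range(1000):                      # Bellman optimality backup
+    q = R + gamma * (P @ v).T              # q[s,a] = R[s,a] + gamma * sum_s' P[a,s,s'] v[s']
+    v_new = q.max(axis=1)                  # v_{k+1}(s) = max_a q(s,a)
+    if np.abs(v_new - v).max() < 1e-8:
+        break
+    v = v_new
+greedy_policy = q.argmax(axis=1)           # an optimal policy is greedy w.r.t. v*
+  \end{lstlisting}
+\end{frame}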
+%Because v* is the value function for a policy, it must satisfy the self-consistency
+% condition given by the Bellman equation for state values.
+
+% Intuitively, the Bellman optimality equation
+%expresses the fact that the value of a state under an optimal policy must equal the
+% expected return for the best action from that state.
+
+% The Bellman optimality equation is actually a system of
+%equations, one for each state, so if there are N states, then there are N equations in
+%N unknowns. If the dynamics of the environment are known (p(s',r|s,a)), then in
+%principle one can solve this system of equations for v* using any one of a variety of
+%methods for solving systems of nonlinear equations.
+
+%Once one has v*, it is relatively easy to determine an optimal policy. For each
+%state s, there will be one or more actions at which the maximum is obtained in
+%the Bellman optimality equation. Any policy that assigns nonzero probability only
+%to these actions is an optimal policy.
+
+% Another way of saying this is that any
+% policy that is greedy with respect to the optimal evaluation function v* is an optimal
+% policy. The term greedy is used in computer science to describe any search or
+% decision procedure that selects alternatives based only on local or immediate considerations,
+% without considering the possibility that such a selection may prevent
+% future access to even better alternatives.
+
+% Having q* makes choosing optimal actions still easier.
+% Many different decision-making methods can be viewed as ways of approximately solving the Bellman optimality equation.
+
+% The key idea of DP, and of reinforcement learning generally, is the use of value
+%functions to organize and structure the search for good policies.
+
+% Historical remark:
+% The reinforcement learning problem is deeply indebted to the idea of Markov decision
+% processes (MDPs) from the field of optimal control.
+
+% The counterpart
+%of the Bellman optimality equation for continuous time and state
+%problems is known as the Hamilton--Jacobi--Bellman equation (or often just
+%the Hamilton--Jacobi equation), indicating its roots in classical physics (e.g.,
+%Schultz and Melsa, 1967).
+\begin{frame}
+  \frametitle{Iterative Policy Evaluation \& Improvement}
+  Policy evaluation, iterative application of the Bellman equation:
+  \[
+  v_1 \rightarrow v_2 \rightarrow v_3 \rightarrow \cdots \rightarrow v_{\pi}
+  \]
+  \[
+  v_{k+1}(s) = \sum_{a \in \mathcal{A}} \pi(a|s) \left( \mathcal{R}_s^a + \gamma \sum_{ s\textprime \in \mathcal{S} } \mathcal{P}_{ss\textprime}^a v_k( s\textprime) \right)
+  \]
+  \medskip
+  \medskip
+  Policy improvement:
+  \[
+  \pi \textprime(s) = \text{greedy}(v_{\pi}) = \text{argmax}_{a \in \mathcal{A}} q_{\pi}(s,a).
+  \]
+  Policy iteration:
+  \[
+  \pi_0 \rightarrow v_{\pi_0} \rightarrow \pi_1 \rightarrow v_{\pi_1} \rightarrow \cdots \rightarrow \pi_* \rightarrow v_{\pi_*}
+  \]
+  If the improvements stop, then the Bellman optimality equation has been satisfied.
+\end{frame}
+
+
+% use the Bellman equation iteratively
+% until convergence
+% update greedily -> maximize the action-value function
+% remember: q was the immediate return given an alternative a, plus the subsequent value function of the old policy.
+
+
+% explain greedy.
+% the sequence {vk} can
+%be shown in general to converge to v_pi as k -> infinity under the same conditions that
+%guarantee the existence of v_pi. This algorithm is called iterative policy evaluation.
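+% Illustrative addition (not from the original slides): policy iteration -- iterative policy
+% evaluation followed by greedy policy improvement -- on the same kind of toy MDP as in the
+% value-iteration sketch. All numbers are made-up assumptions for illustration.
+\begin{frame}[fragile]
+  \frametitle{Sketch: Policy Iteration on a Toy MDP}
+  A minimal sketch of the evaluation/improvement loop from the previous slide, assuming
+  a small MDP with known dynamics and a deterministic policy:
+  \begin{lstlisting}[language=Python, basicstyle=\scriptsize\ttfamily]
+import numpy as np
+
+P = np.array([[[0.9, 0.1], [0.2, 0.8]],    # toy dynamics P[a, s, s'] (assumption)
+              [[0.5, 0.5], [0.0, 1.0]]])
+R = np.array([[1.0, 0.0], [0.0, 2.0]])     # toy rewards R[s, a] (assumption)
+gamma, states = 0.9, np.arange(2)
+pi = np.zeros(2, dtype=int)                # arbitrary initial deterministic policy
+
+while True:
+    v = np.zeros(2)                        # policy evaluation: Bellman expectation backup
+    for _ in range(1000):
+        v = R[states, pi] + gamma * P[pi, states] @ v
+    q = R + gamma * (P @ v).T              # policy improvement: act greedily w.r.t. q_pi
+    pi_new = q.argmax(axis=1)
+    if np.array_equal(pi_new, pi):         # improvements stopped -> Bellman optimality holds
+        break
+    pi = pi_new
+  \end{lstlisting}
+\end{frame}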
+
+% One drawback to policy iteration is that each of its iterations involves policy evaluation,
+% which may itself be a protracted iterative computation requiring multiple
+% sweeps through the state set.
+
+
+\begin{frame}
+  \frametitle{Iterative Policy Evaluation \& Improvement}
+  \begin{textblock*}{15.0cm}(1.5cm,2.0cm) % {block width} (coords)
+    \includegraphics[width=10cm]{policy_iter.png}
+    \\
+    \medskip
+    \medskip
+    \medskip
+    \tiny Source:
+    \href{https://webdocs.cs.ualberta.ca/~sutton/book/the-book.html}{\tiny Sutton, Richard S., and Andrew G. Barto. Reinforcement Learning: An Introduction.}
+
+  \end{textblock*}
+
+\end{frame}
+
+\begin{frame}
+  \frametitle{Iterative Policy Evaluation \& Improvement}
+  \small
+  1. Initialize $V(s) \in \mathbb{R}$ and $\pi(s) \in \mathcal{A}(s)$ arbitrarily for all $s \in \mathcal{S}$.
+
+  2. Policy Evaluation: \\
+  Repeat \\
+  \hspace*{20pt} $\Delta \leftarrow 0$ \\
+  \hspace*{20pt} For each $s \in \mathcal{S}$ \\
+  \hspace*{40pt} $v \leftarrow V(s)$ \\
+  \hspace*{40pt} $V(s) \leftarrow \sum_{s\textprime,r} p(s\textprime,r|s,\pi(s))[r + \gamma V(s\textprime)]$ \\
+  \hspace*{40pt} $\Delta \leftarrow \max(\Delta,|v-V(s)|)$ \\
+  until $\Delta \leq \epsilon$
+
+  3. Policy Improvement: \\
+  policy-stable $\leftarrow$ true \\
+  For each $s \in \mathcal{S}$: \\
+\hspace*{20pt} old-action $\leftarrow \pi(s)$ \\
+\hspace*{20pt} $\pi(s) \leftarrow \text{argmax}_a \sum_{s\textprime,r} p(s\textprime,r|s,a)[r + \gamma V(s\textprime)]$\\
+\hspace*{20pt} If old-action $\neq$ $\pi(s)$, then policy-stable $\leftarrow$ false \\
+  If policy-stable, then stop and return $V \approx v_*$ and $\pi \approx \pi_*$, else go to 2
+
+
+  \begin{textblock*}{15.0cm}(9.0cm,3.0cm) % {block width} (coords)
+    \includegraphics[width=4cm]{backup.png}
+  \end{textblock*}
+\end{frame}
+% As long as both processes continue to update
+% all states
+
+\begin{frame}
+  \frametitle{Generalized Policy Iteration}
+  \begin{itemize}
+
+
+    \item We need complete knowledge of the environment.
+    \medskip
+    \item Policy iteration also suffers from Bellman's curse of dimensionality for large problems.
+
+    \medskip
+    \item However, the general idea of letting policy-evaluation and policy-improvement
+    processes interact is used in almost all reinforcement-learning methods.
+  \end{itemize}
+  % include figure
+\end{frame}
+%We need complete knowledge of the environment.
+
+% Policy iteration also suffers from Bellman's curse of dimensionality for large problems.
+
+% However, the general idea of letting policy evaluation and policy improvement processes interact is used in almost all reinforcement-learning methods.
+
+\begin{frame}
+  \frametitle{Monte-Carlo Reinforcement Learning}
+
+\begin{itemize}
+
+\item Model-free: no prior knowledge about the environment is required.
+\medskip
+\item Monte Carlo methods require only experience, i.e.\ sample sequences of states, actions, and rewards from interaction with the environment.
+\medskip
+\item Learns from complete episodes and updates the policy from the computed return, $G_t$.
+\end{itemize}
+\end{frame}
+
+% Model-free: no prior knowledge about the environment.
+
+% Monte Carlo methods require only experience, i.e. sample sequences of states, actions, and rewards from interaction with the environment.
+
+% Learns from complete episodes, updates the policy from the computed return, $G_t$.
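+% Illustrative addition (not from the original slides): first-visit Monte-Carlo policy
+% evaluation, matching the pseudocode on the following slide. The environment is assumed
+% to be episodic and Gym-like, with hashable (e.g. discrete) states.
+\begin{frame}[fragile]
+  \frametitle{Sketch: First-Visit Monte-Carlo Evaluation}
+  A minimal sketch, assuming a Gym-style episodic environment and a given policy function:
+  \begin{lstlisting}[language=Python, basicstyle=\tiny\ttfamily]
+from collections import defaultdict
+
+def first_visit_mc(env, policy, episodes=1000, gamma=1.0):
+    returns, V = defaultdict(list), defaultdict(float)   # Returns(s) and V(s)
+    for _ in range(episodes):
+        # generate an episode S_0, R_1, S_1, R_2, ... following the given policy
+        trajectory, state, done = [], env.reset(), False
+        while not done:
+            next_state, reward, done, _ = env.step(policy(state))
+            trajectory.append((state, reward))
+            state = next_state
+        # remember the first occurrence of each state
+        first = {}
+        for t, (s, _) in enumerate(trajectory):
+            first.setdefault(s, t)
+        # walk backwards, accumulating G_t, and average the first-visit returns
+        G = 0.0
+        for t in reversed(range(len(trajectory))):
+            s, r = trajectory[t]
+            G = r + gamma * G
+            if first[s] == t:
+                returns[s].append(G)
+                V[s] = sum(returns[s]) / len(returns[s])
+    return V
+  \end{lstlisting}
+\end{frame}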
+
+\begin{frame}
+  \frametitle{Monte-Carlo Policy Evaluation}
+  Initialize: \\
+  \medskip
+  \hspace*{20pt} $\pi \leftarrow$ policy to be evaluated \\
+  \hspace*{20pt} $V \leftarrow$ an arbitrary state-value function \\
+  \hspace*{20pt} $\text{Returns}(s) \leftarrow$ an empty list, $\forall s \in \mathcal{S}$ \\
+  \medskip
+  Repeat:
+
+  \hspace*{20pt} Generate an episode using $\pi$ \\
+  \hspace*{20pt} For each state $s$ appearing in the episode: \\
+  \hspace*{40pt} $G \leftarrow$ return following the first occurrence of $s$ \\
+  \hspace*{40pt} Append $G$ to $\text{Returns}(s)$ \\
+  \hspace*{40pt} $V(s) \leftarrow \text{average}(\text{Returns}(s))$ \\
+  \medskip
+By the law of large numbers: \\
+\medskip
+\hspace*{20pt} $V(s) \rightarrow v_{\pi}(s)$ as the number of visits to $s \rightarrow \infty$ \\
+\hspace*{20pt} the error falls as $1/\sqrt{n}$
+\end{frame}
+% what is an episode?
+\begin{frame}
+  \frametitle{Monte-Carlo Reinforcement Learning}
+  \small
+  Initialize, for all $s \in \mathcal{S}, a \in \mathcal{A}$: \\
+  \medskip
+  \hspace*{20pt} $Q(s,a) \leftarrow$ arbitrary \\
+  \hspace*{20pt} $\pi(s) \leftarrow$ arbitrary \\
+  \hspace*{20pt} $\text{Returns}(s,a) \leftarrow$ an empty list \\
+  \medskip
+  Repeat:
+
+  \hspace*{20pt} Choose $S_0 \in \mathcal{S}$ and $A_0 \in \mathcal{A}(S_0)$ randomly \\
+  \hspace*{20pt} Generate an episode starting from $S_0,A_0$, following $\pi$ \\
+  \hspace*{20pt} For each pair $s,a$ appearing in the episode: \\
+  \hspace*{40pt} $G \leftarrow$ return following the first occurrence of $s,a$ \\
+  \hspace*{40pt} Append $G$ to $\text{Returns}(s,a)$ \\
+  \hspace*{40pt} $Q(s,a) \leftarrow \text{average}(\text{Returns}(s,a))$ \\
+  \hspace*{20pt} For each $s$ in the episode: \\ \medskip
+  \hspace*{40pt} $\pi(a|s) = \begin{cases}
+  1 - \epsilon + \epsilon/|\mathcal{A}| & \text{if } a = \text{argmax}_{a\textprime} Q(s,a\textprime) \\
+  \epsilon/|\mathcal{A}| & \text{otherwise}
+  \end{cases}$
+\end{frame}
+% is this really right?
+
+% the reason for this is that we want to explore the whole state space so that we don't end up in a local minimum; the classic trade-off between exploitation and exploration in RL.
+\begin{frame}
+  \frametitle{Exploitation vs. Exploration}
+  $\epsilon$-Greedy Exploration
+
+  \[
+  \pi(a|s) = \begin{cases}
+  1 - \epsilon + \epsilon/|\mathcal{A}| & \text{if } a = \text{argmax}_{a\textprime \in \mathcal{A}} Q(s,a\textprime) \\
+  \epsilon/|\mathcal{A}| & \text{otherwise}
+  \end{cases}
+  \]
+\end{frame}
+
+\begin{frame}
+  \frametitle{Monte-Carlo Reinforcement Learning}
+
+\begin{itemize}
+  \item MC methods do not use any local information, i.e.\ no bootstrapping as in policy iteration.
+  \medskip
+  \item They do not update their value estimates on the basis of other value estimates.
+  \medskip
+  \item They only update $v(s)$ and $q(s,a)$ after completed episodes.
+\end{itemize}
+\end{frame}
+
+ % MC methods do not use any local information, i.e. no bootstrapping as in policy iteration.
+ % They do not update their value estimates on the basis of other value estimates.
+ % They only update $v(s)$ and $q(s,a)$ after completed episodes.
+
+% Doesn't use any local information in the learning.
+
+% More comments on Monte Carlo updates, pros and cons...
+
+% Monte Carlo methods do not bootstrap
+
+\begin{frame}
+  \frametitle{Temporal-Difference (TD) Learning}
+
+  \begin{itemize}
+    \item TD learning is a combination of Monte Carlo ideas and dynamic programming (DP) ideas.
+    \medskip
+    \item Like Monte Carlo methods, TD methods can learn directly from raw experience without a model.
+    \medskip
+    \item Like DP, TD methods update estimates based in part
+    on other learned estimates, i.e.
use bootstrapping.
+  \end{itemize}
+  Bellman equation: \\
+  \begin{align*}
+  v_{\pi}(s) &= \mathbb{E}[R_{t+1} + \gamma v_{\pi}(S_{t+1}) |S_t=s]
+  \end{align*}
+  Estimate update, $\alpha \in [0,1]$: \\
+  \[
+  V(S_t) \leftarrow V(S_t) + \alpha(R_{t+1} + \gamma V(S_{t+1}) - V(S_t))
+  \]
+%  \begin{itemize}
+%    \item $R_{t+1} + \gamma V(S_{t+1})$ is the TD target.
+%    \item $\delta_t = R_{t+1} + \gamma V(S_{t+1}) - V(S_t)$ is the TD error.
+%  \end{itemize}
+\end{frame}
+
+\begin{frame}
+  \frametitle{TD Policy Evaluation}
+  Initialize: \\
+  \medskip
+  \hspace*{20pt} $\pi \leftarrow$ policy to be evaluated \\
+  \hspace*{20pt} $V \leftarrow$ an arbitrary state-value function \\
+
+  \medskip
+  Repeat (for each episode):
+
+  \hspace*{20pt} Initialize $S$ \\
+  \hspace*{20pt} Repeat (for each step of the episode): \\
+  \hspace*{40pt} $A \leftarrow$ action given by $\pi$ for $S$ \\
+  \hspace*{40pt} Take action $A$, observe $R\textprime$, $S\textprime$ \\
+  \hspace*{40pt} $V(S) \leftarrow V(S) + \alpha(R\textprime + \gamma V(S\textprime) - V(S))$ \\
+  \hspace*{40pt} $S \leftarrow S\textprime$ \\
+  \hspace*{20pt} until $S$ is terminal \\
+  \medskip
+\end{frame}
+
+% this is done in a more popular class of RL methods.
+
+% a combination of Monte Carlo ideas and dynamic programming (DP) ideas
+
+% TD resembles a Monte Carlo method because it learns by sampling the environment according to some
+% policy, and is related to dynamic programming techniques as it approximates its current estimate based on
+% previously learned estimates (a process known as bootstrapping), without waiting for a final outcome.
+
+% Obviously, TD methods have an advantage over DP methods in that they do
+%not require a model of the environment, of its reward and next-state probability
+%distributions.
+
+
+%If both TD and Monte Carlo methods converge asymptotically to the correct predictions,
+%then a natural next question is "Which gets there first?" In other words,
+%which method learns faster? Which makes the more efficient use of limited data? At
+%the current time this is an open question in the sense that no one has been able to
+%prove mathematically that one method converges faster than the other. In fact, it
+%is not even clear what is the most appropriate formal way to phrase this question!
+%In practice, however, TD methods have usually been found to converge faster than
+%constant-alpha MC methods on stochastic tasks, as illustrated in Example 6.2.
+\begin{frame}
+  \frametitle{n-Step TD}
+  Feedback from the n-step return:
+  \[
+  G_t^n = R_{t+1} + \gamma R_{t+2} + \cdots + \gamma^{n-1}R_{t+n} + \gamma^n V(S_{t+n})
+  \]
+  \[
+  V(S_t) \leftarrow V(S_t) + \alpha ( G_t^n - V(S_t))
+  \]
+  \medskip
+  An intermediate algorithm between TD(0) and MC.
+\end{frame}
+% empirically, intermediate n performs better.
+\begin{frame}
+
+  \frametitle{TD($\lambda$)}
+  The $\lambda$-return $G_t^{\lambda}$ combines all n-step returns $G_t^n$:
+  \[
+  G_t^{\lambda} = (1-\lambda) \sum_{n=1}^{\infty} \lambda^{n-1} G_t^n
+  \]
+
+  \[
+  V(S_t) \leftarrow V(S_t) + \alpha ( G_t^{\lambda} - V(S_t))
+  \]
+
+\end{frame}
+% lambda = 1 -> MC
+%\begin{frame}
+%  \frametitle{Pros and Cons of MC and TD}
+%  MC has high variance, zero bias
+%  \begin{itemize}
+%    \item Good convergence properties
+%    \item Not very sensitive to the initial value.
+%    \item TD learns from incomplete episodes, by bootstrapping.
+%  \end{itemize}
+%  TD
+%  \begin{itemize}
+%    \item More efficient.
+%    \item Converges.
+%    \item More sensitive to the initial value.
+%  \end{itemize}
+%\end{frame}
+
+%\begin{frame}
+%  Greedy policy improvement over $Q(s,a)$ is model-free
+%  \[
+%  \pi \textprime (s) = \text{argmax}_{a \in \mathcal{A}} Q(s,a)
+%  \]
+%
+%  But we only talked about V.
+%\end{frame}
+
+
+\begin{frame}
+  \frametitle{Sarsa: TD Control Algorithm}
+
+  State-Action-Reward-State-Action (SARSA) \\
+  \medskip
+  $Q(S,A)$ depends on $(S, A, R\textprime, S\textprime, A\textprime)$
+  \medskip
+
+  Policy evaluation, $Q \approx q_{\pi}$: \\
+  \medskip
+  \[
+  Q(S,A) \leftarrow Q(S,A) + \alpha (R\textprime + \gamma Q(S\textprime,A\textprime) - Q(S,A) )
+  \]
+
+  \medskip
+  The policy is then improved by acting $\epsilon$-greedily w.r.t.\ $Q(S,A)$.
+
+\end{frame}
+
+% Sarsa converges with probability
+%1 to an optimal policy and action-value function as long as all state-action
+%pairs are visited an infinite number of times and the policy converges in the limit
+%to the greedy policy (which can be arranged, for example, with epsilon-greedy policies by
+%setting epsilon = 1/t), but this result has not yet been published in the literature.
+\begin{frame}
+  \frametitle{Sarsa: TD Control Algorithm}
+\begin{itemize}
+  \item Initialize $Q(s,a)$, $\forall s \in \mathcal{S}$, $a \in \mathcal{A}$, and $Q(\text{terminal-state},\cdot) = 0$
+  \item Repeat (for each episode)
+  \begin{itemize}
+    \item Initialize $S$
+    \item Choose $A$ from $S$ using a policy derived from $Q$ (e.g.\ $\epsilon$-greedy)
+    \item Repeat (for each step of the episode):
+    \begin{itemize}
+      \item Take action $A$, observe $R\textprime$, $S\textprime$.
+      \item Choose $A\textprime$ from $S\textprime$ using a policy derived from $Q$ (e.g.\ $\epsilon$-greedy)
+      \item $Q(S,A) \leftarrow Q(S,A) + \alpha [R\textprime + \gamma Q(S\textprime,A\textprime) -Q(S,A)]$
+      \item $S \leftarrow S\textprime,~A \leftarrow A \textprime$
+    \end{itemize}
+    \item until $S$ is terminal
+  \end{itemize}
+
+\end{itemize}
+
+\end{frame}
+
+\begin{frame}
+\frametitle{Q-Learning}
+\begin{itemize}
+  \item Initialize $Q(s,a)$, $\forall s \in \mathcal{S}$, $a \in \mathcal{A}$, and $Q(\text{terminal-state},\cdot) = 0$
+  \item Repeat (for each episode)
+  \begin{itemize}
+    \item Initialize $S$
+    % \item Choose $A$ from $S$ using a policy derived from $Q$ (e.g. $\epsilon$-greedy)
+    \item Repeat (for each step of the episode):
+    \begin{itemize}
+      \item Choose $A$ from $S$ using a policy derived from $Q$ (e.g.\ $\epsilon$-greedy)
+      \item Take action $A$, observe $R\textprime$, $S\textprime$.
+      \item $Q(S,A) \leftarrow Q(S,A) + \alpha [R\textprime + \gamma \max_a Q(S\textprime,a) -Q(S,A)]$
+      \item $S \leftarrow S\textprime$.
+    \end{itemize}
+    \item until $S$ is terminal
+  \end{itemize}
+
+% alpha = 1 is deterministic; decrease it for convergence, but an intermediate value is best.
+
+% The difference may be explained as follows: SARSA learns the Q values associated with the policy it follows itself, while Watkins's Q-learning learns the Q values associated with the exploitation (greedy) policy while following an exploration/exploitation policy.
+\end{itemize}
+%Say something general about Q-learning and the difference to SARSA. Some historical note.
+\end{frame}
+% One of the early breakthroughs in reinforcement learning was the development of an
+% off-policy TD control algorithm known as Q-learning.
+
+% the learned action-value function, Q, directly approximates q*, the optimal
+%action-value function, independent of the policy being followed. This dramatically
+%simplifies the analysis of the algorithm and enabled early convergence proofs.
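+% Illustrative addition (not from the original slides): tabular Q-learning as on the slide
+% above, assuming a Gym environment with discrete observation and action spaces
+% (e.g. 'FrozenLake-v0'); the hyperparameter values are arbitrary choices.
+\begin{frame}[fragile]
+  \frametitle{Sketch: Tabular Q-Learning}
+  A minimal sketch of the Q-learning update with an $\epsilon$-greedy behaviour policy:
+  \begin{lstlisting}[language=Python, basicstyle=\scriptsize\ttfamily]
+import numpy as np
+
+def q_learning(env, episodes=5000, alpha=0.1, gamma=0.99, eps=0.1):
+    Q = np.zeros((env.observation_space.n, env.action_space.n))
+    for _ in range(episodes):
+        s, done = env.reset(), False
+        while not done:
+            # epsilon-greedy action from the current Q table
+            a = env.action_space.sample() if np.random.rand() < eps else int(np.argmax(Q[s]))
+            s_next, r, done, _ = env.step(a)
+            # off-policy TD target: r + gamma * max_a Q(S', a); zero at terminal states
+            target = r + gamma * np.max(Q[s_next]) * (not done)
+            Q[s, a] += alpha * (target - Q[s, a])
+            s = s_next
+    return Q
+  \end{lstlisting}
+\end{frame}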
+\begin{frame}
+  \frametitle{Large-Scale Reinforcement Learning}
+  Reinforcement learning can be used to solve large problems, e.g.
+  \begin{itemize}
+    \item Backgammon: $10^{20}$ states
+    \item Computer Go: $10^{170}$ states
+    \item Helicopter: continuous state space
+  \end{itemize}
+  So far we have represented the value functions as lookup tables; this becomes slow and memory-expensive for large problems.
+
+  A solution is to estimate the value functions with function approximation:
+  \begin{align*}
+  \hat{v}(s,\theta) &\approx v_{\pi}(s) \\
+  \hat{q}(s,a,\theta) &\approx q_{\pi}(s,a)
+  \end{align*}
+  and to update the parameter $\theta$ using MC or TD learning.
+\end{frame}
+%
+% Typically, the
+% number of weights (the number of components of theta) is much less than the number
+% of states, and changing one weight changes the estimated value of many
+% states.
+%
+
+% Let us refer to an individual backup by the notation s -> g, where s is the state backed up
+% and g is the backed-up value, or target, that s's estimated value is shifted toward.
+% For example, the Monte Carlo backup for value prediction is S_t -> G_t,
+% the TD(0) backup is S_t -> R_{t+1} + gamma*vhat(S_{t+1},theta_t),
+% and the n-step TD backup is S_t -> G_t^(n).
+%
+%
+% supervised learning methods
+%
+\begin{frame}
+  \frametitle{Function Approximators}
+%  "The number of possible camera images is larger than the number of atoms in the universe". \\
+%  Objective to minimize:
+%  \[
+%  \text{MSVE}(\theta) = \sum_{s \in \mathcal{S}} d(s) [v_{\pi}(s) -\hat{v}(s,\theta) ]^2
+%  \]
+%  Most commonly minimized using SGD. \\
+  \medskip
+Examples of approximators:
+\begin{itemize}
+  \small
+  \item Linear, $\hat{v}(s,\theta) = \sum_i \theta_i \phi_i(s)$
+  %\begin{itemize}
+  %  \item polynomials
+  %  \item Radial basis functions
+  %  \item Fourier bases
+  %\end{itemize}
+  \medskip
+  \item Nonlinear, e.g.\ neural networks
+  %\item Fourier / wavelet bases
+\end{itemize}
+% figure here
+
+\end{frame}
+
+\begin{frame}
+  \frametitle{Value-Function Approximation}
+  Goal: find a parameter $\theta$ that minimizes the mean-squared error between the approximate value function $\hat{v}(S,\theta)$ and the true value function $v_{\pi}(S)$:
+  \[
+  J(\theta) = \frac{1}{2}\mathbb{E}_{\pi} [( v_{\pi}(S) -\hat{v}(S,\theta))^2]
+  \]
+  Gradient descent:
+  \begin{align*}
+  \Delta \theta = - \alpha \nabla_{\theta} J(\theta)
+  = \alpha \mathbb{E}[(v_{\pi}(S) -\hat{v}(S,\theta))\nabla_{\theta}\hat{v}(S,\theta) ]
+  \end{align*}
+  Stochastic gradient descent samples:
+  \begin{align*}
+  \Delta \theta &= \alpha \left( v_{\pi}(S)- \hat{v}(S,\theta) \right) \nabla_{\theta} \hat{v}(S,\theta)
+  \end{align*}
+  where
+  \[
+  v_{\pi}(S) = \begin{cases}
+  G_t & \text{ in MC} \\
+  R_{t+1} + \gamma \hat{v}(S_{t+1},\theta) & \text{ in TD(0)} \\
+  G_t^{\lambda} & \text{in TD}(\lambda)
+  \end{cases}
+  \]
+
+\end{frame}
+
+
+\begin{frame}
+  \frametitle{Action-Value Function Approximation}
+  Goal: find a parameter $\theta$ that minimizes the mean-squared error between the approximate action-value function $\hat{q}(S,A,\theta)$ and the true action-value function $q_{\pi}(S,A)$:
+  \[
+  J(\theta) = \mathbb{E}_{\pi} [( q_{\pi}(S,A) -\hat{q}(S,A,\theta))^2]
+  \]
+  Gradient descent:
+  \begin{align*}
+  \Delta \theta = -\frac{1}{2} \alpha \nabla_{\theta} J(\theta)
+  = \alpha \mathbb{E}[(q_{\pi}(S,A) -\hat{q}(S,A,\theta))\nabla_{\theta}\hat{q}(S,A,\theta) ]
+  \end{align*}
+  Stochastic gradient descent samples:
+  \begin{align*}
+  \Delta \theta &= \alpha \left( q_{\pi}(S,A)- \hat{q}(S,A,\theta) \right) \nabla_{\theta} \hat{q}(S,A,\theta)
+  \end{align*}
+  where
+  \[
+  q_{\pi}(S,A) = \begin{cases}
+  G_t & \text{ in MC} \\
+  R_{t+1} + \gamma \hat{q}(S_{t+1},A_{t+1},\theta) & \text{ in TD(0)} \\
+  G_t^{\lambda} & \text{in TD}(\lambda)
+  \end{cases}
+  \]
+\end{frame}
+
+
+\begin{frame}
+  \frametitle{Episodic Semi-gradient Sarsa for Control}
+  \begin{itemize}
+    \item Input: a differentiable function $\hat{q} : \mathcal{S} \times \mathcal{A} \times \mathbb{R}^n \rightarrow \mathbb{R}$
+    \item Initialize value-function weights $\theta \in \mathbb{R}^n$
+    \item Repeat (for each episode)
+    \begin{itemize}
+      \item $S,A \leftarrow$ initial state and action of the episode (e.g., $\epsilon$-greedy)
+
+      \item Repeat (for each step of the episode):
+      \begin{itemize}
+
+        \item Take action $A$, observe $R$, $S\textprime$.
+        \item Choose $A\textprime$ as a function of $\hat{q}(S\textprime,\cdot,\theta)$ (e.g.\ $\epsilon$-greedy)
+        \item $\theta \leftarrow \theta + \alpha[R+\gamma \hat{q}(S\textprime,A\textprime,\theta)-\hat{q}(S,A,\theta)]\nabla\hat{q}(S,A,\theta)$
+        \item $S \leftarrow S\textprime$
+        \item $A \leftarrow A\textprime$
+      \end{itemize}
+    \end{itemize}
+
+  \end{itemize}
+% why semi-gradient? - only a partial gradient.
+\end{frame}
+
+\begin{frame}
+  \frametitle{Continuing Tasks}
+  For continuing tasks, the quality of a policy is instead defined as the average rate of reward:
+  \[
+  \eta(\pi)= \lim_{t \rightarrow \infty} \mathbb{E} [R_t | A_{0:t-1} \sim \pi]
+  \]
+  and returns are defined in terms of differences between rewards and the average reward:
+  \[
+  G_t = R_{t+1} - \eta(\pi) + R_{t+2} - \eta(\pi) + R_{t+3} - \eta(\pi) + \cdots
+  \]
+\end{frame}
+
+\begin{frame}
+  \frametitle{Differential Semi-gradient Sarsa for Control}
+  \begin{itemize}
+    \item Input: a differentiable function $\hat{q} : \mathcal{S} \times \mathcal{A} \times \mathbb{R}^n \rightarrow \mathbb{R}$
+    \item Initialize value-function weights $\theta \in \mathbb{R}^n$
+    \item Initialize the average reward $\bar{R}$ arbitrarily.
+    \item Repeat (for each episode)
+    \begin{itemize}
+      \item $S,A \leftarrow$ initial state and action of the episode (e.g., $\epsilon$-greedy)
+
+      \item Repeat (for each step of the episode):
+      \begin{itemize}
+
+        \item Take action $A$, observe $R$, $S\textprime$.
+        \item Choose $A\textprime$ as a function of $\hat{q}(S\textprime,\cdot,\theta)$ (e.g.\ $\epsilon$-greedy)
+        \item $\theta \leftarrow \theta + \alpha[R - \bar{R} + \hat{q}(S\textprime,A\textprime,\theta)-\hat{q}(S,A,\theta)]\nabla\hat{q}(S,A,\theta)$
+        \item $\bar{R} \leftarrow \bar{R} + \beta [R - \bar{R} + \hat{q}(S\textprime,A\textprime,\theta)-\hat{q}(S,A,\theta)]$
+        \item $S \leftarrow S\textprime$
+        \item $A \leftarrow A\textprime$
+      \end{itemize}
+    \end{itemize}
+
+  \end{itemize}
+  % why semi-gradient?
+\end{frame}
+
+\begin{frame}
+  \frametitle{TD-Gammon}
+  \begin{itemize}
+    \item Used a multi-layer artificial neural network trained by TD($\lambda$) to evaluate each possible move.
+    \medskip
+    \item Achieved a level of play just slightly below that of the top human backgammon player in 1992.
+    \medskip
+    \item Found new strategies.
+  \end{itemize}
+
+  \begin{textblock*}{15.0cm}(8cm,5.5cm) % {block width} (coords)
+    \includegraphics[width=4.5cm]{TDgammon.png}
+  \end{textblock*}
+\end{frame}
+% add some more stuff
+\begin{frame}
+  \frametitle{Google DeepMind's Deep Q-Network (DQN)}
+
+  \begin{textblock*}{15.0cm}(1.5cm,2.0cm) % {block width} (coords)
+    \includegraphics[width=10cm]{atari2600.png}
+    \\
+    \href{https://www.cs.toronto.edu/~vmnih/docs/dqn.pdf}{\tiny Mnih, Volodymyr, et al. ``Playing Atari with Deep Reinforcement Learning.'' (2013)}
+  \end{textblock*}
+
+  \begin{textblock*}{15.0cm}(1.5cm,4.5cm) % {block width} (coords)
+    \includegraphics[width=10cm]{CNN.png}
+    \\
+    \href{http://www0.cs.ucl.ac.uk/staff/d.silver/web/Teaching_files/FA.pdf}{\tiny David Silver's presentation on function approximation}
+  \end{textblock*}
+\end{frame}
+% Batching and experience replay.
+\begin{frame}
+  \frametitle{Policy-Gradient Methods}
+  Previously we approximated
+  \[
+  Q_{\theta}(s,a) \approx Q^{\pi}(s,a)
+  \]
+  and then generated a policy from the approximated action-value function.
+
+  In some cases it is better to parametrize the policy directly,
+
+  \[
+  \pi_{\theta}(a|s) = \mathbb{P}[A_t = a|S_t = s,\theta] ,
+  \]
+
+  which is more effective in high-dimensional or continuous action spaces. We then update $\theta$ using gradient ascent:
+  \[
+  \theta_{t+1} = \theta_{t} + \alpha \nabla_{\theta} J(\theta_t)
+  \]
+  where $J$ is a policy objective function.
+\end{frame}
+
+%\begin{frame}
+%  \frametitle{Softmax Distribution}
+%  Softmax distribution for discrete action spaces:
+
+%  \[
+%  \pi_{\theta}(a|s) = \frac{\exp(h(s,a,\theta))}{\sum_b \exp(h(s,b,\theta))}
+%  \]
+
+%\end{frame}
+
+\begin{frame}
+  \frametitle{Policy Objective Functions}
+  \small
+  State-value function from the initial state, $s_1$:
+  \[
+  J_1(\theta)= V^{\pi_{\theta}}(s_1)
+  \]
+  Average value:
+  \[
+  J_{\text{avV}}(\theta) = \sum_s d^{\pi_{\theta}}(s) V^{\pi_{\theta}}(s)
+  \]
+  where $d^{\pi_{\theta}}(s)$ is the stationary Markov-chain distribution for $\pi_{\theta}$. \\
+  \medskip
+  Average reward per time-step:
+  \[
+  J_{\text{avR}}(\theta) = \sum_s d^{\pi_{\theta}}(s) \sum_a \pi_{\theta}(s,a) \mathcal{R}^a_s
+  \]
+
+
+  For any of the above (policy-gradient theorem):
+  \[
+  \nabla_\theta J( \theta ) = \mathbb{E}_{\pi_{\theta}} [\nabla_{\theta} \log \pi_{\theta}(S,A)\, Q^{\pi_{\theta}}(S,A) ]
+  \]
+\end{frame}
+% I don't understand much here.
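+% Illustrative addition (not from the original slides): the score function
+% grad_theta log pi_theta(a|s) for a linear softmax policy, the quantity that appears in the
+% policy-gradient theorem above and in REINFORCE on the next slide. The feature map phi(s,a)
+% and all dimensions are assumptions for illustration.
+\begin{frame}[fragile]
+  \frametitle{Sketch: Score Function of a Softmax Policy}
+  A minimal sketch, assuming a linear softmax policy $\pi_{\theta}(a|s) \propto \exp(\phi(s,a)^T\theta)$:
+  \begin{lstlisting}[language=Python, basicstyle=\scriptsize\ttfamily]
+import numpy as np
+
+def softmax_policy(theta, phi_s):
+    # phi_s[a] is the feature vector phi(s, a); returns pi_theta(.|s)
+    prefs = phi_s @ theta              # preferences h(s, a, theta) = phi(s, a)^T theta
+    prefs -= prefs.max()               # for numerical stability
+    p = np.exp(prefs)
+    return p / p.sum()
+
+def score(theta, phi_s, a):
+    # grad_theta log pi_theta(a|s) = phi(s, a) - sum_b pi_theta(b|s) phi(s, b)
+    pi = softmax_policy(theta, phi_s)
+    return phi_s[a] - pi @ phi_s
+
+# One REINFORCE-style update, given a sampled return G_t (alpha and G_t assumed given):
+#   theta += alpha * score(theta, phi_s, a) * G_t
+  \end{lstlisting}
+\end{frame}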
+\begin{frame}
+  \frametitle{REINFORCE: Monte-Carlo Policy Gradient}
+
+  \medskip
+  Initialize $\theta$ \\
+
+  for each episode $\{S_1,A_1,R_2,\ldots,S_{T-1},A_{T-1},R_T\} \sim \pi_{\theta}$ do \\
+  \hspace*{20pt} for $t = 1$ to $T-1$ do \\
+  \hspace*{40pt} $\theta \leftarrow \theta + \alpha \nabla_{\theta} \log \pi_{\theta}(S_t,A_t)\, G_t$ \\
+  \hspace*{40pt} where $G_t$ is the sampled return from state $S_t$ \\
+  \hspace*{20pt} end for \\
+  end for \\
+  return $\theta$
+\end{frame}
+
+\begin{frame}
+  \frametitle{Q Actor-Critic}
+  With linear action-value function approximation $Q_{w}(S,A) = \phi(S,A)^T w$: \\
+  \medskip
+  Initialize $S$, $\theta$ \\
+  Sample $A \sim \pi_{\theta}(\cdot|S)$ \\
+
+for each step do \\
+  \hspace*{20pt} sample reward $R\textprime$ and transition $S\textprime$ \\
+  \hspace*{20pt} sample action $A\textprime \sim \pi_{\theta}(A\textprime| S\textprime)$ \\
+  \hspace*{20pt} $\delta = R\textprime + \gamma Q_{w}(S\textprime,A\textprime) - Q_{w}(S,A)$ \\
+  \hspace*{20pt} $\theta \leftarrow \theta + \alpha \nabla_{\theta} \log \pi_{\theta}(A|S)\, Q_w(S,A)$ \\
+  \hspace*{20pt} $w \leftarrow w + \beta \delta \phi(S,A)$ \\
+$A \leftarrow A\textprime$, $S \leftarrow S\textprime$ \\
+end for
+\end{frame}
+%\begin{frame}
+%\frametitle{Exploration vs. Exploitation}
+%\begin{itemize}
+%  \item $\epsilon$-greedy
+%  \item Optimism
+%  \item some other
+%\end{itemize}
+%\end{frame}
+
+%\begin{frame}
+%  \frametitle{extra stuff}
+
+%  \begin{itemize}
+%    \item Deep Blue
+%    \item TD-Gammon - non-linear value-function approximation.
+%    \item SmooCT
+%    \item fundamental challenges $\rightarrow$ exploration vs. exploitation, delayed rewards.
+%  \end{itemize}
+%\end{frame}
+
+\begin{frame}
+  \frametitle{Exercise: OpenAI Gym}
+  \href{https://gym.openai.com/}{OpenAI Gym}:
+  \begin{itemize}
+    \item A toolkit for developing and comparing reinforcement-learning algorithms.
+    \medskip
+    \item From simulated robots to Atari games.
+    \medskip
+    \item A site for comparing and reproducing results.
+  \end{itemize}
+
+  Task:
+  \begin{itemize}
+    \item Run an RL algorithm on one of the OpenAI Gym examples.
+    \item I suggest that you try an already working implementation:
+    \medskip
+    \begin{itemize}
+      \item \href{http://karpathy.github.io/2016/05/31/rl/}{Pong}
+      \item \href{https://github.com/coreylynch/async-rl}{Breakout}
+      \item \href{https://gym.openai.com/evaluations/eval_HTjvyhm5QlqLWP4pB1fG4A}{Space Invaders}
+    \end{itemize}
+
+  \end{itemize}
+\end{frame}
+
+% Explain the code that was used
+\end{document}
\ No newline at end of file
--
GitLab