diff --git a/.gitignore b/.gitignore index 571df5fa93872d042478fe5e09446f0c8177f873..f6e8ef177afcbec911c8080b4f265d49bd0c39a6 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ adaptive_width_experiments machinefile jump_lin_id/build *.orig +*.log *.gz *.jld *.mat diff --git a/jump_lin_id/bibtexfile.bib b/jump_lin_id/bibtexfile.bib index 76d5e8defbbe865f1bb7541692a8023d271b9b72..9cd7a72052aeda3d143c723c9dc3ef636e6362a1 100644 --- a/jump_lin_id/bibtexfile.bib +++ b/jump_lin_id/bibtexfile.bib @@ -64,3 +64,89 @@ year={1994}, publisher={Springer} } + +@article{tibshirani2014adaptive, + title={Adaptive piecewise polynomial estimation via trend filtering}, + author={Tibshirani, Ryan J and others}, + journal={The Annals of Statistics}, + volume={42}, + number={1}, + pages={285--323}, + year={2014}, + publisher={Institute of Mathematical Statistics} +} + +@article{kim2009ell_1, + title={$\ell_1$ Trend Filtering}, + author={Kim, Seung-Jean and Koh, Kwangmoo and Boyd, Stephen and Gorinevsky, Dimitry}, + journal={SIAM Review}, + volume={51}, + number={2}, + pages={339--360}, + year={2009}, + publisher={SIAM} +} + +@book{murphy2012machine, + title={Machine learning: a probabilistic perspective}, + author={Murphy, Kevin P}, + year={2012}, + publisher={MIT Press, Cambridge, Massachusetts} +} + +@article{bellman1969curve, + title={Curve fitting by segmented straight lines}, + author={Bellman, Richard and Roth, Robert}, + journal={Journal of the American Statistical Association}, + volume={64}, + number={327}, + pages={1079--1084}, + year={1969}, + publisher={Taylor \& Francis} +} + +@inproceedings{levine2013guided, + title={Guided policy search}, + author={Levine, Sergey and Koltun, Vladlen}, + booktitle={Proceedings of the 30th International Conference on Machine Learning (ICML-13)}, + pages={1--9}, + year={2013} +} + +@inproceedings{levine2015learning, + title={Learning contact-rich manipulation skills with guided policy search}, + author={Levine, Sergey and Wagener, Nolan and Abbeel, Pieter}, + booktitle={Robotics and Automation (ICRA), 2015 IEEE International Conference on}, + pages={156--163}, + year={2015}, + organization={IEEE} +} + +@article{bristow2006survey, + title={A survey of iterative learning control}, + author={Bristow, Douglas A and Tharayil, Marina and Alleyne, Andrew G}, + journal={IEEE Control Systems}, + volume={26}, + number={3}, + pages={96--114}, + year={2006}, + publisher={IEEE} +} + +@inproceedings{schulman2015trust, + title={Trust region policy optimization}, + author={Schulman, John and Levine, Sergey and Abbeel, Pieter and Jordan, Michael and Moritz, Philipp}, + booktitle={Proceedings of the 32nd International Conference on Machine Learning (ICML-15)}, + pages={1889--1897}, + year={2015} +} + +@inproceedings{vidal2002observability, + title={Observability and identifiability of jump linear systems}, + author={Vidal, Ren{\'e} and Chiuso, Alessandro and Soatto, Stefano}, + booktitle={Decision and Control, 2002, Proceedings of the 41st IEEE Conference on}, + volume={4}, + pages={3614--3619}, + year={2002}, + organization={IEEE} +} diff --git a/jump_lin_id/figs/states.tex b/jump_lin_id/figs/states.tex index a466192f57f305b563485939d6308bd141dbe49e..321399634adcf4c16b92b90fae383daf454833e8 100644 --- a/jump_lin_id/figs/states.tex +++ b/jump_lin_id/figs/states.tex @@ -4,18 +4,20 @@ \definecolor{color1}{rgb}{0.88887350027252,0.43564919034819,0.278122936141944} \definecolor{color0}{rgb}{0,0.605603161175225,0.978680117569607}
\definecolor{color2}{rgb}{0.242224297852199,0.64327509315763,0.304448651534115} +% \pgfplotsset{yticklabel style={text width=2em,align=right}} \begin{axis}[ name=plot1, ylabel={State value}, -xmin=1, xmax=399, +xmin=0, xmax=400, ymin=-7.85391243746834, ymax=1.78055825822371, axis on top, +ytick = {0,-5}, width=\figurewidth, height=\figureheight, legend entries={{Estimated},{True},{Measured}}, legend cell align={left}, -legend style={at={(0.99,1.03)}, anchor=south, draw=white, legend columns=3}, +legend style={at={(0.5,1.03)}, anchor=south, draw=white, legend columns=3}, ] \addplot [color0] table {% @@ -1228,11 +1230,13 @@ table {% \end{axis} \begin{axis}[ -at=(plot1.south east), anchor=south west, -xshift=7mm, +% at=(plot1.south east), anchor=south west, +at=(plot1.below south east), anchor=above north east, +ylabel={State value}, +% xshift=7mm, % ylabel={State value}, xlabel={Time index}, -xmin=1, xmax=399, +xmin=0, xmax=400, ymin=-7.8926575396648, ymax=4.28889863645134, axis on top, width=\figurewidth, diff --git a/jump_lin_id/id_paper.pdf b/jump_lin_id/id_paper.pdf index 09a6119ba1b8548d778351cb29a29a128c2dd9ab..19fa8bea0d25d8c6e392899a71f9a8bbd46fecc8 100644 Binary files a/jump_lin_id/id_paper.pdf and b/jump_lin_id/id_paper.pdf differ diff --git a/jump_lin_id/id_paper.tex b/jump_lin_id/id_paper.tex index 1abf2058b63ee0874e4219f91aceb608d5a317ac..d887d0e98f86438831eee3ae95f102671e68471a 100644 --- a/jump_lin_id/id_paper.tex +++ b/jump_lin_id/id_paper.tex @@ -122,7 +122,7 @@ Identification of LTV Dynamical Models with\\ Smooth or Discontinuous Time Evolu \author{ \centering Fredrik Bagge Carlson* \quad Anders Robertsson \quad Rolf Johansson -\thanks{*Open-source implementations of all presented methods and examples marked with a star ($\star$) in this paper are made available at \href{github.com/baggepinnen/LTVModels.jl}{github.com/baggepinnen/LTVModels.jl}. The reported research was supported by the European Commission under the Framework Programme Horizon 2020 under grant agreement 644938 SARAFun. The authors are members of the LCCC Linnaeus Center and the eLLIIT Excellence Center at Lund University.\protect\\ +\thanks{*Open-source implementations of all presented methods and examples marked with a star ($\star$) in this paper are made available at \href{github.com/baggepinnen/LTVModels.jl}{github.com/baggepinnen/LTVModels.jl} (Will be made available in advance of paper publication). The reported research was supported by the European Commission under the Framework Programme Horizon 2020 under grant agreement 644938 SARAFun. The authors are members of the LCCC Linnaeus Center and the eLLIIT Excellence Center at Lund University.\protect\\ Lund University, Dept Automatic Control, PO Box 118\protect\\ SE22100 Lund Sweden\protect\\ {Fredrik.Bagge\_Carlson@control.lth.se}} @@ -152,10 +152,10 @@ SE22100 Lund Sweden\protect\\ \pagestyle{empty} \begin{abstract} - We review some classical methods for offline identification of linear, time-varying (LTV) dynamical models and put them into a larger, more general context where we introduce a number of new techniques. Specifically, we introduce methods for identification of piecewise linear dynamical systems where the change in dynamics can be either discontinuous or smooth and where the number of changes in the dynamics parameters can be either known or unknown. We further discuss the introduction of priors on the model parameters for situations where sufficient excitation is unavailable. 
The identification problems are cast as convex optimization problems and are applicable to, e.g., ARMA-models and state-space models with time-varying parameters. We illustrate usage of the methods in simulations and in applications of model-based reinforcement learning and robot-program segmentation. +We review some classical methods for offline identification of linear, time-varying (LTV) dynamical models and put them into a larger, more general context where we introduce a number of new techniques. Specifically, we introduce methods for identification of piecewise linear dynamical systems where the change in dynamics can be either discontinuous or smooth and where the number of changes in the dynamics parameters can be either known or unknown. We further discuss the introduction of priors on the model parameters for situations where sufficient excitation is unavailable. The identification problems are cast as convex optimization problems and are applicable to, e.g., ARX-models and state-space models with time-varying parameters. We illustrate usage of the methods in simulations and in an application of model-based reinforcement learning. \end{abstract} \begin{keywords} - System Identification, Piecewise Linear Modeling, Trend Filtering, Kalman Smoothing, Time-Series Segmentation, Segmented Least-Squares. + System Identification, Piecewise Linear Modeling, Trend Filtering, Kalman Smoothing, Time-Series Segmentation, Reinforcement Learning. \end{keywords} @@ -166,14 +166,15 @@ SE22100 Lund Sweden\protect\\ The difficulty of the task of identifying time-varying dynamical models of systems varies greatly with the model considered and the availability of measurements of the state sequence. For smoothly changing, linear dynamics\footnote{We take linear dynamics to mean models that are linear in the parameters.}, the recursive least-squares algorithm with exponential forgetting (RLS$\lambda$) is a common alternative. If a Gaussian random-walk model for the parameters is assumed, a Kalman filtering/smoothing algorithm gives the filtering/smoothing densities of the parameters in closed form. The assumption of smoothly (Gaussian) varying dynamics is often restrictive. Discontinuous dynamics changes occur, for instance, when an external controller changes operation mode, when a sudden contact between a robot and its environment is established, an unmodeled disturbance enters the system or when a system is suddenly damaged. -Identification of systems with non-smooth dynamics evolution has been studied extensively. The book~\cite{costa2006discrete} treats the case where the dynamics are known, but the state sequence unknown. In~\cite{nagarajaiah2004time}, the authors examine the residuals from an initial constant dynamics fit to determine regions in time where improved fit is needed by the introduction of additional constant dynamics models. \cmt{Extend this section with more background.} +Identification of systems with non-smooth dynamics evolution has been studied extensively. The book~\cite{costa2006discrete} treats the case where the dynamics are known, but the state sequence unknown. In~\cite{nagarajaiah2004time}, the authors examine the residuals from an initial constant dynamics fit to determine regions in time where improved fit is needed by the introduction of additional constant dynamics models. Results on identifiability and observability in jump-linear systems in the non-controlled setting are available in~\cite{vidal2002observability}. 
The main result on identifiability is a rank condition on a Hankel matrix constructed from the collected output data, similar to classical results on the least-squares identification of ARX-models with rank constraints on the, typically Toeplitz, regressor matrix. Identifiability of the methods proposed in this article is discussed in~\cref{sec:identifiability}. -An important class of identification methods that have been popularized lately are \emph{trend filtering} methods~\cite{Someone}. Trend filtering methods work by specifying a \emph{fitness criterion} that determines the goodness of fit, as well as a \emph{regularization} term, often chosen with sparsity promoting qualities. As a simple example, consider the reconstruction $\hat y$ of a noisy signal $y$ with piecewise constant segments. To this end, we might formulate and solve the convex optimization problem +An important class of identification methods that has been popularized lately is that of \emph{trend filtering} methods~\cite{kim2009ell_1, tibshirani2014adaptive}. Trend filtering methods work by specifying a \emph{fitness criterion} that determines the goodness of fit, as well as a \emph{regularization} term, often chosen with sparsity-promoting qualities. As a simple example, consider the reconstruction $\hat y$ of a noisy signal $y$ with piecewise constant segments. To this end, we might formulate and solve the convex optimization problem \begin{equation} \label{eq:tf} \minimize{\hat{\y}} \normt{\y-\hat{\y}}^2 + \lambda\sum_t |\hat{\y}_{t+1} - \hat{\y}_t| \end{equation} -The first term is the fitness criteria or \emph{loss function}, whereas the second term is a sparsity-promoting regularizer which promotes small changes between consecutive samples in the reconstructed signal. The sparsity promoting effect of the 1-norm regularizer is well known, and stems from the constant size of the gradient whenever the argument is non-zero~\cite{someone}. Compare this to the 2-norm, for which the gradient vanishes rapidly as the argument approaches zero. The 2-norm will thus promote \emph{small} arguments, whereas the 1-norm promotes \emph{sparse} arguments. -In this work, we will draw inspiration from the trend-filtering literature to develop new system identification methods with interesting properties. +The first term is the fitness criterion or \emph{loss function}, whereas the second term is a sparsity-promoting regularizer which favors a small number of changes between consecutive samples in the reconstructed signal. The sparsity-promoting effect of the 1-norm regularizer is well known, and stems from the constant size of the gradient whenever the argument is non-zero~\cite{murphy2012machine}. Compare this to the 2-norm, for which the gradient rapidly vanishes as the argument approaches zero. The 2-norm will thus promote \emph{small} arguments, whereas the 1-norm promotes \emph{sparse} arguments. + +In this work, we will draw inspiration from the trend-filtering literature to develop new system identification methods for LTV models with interesting properties. We start by defining a set of optimization problems with a least-squares loss function and carefully chosen regularization terms. We further discuss how prior information can be utilized to increase the accuracy of the identification, and end the article with an application of model-based reinforcement learning and a discussion.
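For concreteness, the simple trend-filtering problem~\labelcref{eq:tf} can be prototyped in a few lines of Julia. The sketch below uses Convex.jl with the SCS solver; the solver choice, the synthetic signal and the value of $\lambda$ are assumptions made for this illustration and do not necessarily reflect the implementation in the accompanying repository LTVModels.jl.

using Convex, SCS, Random

Random.seed!(0)
T = 200
y = [fill(1.0, 100); fill(-1.0, 100)] .+ 0.3 .* randn(T)   # noisy piecewise constant signal
λ = 5.0

ŷ = Variable(T)
loss = sumsquares(y - ŷ)                    # fitness criterion
reg  = norm(ŷ[2:T] - ŷ[1:T-1], 1)           # sparsity-promoting 1-norm of first differences
problem = minimize(loss + λ * reg)
solve!(problem, SCS.Optimizer)
ŷ_hat = evaluate(ŷ)                         # reconstruction with few level changes

Replacing the 1-norm by a squared 2-norm yields a smooth rather than piecewise constant reconstruction, which is the same trade-off that reappears in the LTV-identification problems below.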
\section{LTI identification}\label{sec:lti} @@ -221,7 +222,7 @@ The density $p_v$ is determining the distribution of measurements, given the cur We emphasize here that the state in the parameter evolution model refers to the current parameters $k_t$ and not the system state $x_t$, hence, $p_v$ is called the emission density and not the transition density. Particular choices of $p_v$ and $p_w$ emit data likelihoods concave in the parameters and hence amenable to convex optimization. -The following sections will introduce a number of optimization problems with different regularization functions and regularization arguments, and discuss the quality of the resulting identification. An emphasis is put on intuitive motivations for the formulations \cmt{rewrite}. +The following sections will introduce a number of optimization problems with different regularization functions and regularization arguments, and discuss the quality of the resulting identification. \subsection{Smooth time evolution $\star$} A smoothly varying signal is characterized by \emph{small second order time differences}. @@ -267,7 +268,7 @@ If the number of switches in dynamics parameters, $M$, is known in advance, the & \subjto & & \sum_t \textbf{1}\{ \w_{t+1} \neq \w_t\} \leq M \end{align} where $\textbf{1}\{\cdot\}$ is the indicator function. -This problem is non-convex, but admits an exact solution through dynamic programming (DP), with an extended version of \cite{bellman1961approximation}, an algorithm frequently referred to as segmented least-squares~\cite{someone}. The extension lies in the association of each segment with a dynamics model, as opposed to a simple straight line.\footnote{Indeed, if a simple integrator is chosen as dynamics model and a constant input is assumed, the result of our extended algorithm reduces to the segmented least-squares solution.} Unfortunately, the computational complexity of the naive dynamic programming solution, $\mathcal{O}(T^2K^3)$, becomes prohibitive for large $T$.\footnote{For details regarding the DP algorithm and implementation, the reader is referred to the source-code repository accompanying this article \cite{repository}.} +This problem is non-convex, but admits an exact solution through dynamic programming (DP), with an extended version of \cite{bellman1961approximation}, an algorithm frequently referred to as segmented least-squares~\cite{bellman1969curve}. The extension lies in the association of each segment with a dynamics model, as opposed to a simple straight line.\footnote{Indeed, if a simple integrator is chosen as dynamics model and a constant input is assumed, the result of our extended algorithm reduces to the segmented least-squares solution.} Unfortunately, the computational complexity of the naive dynamic programming solution, $\mathcal{O}(T^2K^3)$, becomes prohibitive for large $T$.\footnote{For details regarding the DP algorithm and implementation, the reader is referred to the source-code repository accompanying this article \cite{repository}.} \subsection{Piecewise linear time evolution}\label{eq:pwlinear} A piecewise linear signal is characterized by a \emph{sparse second-order time difference}, i.e., it has a small number of changes in the slope. A piecewise linear time-evolution of the dynamics parameters is hence obtained if we solve the optimization problem. 
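To illustrate the dynamic-programming solution for a known number of switches, the sketch below implements a plain segmented least-squares recursion in Julia: every candidate segment is scored by the residual of a single least-squares fit, and the optimal placement of M switches (M+1 segments) is found by tabulation over segment end points. This is a generic sketch written for illustration, with an assumed least-squares segment cost; it is not the implementation from the accompanying repository, whose complexity and model association may differ.

using LinearAlgebra

# Least-squares cost of explaining Y[i:j,:] with one constant parameter matrix, Y ≈ Φ*K
function segment_cost(Φ, Y, i, j)
    Φij, Yij = Φ[i:j, :], Y[i:j, :]
    K = Φij \ Yij
    return sum(abs2, Yij - Φij * K)
end

# Optimal segmentation of 1:T into M+1 segments (M switches) by dynamic programming
function segment_dp(Φ, Y, M)
    T = size(Y, 1)
    cost = [i <= j ? segment_cost(Φ, Y, i, j) : Inf for i in 1:T, j in 1:T]
    V = fill(Inf, T, M + 1)          # V[t,m]: best cost of explaining 1:t with m segments
    B = zeros(Int, T, M + 1)         # backpointers: index where the previous segment ended
    V[:, 1] = [cost[1, t] for t in 1:T]
    for m in 2:M+1, t in m:T, s in m-1:t-1
        c = V[s, m-1] + cost[s+1, t]
        if c < V[t, m]
            V[t, m], B[t, m] = c, s
        end
    end
    switches = Int[]                 # recover the switch locations by backtracking
    t, m = T, M + 1
    while m > 1
        pushfirst!(switches, B[t, m])
        t, m = B[t, m], m - 1
    end
    return switches, V[T, M+1]
end

With Φ built from lagged states and inputs and Y holding the one-step-ahead targets, the returned indices are the estimated change points, and each segment is associated with its own least-squares dynamics model in the spirit of the extension described above.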
@@ -277,8 +278,8 @@ A piecewise linear signal is characterized by a \emph{sparse second-order time d The only difference between \labelcref{eq:pwlinear} and~\labelcref{eq:smooth} is the square root in the 2-norm of \labelcref{eq:pwlinear}. The two regularizers are illustrated in~\cref{fig:surf}. \begin{figure}[htp] \centering - \includegraphics[width=0.7\linewidth]{figs/surf.png} - \caption{Comparison between the 2-norm (pointy bottom) and squared 2-norm (round bottom) regularizers. \cmt{replace and more figure text}} + \includegraphics[width=0.9\linewidth]{figs/surf.png} + \caption{Comparison between the 2-norm (pointy bottom) and squared 2-norm (round bottom) regularizers. The figure illustrates how the gradient of the unsquared 2-norm regularizer remains constant in magnitude as we move towards the origin, whereas the squared 2-norm regularizer has a vanishing gradient close to the origin. This difference leads to sparse results when applying 1-norm or unsquared 2-norm penalties, and small results when applying squared-norm penalties.} \label{fig:surf} \end{figure} @@ -301,7 +302,6 @@ Norm & $D_n$ & Result \\ \midrule \subsection{Two-step refinement $\star$} -\cmt{Re-estimate and replace sparsity promoting penalty with constraints. Reduce the size of this section significantly} Since many of the proposed formulations of the optimization problem penalize the size of the changes to the dynamics, solutions in which the changes are slightly underestimated are favored. To mitigate this issue, a two-step procedure can be implemented where in the first step, change points are identified. In the second step, the penalty on the one-norm is interchanged for equality constraints between the non-zero changes in dynamics. @@ -313,7 +313,7 @@ To identify the points at which the dynamics change, we observe the relevant sig \end{equation} Points where $a_t$ takes non-zero values indicate dynamics changes. -\section{Dynamics prior} \cmt{Mention this in abstract and introduction} +\section{Dynamics prior} The identifiability of the parameters in a dynamical model hinges on the observability of the dynamics system~\labelcref{eq:dynsys}, or more explicitly, only modes excited by the input $u$ will be satisfactorily identified. If the identification is part of an iterative learning and control scheme, e.g., ILC or reinforcement learning, it might be undesirable to introduce additional noise in the input to improve excitation for identification. This section will introduce prior information about the dynamics which mitigates the issue of poor excitation of the system modes. The prior information might come from, e.g., a nominal model known to be inaccurate, or an estimated global model such as a Gaussian mixture model (GMM). A statistical model of the joint density $p(x_{t+1},x_t,u_t)$ constructed from previously collected tuples $(x_{t+1},x_t,u_t)$ provides a dynamical model of the system through the conditional $p(x_{t+1}|x_t,u_t)$. We will see that for priors from certain families, the resulting optimization problem remains convex. For the special case of a Gaussian prior over the dynamics parameters or the output, the posterior mean of the parameter vector is conveniently obtained from a Kalman-smoothing algorithm, modified to include the prior. @@ -333,7 +333,7 @@ which factors conveniently due to the Markov property of a state-space model. Ta \end{equation} For particular choices of density functions in~\labelcref{eq:lcdll}, notably Gaussian and Laplacian, the resulting ML estimation problem becomes convex.
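Before turning to the Gaussian case, the convex formulations above can be made concrete with a small Julia sketch. It estimates a time-varying, scalar-output ARX-style parameter trajectory with an unsquared group penalty on parameter differences: first-order differences give piecewise constant parameters and second-order differences give a piecewise linear parameter evolution. Convex.jl and SCS are again used only for illustration, and the scalar-output restriction and the function name are assumptions; the state-space case is handled analogously with a Kronecker-structured regressor.

using Convex, SCS

# Φ: T×K regressor matrix (row t = φ_tᵀ), y: length-T output, λ: regularization strength.
# diff_order = 1 penalizes first differences (piecewise constant parameters),
# diff_order = 2 penalizes second differences (piecewise linear parameter evolution).
function fit_tv_arx(Φ, y, λ; diff_order = 1)
    T, K = size(Φ)
    k = Variable(K, T)                             # one parameter vector per time step
    loss = sum(sumsquares(y[t] - Φ[t:t, :] * k[:, t]) for t in 1:T)
    D = k[:, 2:T] - k[:, 1:T-1]                    # first differences of the parameters
    if diff_order == 2
        D = D[:, 2:T-1] - D[:, 1:T-2]              # second differences
    end
    reg = sum(norm(D[:, t], 2) for t in 1:T-diff_order)   # unsquared norm ⇒ sparse changes
    problem = minimize(loss + λ * reg)
    solve!(problem, SCS.Optimizer)
    return evaluate(k)                             # K×T matrix of parameter trajectories
end

Squaring the norm in the penalty recovers the smooth formulation, and solving one such problem per output channel (or stacking the outputs) extends the sketch to state-space models.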
The next section will elaborate on the Gaussian case and introduce a recursive algorithm that solves for the full posterior efficiently. -\subsection{Gaussian case $\star$ \cmt{TODO: example}}\label{sec:kalmanmodel} +\subsection{Gaussian case $\star$} \label{sec:kalmanmodel} If all densities are Gaussian and $\w$ is modeled with a Brownian random walk (Gaussian $v_t$), \labelcref{eq:lcdll} can be written on the form (constants omitted) \begin{equation} \begin{split}\label{eq:prioropt} @@ -368,34 +368,25 @@ A prior over the output of the system, or a subset thereof, is straight forward Laplacian densities appearing in~\labelcref{eq:lcdll} give rise to sparsity-promoting terms on the form $\normt{\cdot}$ in~\labelcref{eq:prioropt}. The resulting optimization problem remains convex, but must be solved with an iterative method, similar to how~\labelcref{eq:pwconstant} is solved. -\section{Well-posedness and identifiability} +\section{Well-posedness and identifiability}\label{sec:identifiability} To assess the well-posedness of the proposed identification methods, we start by noting that the problem of finding $A$ in $x_{t+1} = Ax_t$ given a pair $(x_{t+1},x_t)$ is an ill-posed problem in the sense that the solution is non unique. If we are given several pairs $(x_{t+1},x_t)$, for different $t$, while $A$ remains constant, the problem becomes over-determined and well-posed in the least-squares sense. The LTI-case in~\cref{sec:lti} is well posed according to -classical results, when the $\Phi$ has full column rank. When we extend our view -to LTV models, the number of free parameters is increased significantly, and the -corresponding regressor matrix $\Phi$ would never have full column rank without -the introduction of a regularization term. Informally, for every $n$ measurements, -we have $K=n^2+nm$ free parameters. If we consider the identification problem -of~\cref{eq:pwconstant} and let $\lambda \rightarrow \infty$, the regularizer is -essentially replaced with equality constraints. This will force a solution in -which all parameters in $k$ are constant over time, and the problem reduces to -the LTI-problem. As $\lambda$ decreases from infinity the effective number of -free parameters increases until the problem gets ill-posed for $\lambda = 0$. +classical results, when the $\Phi$ has full column rank. When we extend our view to LTV models, the number of free parameters is increased significantly, and the corresponding -regressor matrix $\tilde{\A}$ would never have full column -rank without the introduction of a regularization term. +regressor matrix $\tilde{\A}$ will never have full column +rank and the introduction of a regularization term is necessary. Informally, for every $n$ measurements, we have $K=n^2+nm$ free parameters. If we consider the identification problem of~\cref{eq:pwconstant} and let $\lambda \rightarrow \infty$, the regularizer is essentially replaced with equality constraints. -This will force a solution in which all parameters +This will enforce a solution in which all parameters in $k$ are constant over time, and the problem reduces to the LTI-problem. As $\lambda$ decreases from infinity the effective number of free parameters increases until the @@ -458,8 +449,8 @@ The input was Gaussian noise of zero mean and unit variance, state transition no \Cref{fig:ss} depicts the estimated coefficients in the dynamics matrices for a value of $\lambda$ chosen using the L-curve method~\cite{hansen1994regularization}. 
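As a complement to the discussion of the Gaussian case, a minimal recursive sketch is given below: the parameter vector is treated as the state of a random-walk model and the regressor acts as a time-varying measurement matrix, so a standard Kalman filter yields the filtered parameter estimates. The RTS smoothing pass and the additional prior terms are omitted, and the function name, covariance choices and regressor construction are illustrative assumptions rather than the repository implementation.

using LinearAlgebra

# Kalman filter over the parameter vector: k_{t+1} = k_t + w_t,  y_t = Φ_t k_t + e_t.
# Φs is a vector of n×K regressor matrices, ys a vector of length-n measurements,
# Q and R are the (tuning) covariances of w_t and e_t.
function kalman_parameter_filter(Φs, ys, k0, P0, Q, R)
    k, P = copy(k0), copy(P0)
    ks = typeof(k)[]
    for (Φt, yt) in zip(Φs, ys)
        P = P + Q                        # time update: random-walk prior on the parameters
        S = Φt * P * Φt' + R             # innovation covariance
        Kg = (P * Φt') / S               # Kalman gain
        k = k + Kg * (yt - Φt * k)       # measurement update
        P = (I - Kg * Φt) * P
        push!(ks, copy(k))
    end
    return ks                            # filtered parameter trajectory k_{t|t}
end

For the state-space case, one possible construction is Φt = kron([x_t; u_t]', I(n)) with k_t = vec([A_t B_t]); a subsequent Rauch-Tung-Striebel smoothing pass, with the prior entering, e.g., as an extra measurement at each time step, gives the posterior mean described in the text.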
\begin{figure} \centering - \setlength{\figurewidth}{0.55\linewidth} - \setlength{\figureheight }{4cm} + \setlength{\figurewidth}{0.99\linewidth} + \setlength{\figureheight }{5cm} \input{figs/states.tex} \figurecaptionreduction \caption{Piecewise constant state-space dynamics. True state values are shown with dashed lines. Gaussian state-transition and measurement noise with $\sigma = 0.2$ were added.} @@ -468,7 +459,7 @@ The input was Gaussian noise of zero mean and unit variance, state transition no \begin{figure} \centering \setlength{\figurewidth}{0.99\linewidth} - \setlength{\figureheight }{5cm} + \setlength{\figureheight }{6cm} \input{figs/ss.tex} \figurecaptionreduction \caption{Piecewise constant state-space dynamics. True values are shown with dashed, black lines. Gaussian state-transition and measurement noise with $\sigma = 0.2$ were added.} @@ -477,12 +468,23 @@ The input was Gaussian noise of zero mean and unit variance, state transition no +\section{Reinforcement learning $\star$} +In this example, we use the proposed methods to identify LTV dynamics models for +reinforcement learning. The task is to stabilize a pendulum attached to a moving +cart by means of moving the cart, with bounds on the control signal and a quadratic +cost on states and control. To this end, we perform a series of experiments, +after each of which we 1) fit a dynamics model, 2) optimize the cost function under +the model using differential dynamic programming,\footnote{Implementation made available at +\href{github.com/baggepinnen/DifferentialDynamicProgramming.jl}{github.com/baggepinnen/DifferentialDynamicProgramming.jl}} +with bounds on the deviation between the new trajectory and the last trajectory +in order to stay close to the validity region of the model. This reinforcement-learning strategy has shown great promise lately and is referred to as trust-region policy optimization~\cite{schulman2015trust}. We compare three different models, +the ground truth system model, an LTV model (\labelcref{eq:smooth}) and an LTI model. +The total cost over $T=500$ time steps is shown as a function of learning iteration +in~\cref{fig:ilc}. The figure illustrates how the learning procedure reaches the +optimal cost of the ground truth model when an LTV model is used, whereas when +using an LTI model, the learning diverges. The figure further illustrates that if +the LTV model is fit using a prior (\cref{sec:kalmanmodel}), the learning speed is increased. The prior in this case was constructed from the linearized system model, which would be unavailable in a real application, but it serves to indicate the effectiveness of including a prior in this example. -\section{Applications} -\subsection{Reinforcement learning $\star$} -In this example, we use the proposed methods to identify LTV dynamics models for reinforcement learning. The task is to stabilize a pendulum attached to a moving cart by means of moving the cart, with bounds on the control signal and a quadratic cost on states and control. To this end, we perform a series of experiments, whereafter each we 1) fit a dynamics model, 2) optimize the cost function under the model using differential dynamic programming,\footnote{Implementation made available at \href{github.com/baggepinnen/DifferentialDynamicProgramming.jl}{github.com/baggepinnen/DifferentialDynamicProgramming.jl}} with bounds on the deviation between the new trajectory and the last trajectory in order to stay close to the validity region of the model.
We compare three different models, the ground truth system model, an LTV model (\labelcref{eq:smooth}) and an LTI model. The total cost over $T=500$ time steps is shown as a function of learning iteration in~\cref{fig:ilc}. The figure illustrates how the learning procedure reaches the optimal cost of the ground truth model when an LTV model is used, whereas when using an LTI model, the learning diverges. The figure further illustrates that if the LTV model is fit using a prior (\cref{sec:kalmanmodel}), the learning speed is increased. - -\cmt{Mention where prior came from. For what noise level does method break down with/without prior? Fix NN implementation and get prior from there.} \begin{figure}[htp] \centering @@ -492,31 +494,32 @@ In this example, we use the proposed methods to identify LTV dynamics models for \caption{Reinforcement learning example. Three different model types are used to iteratively optimize the trajectory of a pendulum on a cart.} \label{fig:ilc} \end{figure} -\subsection{Robot-program segmentation $\star$} -In this section we demonstrate how the proposed methods can be utilized to segment a composite robot program into subprograms. The intuition that guides us is that during, e.g., contact operation, the system dynamics changes abruptly as new contacts are established. These changes in dynamics serve as candidates for segment boundaries. This method share similarity with~\cite{fox2014bparhmm}, where a beta-process prior is utilized to find sets of AR-models and assign parts of the time-series to each individual dynamics model. \cmt{Extend} -The application we consider is the mating of two plastic pieces, described in detail in~\cite{stolt2015robotic}. While several different candidates for input-output assignment exists, we build a model from recorded forces and torques to joint coordinates. The experimental data, normalized to zero mean and unit variance, is shown in~\cref{fig:joints}. To perform a segmentation, we estimate an LTV-model with piecewise constant dynamics evolution (\cref{sec:pwconstant}) and find the points where the discontinuities in the estimated model parameters occur. The sum of squared time differences over all model coefficients, $a_t$, is shown in the blue line in~\cref{fig:segments}. A simple peak finding algorithm with thresholds on height and distance between peaks was used to identify the most prominent changes, marked as automatic segments in the figure. The robot programmer who recorded the program has indicated his selection of segmentation points, marked as manual segments. -\begin{figure} - \centering - \setlength{\figurewidth}{0.6\linewidth} - \setlength{\figureheight }{4cm} - \input{figs/joints.tex} - \figurecaptionreduction - \caption{Normalized experimental data.} - \label{fig:joints} -\end{figure} -\begin{figure} - \centering - \setlength{\figurewidth}{0.99\linewidth} - \setlength{\figureheight }{7cm} - \input{figs/segmented.tex} - \figurecaptionreduction - \caption{Segmentation result with LTV model from force to joint coordinates. Automatic segmentation is compared to manual segmentation.} - \label{fig:segments} -\end{figure} +% \subsection{Robot-program segmentation $\star$} +% In this section we demonstrate how the proposed methods can be utilized to segment a composite robot program into subprograms. The intuition that guides us is that during, e.g., contact operation, the system dynamics changes abruptly as new contacts are established. These changes in dynamics serve as candidates for segment boundaries. 
This method share similarity with~\cite{fox2014bparhmm}, where a beta-process prior is utilized to find sets of AR-models and assign parts of the time-series to each individual dynamics model. \cmt{Extend} +% +% The application we consider is the mating of two plastic pieces, described in detail in~\cite{stolt2015robotic}. While several different candidates for input-output assignment exists, we build a model from recorded forces and torques to joint coordinates. The experimental data, normalized to zero mean and unit variance, is shown in~\cref{fig:joints}. To perform a segmentation, we estimate an LTV-model with piecewise constant dynamics evolution (\cref{sec:pwconstant}) and find the points where the discontinuities in the estimated model parameters occur. The sum of squared time differences over all model coefficients, $a_t$, is shown in the blue line in~\cref{fig:segments}. A simple peak finding algorithm with thresholds on height and distance between peaks was used to identify the most prominent changes, marked as automatic segments in the figure. The robot programmer who recorded the program has indicated his selection of segmentation points, marked as manual segments. +% \begin{figure} +% \centering +% \setlength{\figurewidth}{0.6\linewidth} +% \setlength{\figureheight }{4cm} +% \input{figs/joints.tex} +% \figurecaptionreduction +% \caption{Normalized experimental data.} +% \label{fig:joints} +% \end{figure} +% +% \begin{figure} +% \centering +% \setlength{\figurewidth}{0.99\linewidth} +% \setlength{\figureheight }{7cm} +% \input{figs/segmented.tex} +% \figurecaptionreduction +% \caption{Segmentation result with LTV model from force to joint coordinates. Automatic segmentation is compared to manual segmentation.} +% \label{fig:segments} +% \end{figure} -\cmt{Maybe include an example with a disturbance} \section{Discussion} @@ -558,6 +561,12 @@ However, all terms $\lambda\normt{\Delta k}^2 = (\Delta k)\T (\lambda I) (\Delta can be generalized to $(\Delta k)\T \Lambda (\Delta k)$, where $\Lambda$ is an arbitrary positive definite matrix. This allows incorporation of different scales for different variables with little added implementation complexity. +The main benefit of the regularization is that it represents a prior belief over the nature of the change in the dynamics parameters. By postulating a prior belief that the dynamics parameters change in a certain way, less demand is put on the data required for identification. The identification process will thus not interfere with operation in the same way as it would if noise were added to the input for identification purposes. This allows learning of flexible, over-parametrized models that fit the available data well. The proposed identification methods are therefore attractive in applications such as guided policy search (GPS)~\cite{levine2013guided, levine2015learning} and non-linear iterative learning control (ILC)~\cite{bristow2006survey}, where they can lead to dramatically decreased sample complexity. + +\addtolength{\textheight}{-16.5cm} + +The drawbacks of LTV models include situations where the observed time variation comes from non-linear effects. Non-linearities in the system may cause an apparent time-variation as the state varies. While an LTV-model can capture this variation, it will fail to generalize well to a subsequent experiment where the states take a significantly different trajectory.
Many non-linear systems are, however, approximately \emph{locally} linear, such that they are well described by a linear model in a small neighborhood around the linearization/operating point. LTV models can thus be efficiently used for model-based reinforcement learning with variants of trust-region based optimization~\cite{schulman2015trust}. + % \printbibliography @@ -590,6 +599,6 @@ This allows incorporation of different scales for different variables with littl x = linspace(-1.3,1.3,100) z1 = [norm([x,y]) for x in x, y in x] z2 = [norm([x,y])^2 for x in x, y in x] -surface(x,x,z2, colorbar=false, alpha=0.2) -surface!(x,x,z1, fillalpha=1, grid=false) +contour(x,x,z2, colorbar=false, alpha=0.2, c=:red) +contour!(x,x,z1, fillalpha=1, grid=false, c=:blue) % savetikz("/work/fredrikb/reinforcementlearning/jump_lin_id/figs/surf.tex")