From a6a408ba580ebc5bd969eb4e771df4c4c5839d83 Mon Sep 17 00:00:00 2001
From: baggepinnen <cont-frb@ulund.org>
Date: Fri, 16 Feb 2018 17:00:20 +0100
Subject: [PATCH] update id_paper

---
 jump_lin_id/bibtexfile.bib | 12 ++++----
 jump_lin_id/id_paper.tex   | 56 +++++++++++++++++++-------------------
 jump_lin_id/two_link.jl    |  2 +-
 3 files changed, 35 insertions(+), 35 deletions(-)

diff --git a/jump_lin_id/bibtexfile.bib b/jump_lin_id/bibtexfile.bib
index 1de07dd..6f455fc 100644
--- a/jump_lin_id/bibtexfile.bib
+++ b/jump_lin_id/bibtexfile.bib
@@ -21,7 +21,7 @@
   title={Discrete-time Markov jump linear systems},
   author={Costa, Oswaldo Luiz Valle and Fragoso, Marcelo Dutra and Marques, Ricardo Paulino},
   year={2006},
-  publisher={Springer Science \& Business Media}
+  publisher={Springer Science \& Business Media, London}
 }
 
 @article{nagarajaiah2004time,
@@ -108,7 +108,7 @@
 @inproceedings{levine2013guided,
   title={Guided policy search},
   author={Levine, Sergey and Koltun, Vladlen},
-  booktitle={Proceedings of the 30th International Conference on Machine Learning (ICML-13)},
+  booktitle={Int. Conf. Machine Learning (ICML), Atlanta},
   pages={1--9},
   year={2013}
 }
@@ -116,7 +116,7 @@
 @inproceedings{levine2015learning,
   title={Learning contact-rich manipulation skills with guided policy search},
   author={Levine, Sergey and Wagener, Nolan and Abbeel, Pieter},
-  booktitle={Robotics and Automation (ICRA), 2015 IEEE International Conference on},
+  booktitle={Robotics and Automation (ICRA), IEEE Int. Conf.},
   pages={156--163},
   year={2015},
   organization={IEEE}
@@ -136,7 +136,7 @@
 @inproceedings{schulman2015trust,
   title={Trust region policy optimization},
   author={Schulman, John and Levine, Sergey and Abbeel, Pieter and Jordan, Michael and Moritz, Philipp},
-  booktitle={Proceedings of the 32nd International Conference on Machine Learning (ICML-15)},
+  booktitle={Int. Conf. Machine Learning (ICML-15)},
   pages={1889--1897},
   year={2015}
 }
@@ -144,7 +144,7 @@
 @inproceedings{vidal2002observability,
   title={Observability and identifiability of jump linear systems},
   author={Vidal, Ren{\'e} and Chiuso, Alessandro and Soatto, Stefano},
-  booktitle={Decision and Control, 2002, Proceedings of the 41st IEEE Conference on},
+  booktitle={IEEE Conf. Decision and Control (CDC), Las Vegas},
   volume={4},
   pages={3614--3619},
   year={2002},
@@ -154,7 +154,7 @@
 @article{parikh2014proximal,
   title={Proximal algorithms},
   author={Parikh, Neal and Boyd, Stephen and others},
-  journal={Foundations and Trends{\textregistered} in Optimization},
+  journal={Foundations and Trends in Optimization},
   volume={1},
   number={3},
   pages={127--239},
diff --git a/jump_lin_id/id_paper.tex b/jump_lin_id/id_paper.tex
index 793ce2f..4219fa2 100644
--- a/jump_lin_id/id_paper.tex
+++ b/jump_lin_id/id_paper.tex
@@ -16,6 +16,7 @@
 \usepackage{amssymb}
 \usepackage{units}
 \usepackage{afterpage}
+\usepackage{microtype}
 %\usepackage[english]{varioref}
 \usepackage{gensymb}
 %\usepackage[mathletters]{ucs}
@@ -148,7 +149,7 @@ Identification of LTV Dynamical Models with\\ Smooth or Discontinuous Time Evolu
 \pagestyle{empty}
 
 \begin{abstract}
-    We establish a connection between trend filtering and system identification which results in a family of new identification methods for linear, time-varying (LTV) dynamical models based on convex optimization. We demonstrate how the design of the cost function promotes either a continuous change in dynamics over time, or causes discontinuous changes in model coefficients occurring at a finite (sparse) set of time instances. We further discuss the introduction of priors on the model parameters for situations where excitation is insufficient for identification. The identification problems are cast as convex optimization problems and are applicable to, e.g., ARX-models and state-space models with time-varying parameters. We illustrate usage of the methods in simulations of jump-linear systems, a nonlinear robot arm with non-smooth friction and stiff contacts as well as in model-based, trajectory centric reinforcement learning on a smooth nonlinear system.
+    We establish a connection between trend filtering and system identification which results in a family of new identification methods for linear, time-varying (LTV) dynamical models based on convex optimization. We demonstrate how the design of the cost function promotes a model with either a continuous change in dynamics over time, or discontinuous changes in model coefficients occurring at a finite (sparse) set of time instances. We further discuss the introduction of priors on the model parameters for situations where excitation is insufficient for identification. The identification problems are cast as convex optimization problems and are applicable to, e.g., ARX models and state-space models with time-varying parameters. We illustrate usage of the methods in simulations of jump-linear systems, a nonlinear robot arm with non-smooth friction and stiff contacts as well as in model-based, trajectory centric reinforcement learning on a smooth nonlinear system.
     % \cmt{Rewrote abstract to make contribution stand out from previous work and highlight interesting nonlinear and non-smooth nature of examples.}
 \end{abstract}
 % \begin{keywords}
@@ -163,17 +164,17 @@ Identification of LTV Dynamical Models with\\ Smooth or Discontinuous Time Evolu
 
 The difficulty of the task of identifying time-varying dynamical models of systems varies greatly with the model considered and the availability of measurements of the state sequence. For smoothly changing dynamics, linear in the parameters, the recursive least-squares algorithm with exponential forgetting (RLS$\lambda$) is a common option. If a Gaussian random-walk model for the parameters is assumed, a Kalman filtering/smoothing algorithm \cite{rauch1965maximum} gives the filtering/smoothing densities of the parameters in closed form. The assumption of smoothly (Gaussian) varying dynamics is often restrictive. Discontinuous dynamics changes occur, for instance, when an external controller changes operation mode, when a sudden contact between a robot and its environment is established, an unmodeled disturbance enters the system or when a system is suddenly damaged.
 
-Identification of systems with non-smooth dynamics evolution has been studied extensively. The book~\cite{costa2006discrete} treats the case where the dynamics are known, but the state sequence unknown, i.e., state estimation. In~\cite{nagarajaiah2004time}, the authors examine the residuals from an initial constant dynamics fit to determine regions in time where improved fit is needed by the introduction of additional constant dynamics models. Results on identifiability and observability in jump-linear systems in the non-controlled (autonomous) setting are available in~\cite{vidal2002observability}. The main result on identifiability in \cite{vidal2002observability} was a rank condition on a Hankel matrix constructed from the collected output data, similar to classical results on the least-squares identification of ARX-models which appears as rank constraints on the, typically Toeplitz or block-Toeplitz, regressor matrix. Identifiability of the methods proposed in this article are discussed in~\cref{sec:identifiability}.
+Identification of systems with non-smooth dynamics evolution has been studied extensively. The book~\cite{costa2006discrete} treats the case where the dynamics are known, but the state sequence unknown, i.e., state estimation. In~\cite{nagarajaiah2004time}, the authors examine the residuals from an initial constant dynamics fit to determine regions in time where improved fit is needed by the introduction of additional constant dynamics models. Results on identifiability and observability in jump-linear systems in the non-controlled (autonomous) setting are available in~\cite{vidal2002observability}. The main result on identifiability in \cite{vidal2002observability} was a rank condition on a Hankel matrix constructed from the collected output data, similar to classical results on the least-squares identification of ARX models which appear as rank constraints on the, typically Toeplitz or block-Toeplitz, regressor matrix. Identifiability of the methods proposed in this article is discussed in~\cref{sec:identifiability}.
 
-An LTV model can be seen as a first order approximation of the dynamics of a nonlinear system around a trajectory. We emphasize that such an approximation will in general fail to generalize far from the this trajectory, but many methods in reinforcement learning and control make efficient use of the linearized dynamics for optimization, while ensuring validity of the approximation by constraints or penalty terms. An example provided in \cref{sec:rl} highlights such a method.
+An LTV model can be seen as a first-order approximation of the dynamics of a nonlinear system around a trajectory. We emphasize that such an approximation will in general fail to generalize far from this trajectory, but many methods in reinforcement learning and control make efficient use of the linearized dynamics for optimization, while ensuring validity of the approximation by constraints or penalty terms. An example provided in \cref{sec:rl} highlights such a method.
 
-An important class of identification methods that has been popularized lately is \emph{trend filtering} methods~\cite{kim2009ell_1, tibshirani2014adaptive}. Trend filtering methods work by specifying a \emph{fitness criterion} that determines the goodness of fit, as well as a \emph{regularization} term, often chosen with sparsity promoting qualities. As a simple example, consider the reconstruction $\hat y$ of a noisy signal $y = \left\{y_t\inspace{}\right\}_{t=1}^T$ with piecewise constant segments. To this end, we might formulate and solve the convex optimization problem
+An important class of identification methods that has been popularized lately is \emph{trend filtering} methods~\cite{kim2009ell_1, tibshirani2014adaptive}. Trend filtering methods work by specifying a \emph{fitness criterion} that determines the goodness of fit, as well as a \emph{regularization} term, often chosen with sparsity promoting qualities. As a simple example, consider the reconstruction $\hat y$ of a noisy signal $y = \{y_t\inspace{}\}_{t=1}^T$ with piecewise constant segments. To this end, we may formulate and solve the convex optimization problem
 \begin{equation} \label{eq:tf}
    \minimize{\hat{y}} \normt{y-\hat{y}}^2 + \lambda\sum_t |\hat{y}_{t+1} - \hat{y}_t|
 \end{equation}
 The first term is the fitness criterion or \emph{loss function}, whereas the second term is a sparsity-promoting regularizer which promotes small changes between consecutive samples in the reconstructed signal. The sparsity promoting effect of the 1-norm regularizer is well known, and stems from the constant length of the gradient whenever the argument is non-zero~\cite{murphy2012machine}. Compare this to the squared difference, for which the gradient rapidly vanishes as the argument approaches zero. The squared difference will thus promote \emph{small} arguments, whereas the 1-norm promotes \emph{sparse} arguments.
 
-In this work, we will draw inspiration from the trend-filtering literature to develop new system identification methods for LTV models, with interesting properties. In trend filtering, we decompose a curve as a set of polynomial segments. In the identification methods proposed in this work, we instead decompose a multivariable state sequence as the output of a set of LTV models, where the model coefficients evolve as polynomial functions of time. We start by defining a set of optimization problems with a least-squares loss function and carefully chosen regularization terms. We further discuss how prior information can be utilized to increase the accuracy of the identification and end the article with identification of a nonlinear system with non-smooth friction and an example of model-based reinforcement learning and a discussion.
+In this work, we will draw inspiration from the trend-filtering literature to develop new system identification methods for LTV models, with interesting properties. In trend filtering, we decompose a curve as a set of polynomial segments. In the identification methods proposed in this work, we instead decompose a multivariable state sequence as the output of a set of LTV models, where the model coefficients evolve as polynomial functions of time. We start by defining a set of optimization problems with a least-squares loss function and carefully chosen regularization terms. We further discuss how prior information can be utilized to increase the accuracy of the identification and end the article with identification of a nonlinear system with non-smooth friction and an example of model-based reinforcement learning followed by a discussion.
 
 \section{LTI identification}\label{sec:lti}
 
@@ -218,10 +219,10 @@ where the parameters $\w$ are assumed to evolve according to the dynamical syste
     \label{eq:dynsys}
 \begin{split}
    k_{t+1} &= H_t k_t + w_t\\
-   y_t &= \big(I_n \otimes \bmatrixx{x\T & u\T}\big) \w_t
+   y_t &= \big(I_n \otimes \bmatrixx{x_t\T & u_t\T}\big) \w_t
 \end{split}
 \end{equation}
-where, if no prior knowledge is available, the dynamics matrix $H_t$ can be taken as the identity matrix. $H = I$ implies that the model coefficients follow a random walk dictated by the properties of $w_t$, i.e., the state transition density function $p_w(k_{t+1}|k_t)$. The emission density function $p_v(x_{t+1} | x_t, u_t, k_t)$ is determining the drift of the state, which for the parameter estimation problem can be seen as the distribution of measurements, given the current state of the system.
+where, if no prior knowledge is available, the dynamics matrix $H_t$ can be taken as the identity matrix; $H = I$ implies that the model coefficients follow a random walk dictated by the properties of $w_t$, i.e., the state transition density function $p_w(k_{t+1}|k_t)$. The emission density function $p_v(x_{t+1} | x_t, u_t, k_t)$ is determining the drift of the state, which for the parameter estimation problem can be seen as the distribution of measurements, given the current state of the system.
 We emphasize here that the state in the parameter evolution model refers to the current parameters $k_t$ and not the system state $x_t$, hence, $p_v$ is called the emission density and not the transition density. Particular choices of $p_v$ and $p_w$ emit data likelihoods concave in the parameters and hence amenable to convex optimization.
 
 
@@ -278,7 +279,7 @@ If the number of switches in dynamics parameters, $M$, is known in advance, the
    & \subjto &        & \sum_t \textbf{1}\{ \w_{t+1} \neq \w_t\} \leq M
 \end{align}
 where $\textbf{1}\{\cdot\}$ is the indicator function.
-This problem is non-convex and we propose solving it using dynamic programming (DP). For this purpose we modify the algorithm developed in \cite{bellman1961approximation}, an algorithm frequently referred to as segmented least-squares~\cite{bellman1969curve}, where a curve is approximated by piecewise linear segments. The modification lies in the association of each segment (set of consecutive time indices during which the dynamics is constant) with a dynamics model, as opposed to a simple straight line.\footnote{Indeed, if a simple integrator is chosen as dynamics model and a constant input is assumed, the result of our extended algorithm reduces to the segmented least-squares solution.} Unfortunately, the computational complexity of the dynamic programming solution, $\mathcal{O}(T^2K^3)$, becomes prohibitive for large $T$.\footnote{For details regarding the DP algorithm and implementation, the reader is referred to the source-code repository accompanying this article.}
+This problem is non-convex and we propose solving it using dynamic programming (DP). For this purpose we modify the algorithm developed in \cite{bellman1961approximation}, an algorithm frequently referred to as segmented least-squares~\cite{bellman1969curve}, where a curve is approximated by piecewise linear segments. The modification lies in the association of each segment (set of consecutive time indices during which the parameters are constant) with a dynamics model, as opposed to a simple straight line.\footnote{Indeed, if a simple integrator is chosen as dynamics model and a constant input is assumed, the result of our extended algorithm reduces to the segmented least-squares solution.} Unfortunately, the computational complexity of the dynamic programming solution, $\mathcal{O}(T^2K^3)$, becomes prohibitive for large $T$.\footnote{For details regarding the DP algorithm and implementation, the reader is referred to the source-code repository accompanying this article.}
 
 \subsection{Piecewise linear time evolution}\label{eq:pwlinear}
 A piecewise linear signal is characterized by a \emph{sparse second-order time difference}, i.e., it has a small number of changes in the slope. A piecewise linear time-evolution of the dynamics parameters is hence obtained if we solve the optimization problem.
@@ -314,11 +315,11 @@ Norm & $D_n$ & Result    \\ \midrule
 
 \subsection{Two-step refinement}
 
-Since many of the proposed formulations of the optimization problem penalize the size of the changes to the parameters, solutions in which the changes are slightly underestimated are favored. To mitigate this issue, a two-step procedure can be implemented where in the first step, change points are identified. In the second step, the penalty on the one-norm is removed and equality constraints are introduced between consecutive time-indices for which no change in dynamics was indicated by the first step.
+Since many of the proposed formulations of the optimization problem penalize the size of the changes to the parameters, solutions in which the changes are slightly underestimated are favored. To mitigate this issue, a two-step procedure can be implemented where in the first step, change points (knots) are identified. In the second step, the penalty on the one-norm is removed and equality constraints are introduced between consecutive time-indices for which no change in dynamics was indicated by the first step.
 
 The second step can be computed very efficiently by noticing that the problem can be split into several identical sub-problems at the knots identified in the first step. The sub-problems have closed-form solutions if the problem in~\cref{sec:pwconstant} is considered.
 
-To identify the points at which the dynamics change (knots), we observe the argument inside the sum of the regularization term, i.e., $a_{t1} = \normt{ \w_{t+1} - \w_t}$ or $a_{t2} = \normt{\w_{t+2} -2 \w_{t+1} + \w_t}$. Time instances where $a_t$ is taking non-zero values indicate change points.
+To identify the points at which the dynamics change, we observe the argument inside the sum of the regularization term, i.e., $a_{t1} = \normt{ \w_{t+1} - \w_t}$ or $a_{t2} = \normt{\w_{t+2} -2 \w_{t+1} + \w_t}$. Time instances where $a_t$ is taking non-zero values indicate change points.
 
 
 \section{Dynamics prior and Kalman filtering}
@@ -344,7 +345,7 @@ which factors conveniently due to the Markov property of a state-space model.
 For particular choices of density functions in~\labelcref{eq:lcdll}, notably Gaussian and Laplacian, the negative likelihood function becomes convex. The next section will elaborate on the Gaussian case and introduce a recursive algorithm that solves for the full posterior efficiently. The Laplacian case, while convex, does not admit an equally efficient algorithm, but is more robust to outliers in the data.
 
 \subsection{Gaussian case} \label{sec:kalmanmodel}
-If all densities in~\labelcref{eq:lcdll} are Gaussian and $\w$ is modeled with the Brownian random walk model \labelcref{eq:dynsys} (Gaussian $v_t$), \labelcref{eq:lcdll} can be written on the form (constants omitted)
+If all densities in~\labelcref{eq:lcdll} are Gaussian and $\w$ is modeled with the Brownian random walk model \labelcref{eq:dynsys} (Gaussian $v_t$), \labelcref{eq:lcdll} can be written in the form (scaling constants omitted)
 \begin{equation}
     \begin{split}\label{eq:prioropt}
     -\log p(\w,y|x,z)_{1:T} &=
@@ -363,7 +364,7 @@ In this special case, we introduce a recursive solution given by a modified Kalm
         \hat{x}_{t|t} &= \hat{x}_{t|t-1} + K_t\big(y_t-C\hat{x}_{t|t-1}\big)\\
         P_{t|t} &= P_{t|t-1} - K_t C P_{t|t-1}
     \end{align}
-    where $x$ is the state vector, with state-drift covariance $R_1$ and $C$ is a matrix that relates $x$ to a measurement $y$ with covariance $R_2$ according to $y=Cx$. The first two equations constitute the \emph{prediction} step, and the last two equations incorporate the measurement $y_t$ in the \emph{correction} step.
+    where $x$ is the state vector, with state-drift covariance $R_1$ and $C$ is a matrix that relates $x$ to a measurement $y=Cx$ with covariance $R_2$. The first two equations constitute the \emph{prediction} step, and the last two equations incorporate the measurement $y_t$ in the \emph{correction} step.
     The modification required to incorporate a Gaussian prior on the state variable $p(x_t|v_t) = \N(\mu_0(v_t), \Sigma_0(v_t))$ involves a repeated correction step and takes the form
     \begin{align}
         \bar{K}_t &= P_{t|t}\big(P_{t|t}+\Sigma_0(v_t)\big)^{-1}\label{eq:postk}\\
@@ -384,7 +385,7 @@ The Kalman smoother can be used for solving identification problems like~\labelc
 To develop a Kalman-filter based algorithm for solving~\labelcref{eq:smooth}, we augment the model \labelcref{eq:dynsys} with the state variable $k^\prime_t = k_{t} - k_{t-1}$ and note that $k^\prime_{t+1} - k^\prime_t = k_{t+1} - 2k_t + k_{t-1}$. We thus introduce the augmented-state model
 \begin{align}\label{eq:dynsys2}
    \begin{bmatrix}k_{t+1} \\ k^\prime_{t+1}\end{bmatrix} &= \begin{bmatrix}I_K & I_K \\ 0_K & I_K\end{bmatrix} \begin{bmatrix}k_{t} \\ k^\prime_{t}\end{bmatrix} + \begin{bmatrix}0 \\ w_{t}\end{bmatrix}\\
-   y_t &=  \begin{bmatrix}\big(I_n \otimes \bmatrixx{x\T & u\T}\big) & 0\end{bmatrix} \begin{bmatrix}k_{t} \\ k^\prime_{t}\end{bmatrix}
+   y_t &=  \begin{bmatrix}\big(I_n \otimes \bmatrixx{x_t\T & u_t\T}\big) & 0\end{bmatrix} \begin{bmatrix}k_{t} \\ k^\prime_{t}\end{bmatrix}
 \end{align}
 which is on a form suitable for filtering/smoothing with the machinery developed above.
 
@@ -411,8 +412,8 @@ by noting that the problem of finding $A$ in $x_{t+1} = Ax_t$ given a pair
 $(x_{t+1},x_t)$ is an ill-posed problem in the sense that the solution is non
 unique. If we are given several pairs $(x_{t+1},x_t)$, for different $t$, while
 $A$ remains constant, the problem becomes over-determined and well-posed in the
-least-squares sense, provided that the vectors $\left\{x_t^{(i)}\right\}_{t=1}^T$ span $\mathbb{R}^n$. The LTI-case in~\cref{sec:lti} is well posed according to
-classical results, when the $\Phi$ has full column rank.
+least-squares sense, provided that the vectors $\{x_t^{(i)}\}_{t=1}^T$ span $\mathbb{R}^n$. The LTI-case in~\cref{sec:lti} is well posed according to
+classical results, when $\Phi$ has full column rank.
 
 When we extend our view to LTV models, the number of free
 parameters is increased significantly, and the corresponding
@@ -421,11 +422,10 @@ rank and the introduction of a regularization term is necessary.
 Informally, for every $n$ measurements, we have $K=n^2+nm$
 free parameters. If we consider the identification problem
 of~\cref{eq:pwconstant} and let
-$\lambda \rightarrow \infty$, the regularizer is
-essentially replaced with equality constraints.
+$\lambda \rightarrow \infty$, the regularizer terms essentially become equality constraints.
 This will enforce a solution in which all parameters
 in $k$ are constant over time, and the problem reduces
-to the LTI-problem. As $\lambda$ decreases from infinity
+to the LTI-problem. As $\lambda$ decreases,
 the effective number of free parameters increases until the
 problem gets ill-posed for $\lambda = 0$.
 We formalize the above arguments as
@@ -433,15 +433,15 @@ We formalize the above arguments as
     Optimization problems \labelcref{eq:slow,eq:pwconstant} have unique global minima for $\lambda > 0$ if and only if the corresponding LTI optimization problem has a unique solution.
 \end{proposition}
 \begin{proof}
-    The cost function is a sum of two convex terms. For a global minimum to be non-unique, the Hessians of the two terms must share nullspace.
+    The cost function is a sum of two convex terms. For a global minimum to be non-unique, the Hessians of the two terms must have intersecting nullspaces.
     In the limit $\lambda \rightarrow \infty$ the problem reduces to the LTI problem. The nullspace of the regularization Hessian, which is invariant to $\lambda$, does thus not share any directions with the nullspace of $\tilde{\A}\T \tilde{\A}$ which establishes the equivalence of identifiability between the LTI problem and the LTV problems.
 \end{proof}
 \begin{proposition}
-    Optimization problems \labelcref{eq:smooth,eq:pwlinear} have unique global minima for $\lambda > 0$ if and only if the corresponding LTI optimization problem has a unique solution and the Jacobian of the generating system is bounded along parts of the trajectory.
+    Optimization problems \labelcref{eq:smooth,eq:pwlinear} with higher-order differentiation in the regularization term have unique global minima for $\lambda > 0$ if and only if the corresponding LTI optimization problem has a unique solution and the Jacobian of the generating system is bounded along the trajectory.
 \end{proposition}
 \begin{proof}
-    Again, the cost function is a sum of two convex terms and for a global minimum to be non-unique, the Hessians of the two terms must share nullspace.
-    In the limit $\lambda \rightarrow \infty$ the regularization term reduces to a linear constraint set, allowing only parameter vectors that lie along a line through time. Let $v \neq 0$ be such a line and let $L$ be an upper bound on the Frobenius norm of the Jacobian. $v \in \operatorname{null}{(\tilde{\A}\T \tilde{\A})}$ implies $\exists \alpha : \norm{k+\alpha v} > L$ which establishes the equivalence of identifiability between the LTI problem and the LTV problems.
+    Again, the cost function is a sum of two convex terms and for a global minimum to be non-unique, the Hessians of the two terms must have intersecting nullspaces.
+    In the limit $\lambda \rightarrow \infty$ the regularization term reduces to a linear constraint set, allowing only parameter vectors that lie along a line through time. Let $v \neq 0$ be such a vector and let $L$ be an upper bound on the Frobenius norm of the Jacobian. $v \in \operatorname{null}{(\tilde{\A}\T \tilde{\A})}$ implies that the loss is invariant to the perturbation $k+\alpha v$ for an arbitrary $\alpha$. However, $\exists \alpha : \norm{k+\alpha v} > L$ which establishes the equivalence of identifiability between the LTI problem and the LTV problems.
 \end{proof}
 
 For the LTI problem to be well-posed, the system must be identifiable and the input $u$ must be persistently exciting of sufficient order~\cite{johansson1993system}.
@@ -527,14 +527,14 @@ The input was Gaussian noise of zero mean and unit variance, state transition no
 \section{Example -- Non-smooth robot arm with stiff contact}
 To illustrate the ability pf the proposed models to represent the non-smooth dynamics along a trajectory of a robot arm, we simulate a two-link robot with discontinuous Coulomb friction. Additionally, we let the robot establish a stiff contact with the environment to illustrate both strengths and weaknesses of the modeling approach.
 
-The state of the robot arm consists of two joint coordinates, $q$, and their time derivatives, $\dot q$. \Cref{fig:robot_train} illustrates the state trajectories, control torques and simulations of a model estimated with~\labelcref{eq:pwconstant}. The figure clearly illustrates that the model is able to capture the dynamics both during the non-smooth sign change of the velocity, but also during establishment of the stiff contact. The learned dynamics of the contact is however time-dependent, which is illustrated in \Cref{fig:robot_val}, where the model is used on a validation trajectory where a different noise sequence was added to the control torque. Due to the novel input signal, the contact is established at a different time-instant and as a consequence, there is an error transient in the simulated data.
+The state of the robot arm consists of two joint coordinates, $q$, and their time derivatives, $\dot q$. \Cref{fig:robot_train} illustrates the state trajectories, control torques and simulations of a model estimated by solving~\labelcref{eq:pwconstant}. The figure clearly illustrates that the model is able to capture the dynamics both during the non-smooth sign change of the velocity, but also during establishment of the stiff contact. The learned dynamics of the contact is however time-dependent, which is illustrated in \Cref{fig:robot_val}, where the model is used on a validation trajectory where a different noise sequence was added to the control torque. Due to the novel input signal, the contact is established at a different time-instant and as a consequence, there is an error transient in the simulated data.
 \begin{figure*}[htp]
     \centering
     \setlength{\figurewidth}{0.5\linewidth}
     \setlength{\figureheight }{4cm}
     \input{figs/robot_train.tex}
     % \includegraphics[width=\linewidth]{figs/robot_train}
-    \caption{Simulation of non-smooth robot dynamics with stiff contact -- training data vs. sample time index. The sign change in velocity, and hence a discontinuous change in friction torque, occurs in the time interval 50-100 and the contact is established in the time interval 100-150. For numerical stability, all time-series are normalized to zero mean and unit variance, hence, the velocity zero crossing is explicitly marked with a dashed line. The control signal plot clearly indicates the discontinuity in torque around the unnormalized zero crossing of $\dot{q}_2$.}
+    \caption{Simulation of non-smooth robot dynamics with stiff contact -- training data vs. sample time index. The sign change in velocity, and hence a discontinuous change in friction torque, occurs in the time interval 50-100 and the contact is established in the time interval 100-150. For numerical stability, all time-series are normalized to zero mean and unit variance, hence, the original velocity zero crossing is explicitly marked with a dashed line. The control signal plot clearly indicates the discontinuity in torque around the unnormalized zero crossing of $\dot{q}_2$.}
     \label{fig:robot_train}
 \end{figure*}
 \begin{figure*}[htp]
@@ -542,7 +542,7 @@ The state of the robot arm consists of two joint coordinates, $q$, and their tim
     \setlength{\figurewidth}{0.5\linewidth}
     \setlength{\figureheight }{4cm}
     \input{figs/robot_val.tex}
-    \caption{Simulation of non-smooth robot dynamics with stiff contact -- validation data vs. sample time index. The dashed lines indicate the event times for the training data, highlighting that the model is able to deal effortless with the non-smooth friction, but inaccurately predicts the time evolution around the contact event.}
+    \caption{Simulation of non-smooth robot dynamics with stiff contact -- validation data vs. sample time index. The dashed lines indicate the event times for the training data, highlighting that the model is able to deal effortlessly with the non-smooth friction, but inaccurately predicts the time evolution around the contact event which now occurs at a slightly different time instance.}
     \label{fig:robot_val}
 \end{figure*}
 
@@ -560,7 +560,7 @@ the model using iterative LQG (differential dynamic programming),\footnote{Imple
 \href{github.com/baggepinnen/DifferentialDynamicProgramming.jl}{github.com/baggepinnen/DifferentialDynamicProgramming.jl}} an algorithm that calculates the value function exactly under the LTV dynamics and a quadratic expansion of the cost function.
 In order to stay close to the validity region of the linear model, we put bounds on the deviation between each new trajectory and the last trajectory.
 % This reinforcement-learning strategy has shown great promise lately and is part of the guided policy search framework \cite{levine2013guided} and has similarities to trust-region policy optimization~\cite{schulman2015trust}.
-We compare three different models,
+We compare three different models:
 the ground truth system model, an LTV model (obtained by solving \labelcref{eq:smooth}) and an LTI model.
 The total cost over $T=500$ time steps is shown as a function of learning iteration
 in~\cref{fig:ilc}. The figure illustrates how the learning procedure reaches the
@@ -620,9 +620,9 @@ the LTV model is fit using a prior (\cref{sec:kalmanmodel}), the learning speed
 % linear given the input-output sequences $u$ and $y$. Extension to time-varying
 % dynamics is straight forward.
 
-This article presents methods for estimation of linear, time-varying models. When estimating an LTV model from a trajectory obtained from a nonlinear system, one is effectively estimating the linearization of the system around that trajectory. A first order approximation to a nonlinear system is not guaranteed to generalize well as deviations from the trajectory becomes large. Many non-linear systems are, however, approximately \emph{locally} linear, such that they are well described by a linear model in a small neighborhood around the linearization/operating point. For certain methods, such as iterative learning control and trajectory centric reinforcement learning, a first order approximation to the dynamics is used for efficient optimization, while the validity of the approximation is ensured by incorporating penalties or constraints between two consecutive trajectories.
+This article presents methods for estimation of linear, time-varying models. The methods presented extend directly to nonlinear models that remain \emph{linear in the parameters}. When estimating an LTV model from a trajectory obtained from a nonlinear system, one is effectively estimating the linearization of the system around that trajectory. A first-order approximation to a nonlinear system is not guaranteed to generalize well as deviations from the trajectory become large. Many nonlinear systems are, however, approximately \emph{locally} linear, such that they are well described by a linear model in a small neighborhood around the linearization/operating point. For certain methods, such as iterative learning control and trajectory-centric reinforcement learning, a first-order approximation to the dynamics is used for efficient optimization, while the validity of the approximation is ensured by incorporating penalties or constraints between two consecutive trajectories.
 
-The methods presented allow very efficient learning of this first order approximation due to the prior belief over the nature of the change in dynamics parameters, encoded by the regularization terms. By postulating a prior belief that the dynamics parameters change in a certain way, less demand is put on the data required for identification. The identification process will thus not interfere with normal operation in the same way as if excessive noise would be added to the input for identification purposes. This allows learning of flexible, over-parametrized models that fit available data well. This makes the proposed identification methods attractive in applications such as guided policy search (GPS)~\cite{levine2013guided, levine2015learning} and non-linear iterative learning control (ILC)~\cite{bristow2006survey}, where they can lead to dramatically decreased sample complexity.
+The methods presented allow very efficient learning of this first-order approximation due to the prior belief over the nature of the change in dynamics parameters, encoded by the regularization terms. By postulating a prior belief that the dynamics parameters change in a certain way, less demand is put on the data required for identification. The identification process will thus not interfere with normal operation in the same way as if excessive noise were added to the input for identification purposes. This allows learning of flexible, over-parametrized models that fit available data well. This makes the proposed identification methods attractive in applications such as guided policy search (GPS)~\cite{levine2013guided, levine2015learning} and nonlinear iterative learning control (ILC)~\cite{bristow2006survey}, where they can lead to dramatically decreased sample complexity.
 % \addtolength{\textheight}{-7cm}
 
 When faced with a system where time-varying dynamics is suspected and no particular
diff --git a/jump_lin_id/two_link.jl b/jump_lin_id/two_link.jl
index 3137c90..1dd4b95 100644
--- a/jump_lin_id/two_link.jl
+++ b/jump_lin_id/two_link.jl
@@ -191,7 +191,7 @@ ind2 = length(sp0)
 fig=plot(xt', label="States", show=false, layout=grid(3,2), subplot=1:4, size=(1800,1000), title="State trajectories", title=["\$q_1\$" "\$q_2\$" "\$\\dot{q}_1\$" "\$\\dot{q}_2\$"], c=:blue, linewidth=2)
 vline!([signch].*ones(1,4), show=false, subplot=1:4, lab="Velocity sign change", l=(:black, :dash))
 vline!([contact].*ones(1,4), show=false, subplot=1:4, lab="Stiff contact", l=(:black, :dash))
-plot!(forward_kin(x[1:2,:])', subplot=5, title="End effector positions", lab=["x" "y"], c=[:blue :red])
+plot!(forward_kin(x[1:2,:])', subplot=5, title="End-effector positions", lab=["x" "y"], c=[:blue :red])
 hline!([wall], l=(:dash, :black), grid=false, subplot=5, lab="Constraint")
 vline!([contact], subplot=5, lab="Stiff contact", l=(:black, :dash))
 plot!(ut[1:2,:]', subplot=6, title="Control torques", lab="", c=[:blue :red])
-- 
GitLab