From 1b484a1ac29f46c32a4456786059f7e75bd41ced Mon Sep 17 00:00:00 2001
From: baggepinnen <cont-frb@ulund.org>
Date: Tue, 23 Jan 2018 16:45:24 +0100
Subject: [PATCH] Updates

---
 flux/nn_prior/nn_prior.jl  | 24 +++++-----
 flux/nn_prior/utilities.jl | 13 ++++--
 jump_lin_id/bibtexfile.bib | 10 ++++
 jump_lin_id/id_paper.tex   | 95 ++++++++++++++++++++------------------
 4 files changed, 79 insertions(+), 63 deletions(-)

diff --git a/flux/nn_prior/nn_prior.jl b/flux/nn_prior/nn_prior.jl
index 22567be..9b30691 100644
--- a/flux/nn_prior/nn_prior.jl
+++ b/flux/nn_prior/nn_prior.jl
@@ -6,10 +6,10 @@ using OrdinaryDiffEq, LTVModels
     using Flux: back!, truncate!, treelike, train!, mse, testmode!, combine, params, jacobian
     using Flux.Optimise: Param, optimiser, RMSProp, expdecay
     wdecay         = 0.0
-    num_params     = 50
+    num_params     = 20
     stepsize       = 0.05
-    n_bootstrap    = 1
-    num_montecarlo = 20
+    n_bootstrap    = 4
+    num_montecarlo = 30
 end
 
 inoffice() = gethostname() ∈ ["billman", "Battostation"]
@@ -66,7 +66,7 @@ end
     if jacprop > 0
         iters = round(Int, iters / sqrt(jacprop+1))
         n,m = sys.ns,sys.n
-        R1,R2 = 0.01eye(n*(n+m)), 10eye(n) # Fast
+        R1,R2 = 0.01eye(n*(n+m)), 100eye(n) # Fast
         P0 = 10000R1
         model = LTVModels.fit_model(LTVModels.KalmanModel, x,u,R1,R2,P0, extend=true)
         xt,ut,yt = [x;u], u, y
@@ -93,7 +93,7 @@ end
 
         m          = modeltype(sys, num_params, activation)
         opt        = [ADAM(params(m), stepsize, decay=0.005); [expdecay(Param(p), wdecay) for p in params(m) if p isa AbstractMatrix]]
-        results    = fit_model(opt, loss(m), m, xt, yt, ut, xv, yv, uv, sys,modeltype, iters=iters ÷ jacprop, doplot=doplot, batch_size = size(yt,2) ÷ jacprop)
+        results    = fit_model(opt, loss(m), m, xt, yt, ut, xv, yv, uv, sys,modeltype, iters=iters, doplot=doplot, batch_size = size(yt,2))
         @pack results = x, u, y, xv, uv, yv
         println("Done: ", it)
         results
@@ -119,7 +119,7 @@ end
 # Produce figures
 # pyplot(reuse=false, show=false)
 # figs1 = fit_system(1, sys, SystemD, 4, doplot=false);     #savefigures(figs1..., 1)
-figs2 = fit_system(1, sys, DiffSystemD, 2, doplot=true); #savefigures(figs2..., 2)
+# figs2 = fit_system(1, sys, DiffSystemD, 2, doplot=true); #savefigures(figs2..., 2)
 # figs3 = fit_system(1, sys, VelSystemD, 2, doplot=false);  #savefigures(figs3..., 3)
 # figs4 = fit_system(1, sys, AffineSystem, doplot=true);  #savefigures(figs4..., 4)
 # error()
@@ -166,9 +166,9 @@ figs2 = fit_system(1, sys, DiffSystemD, 2, doplot=true); #savefigures(figs2...,
 ##
 
 res = map(1:num_montecarlo) do it
-    r1 = @spawn fit_system(it, sys, SystemD, 2)
-    r2 = @spawn fit_system(it, sys, DiffSystemD, 2)
-    r3 = @spawn fit_system(it, sys, VelSystemD, 2)
+    r1 = @spawn fit_system(it, sys, System)
+    r2 = @spawn fit_system(it, sys, DiffSystem)
+    r3 = @spawn fit_system(it, sys, VelSystem)
     r4 = @spawn fit_system(it, sys, System, 2)
     r5 = @spawn fit_system(it, sys, DiffSystem, 2)
     r6 = @spawn fit_system(it, sys, VelSystem, 2)
@@ -183,7 +183,7 @@ open(file->serialize(file, res), "res","w") # Save
 error()
 
 ##
-sleep(60*3)
+sleep(60*6)
 ##
 using StatPlots
 res = open(file->deserialize(file), "res") # Load
@@ -191,7 +191,7 @@ sys = res[1][1][1][:sys]
 nr = length(res[1]) ÷ 2
 labelvec = ["f" "g" "h"]
 infostring = @sprintf("Num hidden: %d, sigma: %2.2f, Montecarlo: %d", num_params, sys.σ0, num_montecarlo)
-
+##
 # pgfplots(size=(600,300), show=true)
 
 pred  = hcat([eval_pred.(get_res(res,i), true) for i ∈ 1:nr]...)
@@ -227,4 +227,4 @@ violin!((1:nr)',identity.(jacerrp), side=:right, xticks=(1:nr,labelvec), ylabel=
 # Weight decay on/off
 
 # Compare true vs ad-hoc bootstrap
-num_workers = 1; addprocs([(@sprintf("philon-%2.2d",i),num_workers) for i in [2:4; 6:10]]);addprocs([(@sprintf("ktesibios-%2.2d",i),num_workers) for i in 1:9]);addprocs([(@sprintf("heron-%2.2d",i),num_workers) for i in [1,2,3,4,6,11]])
+num_workers = 4; addprocs([(@sprintf("philon-%2.2d",i),num_workers) for i in [2:4; 6:10]]);addprocs([(@sprintf("ktesibios-%2.2d",i),num_workers) for i in 1:9]);addprocs([(@sprintf("heron-%2.2d",i),num_workers) for i in [1,2,3,4,6,11]])
diff --git a/flux/nn_prior/utilities.jl b/flux/nn_prior/utilities.jl
index dab3e0c..cb3bb20 100644
--- a/flux/nn_prior/utilities.jl
+++ b/flux/nn_prior/utilities.jl
@@ -11,7 +11,7 @@ function System(sys, num_params, activation)
     @unpack n,ns = sys
     ny = ns
     np = num_params
-    m  = Chain(Dense(ns+n,np, activation), Dropout(0.01), Dense(np,np, activation), Dropout(0.01), Dense(np,np, activation), Dropout(0.01), Dense(np,np, activation), Dropout(0.01), Dense(np, ny))
+    m  = Chain(Dense(ns+n,np, activation), Dense(np, ny))
     System(m, sys)
 end
 (m::System)(x) = m.m(x)
@@ -25,7 +25,7 @@ function DiffSystem(sys, num_params, activation)
     @unpack n,ns = sys
     ny = ns
     np = num_params
-    m  = Chain(Dense(ns+n,np, activation), Dropout(0.01), Dense(np,np, activation), Dropout(0.01), Dense(np,np, activation), Dropout(0.01), Dense(np,np, activation), Dropout(0.01), Dense(np, ny))
+    m  = Chain(Dense(ns+n,np, activation), Dense(np, ny))
     DiffSystem(m, sys)
 end
 (m::DiffSystem)(x) = m.m(x)+x[1:m.sys.ns,:]
@@ -39,7 +39,7 @@ function VelSystem(sys, num_params, activation)
     @unpack n,ns = sys
     ny = n
     np = num_params
-    m = Chain(Dense(ns+n,np, activation), Dropout(0.01), Dense(np,np, activation), Dropout(0.01), Dense(np, ny))
+    m = Chain(Dense(ns+n,np, activation),  Dense(np, ny))
     VelSystem(m, sys)
 end
 (m::VelSystem)(x) = m.m(x)
@@ -105,7 +105,6 @@ end
 =#
 
 loss(m::AbstractSystem) = (x,y) -> sum((m(x).-y).^2)/size(x,2)
-foreach(treelike, [SystemD, DiffSystemD, VelSystemD, System, DiffSystem, VelSystem])
 
 const AbstractEnsembleSystem = Vector{<:AbstractSystem}
 const EnsembleSystem = Vector{System}
@@ -209,7 +208,7 @@ function plot_jacobians(Jm, Js, Jtrue)
     # plot!([1, prod(size(Jm))], [0,0], l=:dash, c=:black, linewidth=3, title="Jacobian values and confidence bounds", lab="")
 end
 
-function plot_eigvals(results, eval=false)
+function LTVModels.plot_eigvals(results, eval=false)
     @unpack x,u,xv,uv,sys,modeltype = results[1]
     x,u = eval ? (xv,uv) : (x,u)
     N = size(x,2)
@@ -325,3 +324,7 @@ function savefig2(filename, opts=[])
     close(temp)
     mv(temppath, filename, remove_destination=true)
 end
+
+try
+foreach(treelike, [SystemD, DiffSystemD, VelSystemD, System, DiffSystem, VelSystem])
+end
diff --git a/jump_lin_id/bibtexfile.bib b/jump_lin_id/bibtexfile.bib
index 5036b3d..1de07dd 100644
--- a/jump_lin_id/bibtexfile.bib
+++ b/jump_lin_id/bibtexfile.bib
@@ -161,3 +161,13 @@
   year={2014},
   publisher={Now Publishers, Inc.}
 }
+
+@article{rauch1965maximum,
+  title={Maximum likelihood estimates of linear dynamic systems},
+  author={Rauch, Herbert E and Tung, F and Striebel, Charlotte T and others},
+  journal={AIAA journal},
+  volume={3},
+  number={8},
+  pages={1445--1450},
+  year={1965}
+}
diff --git a/jump_lin_id/id_paper.tex b/jump_lin_id/id_paper.tex
index 5f2896e..1260a5c 100644
--- a/jump_lin_id/id_paper.tex
+++ b/jump_lin_id/id_paper.tex
@@ -166,7 +166,7 @@ Identification of LTV Dynamical Models with\\ Smooth or Discontinuous Time Evolu
 % ---------------------------------------------------------
 \section{Introduction}\label{sec:introduction}
 
-The difficulty of the task of identifying time-varying dynamical models of systems varies greatly with the model considered and the availability of measurements of the state sequence. For smoothly changing, linear dynamics,\footnote{Linear dynamics in the sense of models that are linear in the parameters.} the recursive least-squares algorithm with exponential forgetting (RLS$\lambda$) is a common option. If a Gaussian random-walk model for the parameters is assumed, a Kalman filtering/smoothing algorithm gives the filtering/smoothing densities of the parameters in closed form. The assumption of smoothly (Gaussian) varying dynamics is often restrictive. Discontinuous dynamics changes occur, for instance, when an external controller changes operation mode, when a sudden contact between a robot and its environment is established, an unmodeled disturbance enters the system or when a system is suddenly damaged.
+The difficulty of the task of identifying time-varying dynamical models of systems varies greatly with the model considered and the availability of measurements of the state sequence. For smoothly changing, linear dynamics,\footnote{Linear dynamics in the sense of models that are linear in the parameters.} the recursive least-squares algorithm with exponential forgetting (RLS$\lambda$) is a common option. If a Gaussian random-walk model for the parameters is assumed, a Kalman filtering/smoothing algorithm \cite{rauch1965maximum} gives the filtering/smoothing densities of the parameters in closed form. The assumption of smoothly (Gaussian) varying dynamics is often restrictive. Discontinuous dynamics changes occur, for instance, when an external controller changes operation mode, when a sudden contact between a robot and its environment is established, an unmodeled disturbance enters the system or when a system is suddenly damaged.
 
 Identification of systems with non-smooth dynamics evolution has been studied extensively. The book~\cite{costa2006discrete} treats the case where the dynamics are known, but the state sequence unknown, i.e., state estimation. In~\cite{nagarajaiah2004time}, the authors examine the residuals from an initial constant dynamics fit to determine regions in time where improved fit is needed by the introduction of additional constant dynamics models. Results on identifiability and observability in jump-linear systems in the non-controlled (autonomous) setting are available in~\cite{vidal2002observability}. The main result on identifiability in \cite{vidal2002observability} was a rank condition on a Hankel matrix constructed from the collected output data, similar to classical results on the least-squares identification of ARX-models which appears as rank constraints on the, typically Toeplitz or block-Toeplitz, regressor matrix. Identifiability of the methods proposed in this article are discussed in~\cref{sec:identifiability}.
 
@@ -258,7 +258,7 @@ Also this optimization problem has a closed form solution on the form~\labelcref
 
 
 \subsection{Piecewise constant time evolution}\label{sec:pwconstant}
-In the presence of discontinuous or abrupt changes in the dynamics, estimation method~\labelcref{eq:smooth} might perform poorly. A signal which is mostly flat, with a small number of distinct level changes, is characterized by a \emph{sparse first-order time difference}. To detect sudden changes in dynamics, we thus formulate and solve the optimization problem
+In the presence of discontinuous or abrupt changes in the dynamics, estimation method~\labelcref{eq:smooth} might perform poorly. A signal which is mostly flat, with a small number of distinct level changes, is characterized by a \emph{sparse first-order time difference}. To detect sudden changes in dynamics, we thus formulate and solve the problem
 \begin{equation} \label{eq:pwconstant}
    \minimize{\w} \normt{\y-\hat{\y}}^2 + \lambda\sum_t \normt{ \w_{t+1} - \w_t}
 \end{equation}
@@ -325,6 +325,7 @@ The second step can be computed very efficiently by noticing that the problem ca
 
 To identify the points at which the dynamics change (knots), we observe the argument inside the sum of the regularization term, i.e., $a_{t1} = \normt{ \w_{t+1} - \w_t}$ or $a_{t2} = \normt{\w_{t+2} -2 \w_{t+1} + \w_t}$. Time instances where $a_t$ is taking non-zero values indicate change points.
 
+
 \section{Dynamics prior and Kalman filtering}
 \recmt{Reviewer found this section rushed}
 The identifiability of the parameters in a dynamical model hinges on the observability of the dynamics system~\labelcref{eq:dynsys}, or more explicitly, only modes excited by the input $u$ will be satisfactorily identified. If the identification is part of an iterative learning and control scheme, e.g., ILC or reinforcement learning, it might be undesirable to introduce additional noise in the input to improve excitation for identification. This section will introduce prior information about the dynamics which mitigates the issue of poor excitation of the system modes. The prior information might come from, e.g., a nominal model known to be inaccurate, or an estimated global model such as a Gaussian mixture model (GMM). A statistical model of the joint density $p(x_{t+1},x_t,u_t)$ constructed from previously collected tuples $(x_{t+1},x_t,u_t)$ provides a dynamical model of the system through the conditional pdf $p(x_{t+1}|x_t,u_t)$.
@@ -333,21 +334,22 @@ We will see that for priors from certain families, the resulting optimization pr
 
 \subsection{General case}\label{sec:general}
 
-If we introduce a parameter state $\w$ (c.f., \labelcref{eq:tvk}) and a prior over all parameter-state variables $p(\w_{t}|z_t)$, where the variable $z_t$ might be, for instance, the time index $t$ or state $x_t$, we have the data likelihood
-\begin{align}\label{eq:cdll}
-    p(\w,y|x,z)_{1:T} = \prod_{t=1}^T p(y_t|\w_t,x_t) \prod_{t=1}^{T-1} p(\w_{t+1}|\w_t) \prod_{t=1}^{T} p(\w_{t}|z_t)
-\end{align}
-which factors conveniently due to the Markov property of a state-space model. Taking the $\log$ of~\labelcref{eq:cdll}, we obtain
+If we introduce a parameter state $\w$ (c.f., \labelcref{eq:tvk}) and a prior over all parameter-state variables $p(\w_{t}|z_t)$, where the variable $z_t$ might be, for instance, the time index $t$ or state $x_t$, we have the data log-likelihood
+% \begin{align}\label{eq:cdll}
+%     p(\w,y|x,z)_{1:T} = \prod_{t=1}^T p(y_t|\w_t,x_t) \prod_{t=1}^{T-1} p(\w_{t+1}|\w_t) \prod_{t=1}^{T} p(\w_{t}|z_t)
+% \end{align}
+ % Taking the $\log$ of~\labelcref{eq:cdll}, we obtain
 \begin{equation}\label{eq:lcdll}
     \begin{split}
     \log p(\w,y|x,z)_{1:T} &= \sum_{t=1}^T \log p(y_t|\w_t,x_t) \\
     + \sum_{t=1}^{T-1} \log p(\w_{t+1}|\w_t) &+ \sum_{t=1}^{T} \log p(\w_{t}|z_t)
 \end{split}
 \end{equation}
-For particular choices of density functions in~\labelcref{eq:lcdll}, notably Gaussian and Laplacian, the resulting ML estimation problem becomes convex. The next section will elaborate on the Gaussian case and introduce a recursive algorithm that solves for the full posterior efficiently. The Laplacian case, while convex, does not admit an equally efficient algorithm, but is more robust to outliers in the data.
+which factors conveniently due to the Markov property of a state-space model.
+For particular choices of density functions in~\labelcref{eq:lcdll}, notably Gaussian and Laplacian, the negative likelihood function becomes convex. The next section will elaborate on the Gaussian case and introduce a recursive algorithm that solves for the full posterior efficiently. The Laplacian case, while convex, does not admit an equally efficient algorithm, but is more robust to outliers in the data.
 
 \subsection{Gaussian case} \label{sec:kalmanmodel}
-If all densities are Gaussian and $\w$ is modeled with the Brownian random walk model \labelcref{eq:dynsys} (Gaussian $v_t$), \labelcref{eq:lcdll} can be written on the form (constants omitted)
+If all densities in~\labelcref{eq:lcdll} are Gaussian and $\w$ is modeled with the Brownian random walk model \labelcref{eq:dynsys} (Gaussian $v_t$), \labelcref{eq:lcdll} can be written on the form (constants omitted)
 \begin{equation}
     \begin{split}\label{eq:prioropt}
     -\log p(\w,y|x,z)_{1:T} &=
@@ -358,7 +360,7 @@ If all densities are Gaussian and $\w$ is modeled with the Brownian random walk
 \end{equation}
 for some function $\mu_0(z_t)$ which produces the prior mean of $\w$ given $z_t$. $\Sigma_y, \Sigma_\w, \Sigma_0(z_t)$ are the covariance matrices of the state-drift, parameter drift and prior respectively and $\norm{x}_{\Sigma^{-1}}^2 = x\T\Sigma^{-1}x$.
 
-In this special case, we introduce a recursive solution given by a modified Kalman smoothing algorithm, where the conditional mean of the state is updated with the prior. Consider the standard Kalman filtering equations
+In this special case, we introduce a recursive solution given by a modified Kalman smoothing algorithm, where the conditional mean of the state is updated with the prior. Consider the standard Kalman filtering equations, reproduced here to establish the notation
     \begin{align}
         \hat{x}_{t|t-1} &= A \hat{x}_{t-1|t-1} + B u_{t-1} \\
         P_{t|t-1} &= A P_{t-1|t-1}A\T + R_1\\
@@ -381,7 +383,7 @@ A prior over the output of the system, or a subset thereof, is straight forward
 \subsection{Kalman filter for identification}
 We can employ the Kalman-based algorithm to solve two of the proposed optimization problems:
 \subsubsection{Low frequency}
-The Kalman filter can be used for solving identification problems like~\labelcref{eq:slow} by noting that~\labelcref{eq:slow} is the negative log-likelihood of the dynamics model~\labelcref{eq:dynsys}.
+The Kalman smoother can be used for solving identification problems like~\labelcref{eq:slow} by noting that~\labelcref{eq:slow} is the negative log-likelihood of the dynamics model~\labelcref{eq:dynsys}. The identification problem is thus reduced to a standard state-estimation problem.
 \subsubsection{Smooth}\label{sec:smoothkalman}
 
 To develop a Kalman-filter based algorithm for solving~\labelcref{eq:smooth}, we augment the model \labelcref{eq:dynsys} with the state variable $k^\prime_t = k_{t} - k_{t-1}$ and note that $k^\prime_{t+1} - k^\prime_t = k_{t+1} - 2k_t + k_{t-1}$. We thus introduce the augmented-state model
@@ -392,7 +394,7 @@ To develop a Kalman-filter based algorithm for solving~\labelcref{eq:smooth}, we
 which is on a form suitable for filtering/smoothing with the machinery developed above.
 
 \subsubsection{General case}
-The Kalman-filter based identification methods can be generalized to solving optimization problems where the argument in the regularizer appearing in \labelcref{eq:smooth} is replaced for an general linear operation on the parameter vector, $P(z)k$, and we have the following proposition
+The Kalman-filter based identification method can be generalized to solving optimization problems where the argument in the regularizer appearing in \labelcref{eq:smooth} is replaced by a general linear operation on the parameter vector, $P(z)k$, and we have the following proposition
 \begin{proposition}
     Any optimization problem on the form
     \begin{equation} \label{eq:generalkalman}
@@ -402,9 +404,9 @@ The Kalman-filter based identification methods can be generalized to solving opt
 \end{proposition}
 \begin{proof}
     Let $P^*(z^{-1}) = z^{-n} P(z)$. We assume without loss of generality that $P^*(0) = 1$ since any constant $P^*(0)$ can be factored out of the polynomial. $Q(z^{-1}) = P^*(z^{-1})^{-1}$ is a strictly proper transfer function and has a realization as a linear, Gaussian state-space system of degree $n$. Since $Q(z^{-1})$ is strictly proper, the realization has no direct term.
-    The negative data log-likelihood of $Q(z^{-1})$ is equal to the cost function in~\cref{eq:generalkalman}, hence the Kalman smoother applied to $Q$ optimizes \cref{eq:generalkalman}.
+    The negative data log-likelihood of $Q(z^{-1})$ is equal, up to constants idenpendent of $\w$, to the cost function in~\cref{eq:generalkalman}, hence the Kalman smoother applied to $Q$ optimizes \cref{eq:generalkalman}.
 \end{proof}
-For~\labelcref{eq:smooth} $P(z)$ equals $z^2 - 2z + 1 = (z-1)^2$, and augmenting the dynamical model describing the parameter evolution with a state corresponding to the polynomial $Q(z) = (z-1)^{-1}P(z)$.
+For~\labelcref{eq:smooth} $P(z)$ equals $z^2 - 2z + 1$ and $Q(z^{-1})$ has a realization on the form~\labelcref{eq:dynsys2}.
 
 
 
@@ -431,34 +433,34 @@ in $k$ are constant over time, and the problem reduces
 to the LTI-problem. As $\lambda$ decreases from infinity
 the effective number of free parameters increases until the
 problem gets ill-posed for $\lambda = 0$.
-
-Problems \labelcref{eq:slow,eq:smooth} can both be
-written on the form~\labelcref{eq:closedform}, for orders 2
-and 1 of the differentiation operator $D$ respectively.
-The problem of well-posedness thus reduces to determining
-the rank of the Hessian matrix
-$\tilde{\A}\T \tilde{\A} + D\T D$, where
-$\operatorname{rank}(D_2\T D_2) = K(T-2)$ respectively
-$\operatorname{rank}(D_1\T D_1) = K(T-1)$ . The matrix $\tilde{\A}\T \tilde{\A} + D\T D$
-will thus be full rank whenever $\tilde{\A}$ has rank
-$2K$ (K) or more, with a basis spanning the rank-$2K$ (K) null
-space of $D_2\T D_2$ ($D_1\T D_1$). This will be the case for any non-zero
-state and input sequences of sufficient length, provided that the input, $u$,
-is persistently exciting of sufficient order~\cite{johansson1993system} and that the system is observable.
-\cmt{If $v$ is in both the nullspace of $D_1\T D_1$, and the nullspace of $\Phi\T\Phi$ we must have that the original system is invariant to an offset $k^+ + v = H(k+v), \quad y = C(k+v)$, what does this say about $C$?}
-
-Another strategy is to analyze the observability of the dynamical systems~\labelcref{eq:dynsys,eq:dynsys2},
-which will yield the desired results for problems~\labelcref{eq:slow,eq:smooth}.
-
 We formalize the above arguments as
 \begin{proposition}
-    Optimization problems have unique global minima provided that $A$ and $u$...
+    Optimization problems \labelcref{eq:slow,eq:smooth,eq:pwconstant,eq:pwlinear} have unique global minima for $\lambda > 0$ if and only if the corresponding LTI optimization problem has a unique solution.
 \end{proposition}
 \begin{proof}
     The cost function is a sum of two convex terms. For the global minimum to be non-unique, the Hessians of the two terms must share nullspace.
-    Let $k$ be a minimizer of $\lambda^2\sum_t \normt{\w_{t+1} - \w_{t}}^2$, then there exist a basis of dimension $K$ spanning the nullspace of $D_1\T D_1$, $\mathcal{N}_D$. Let $v$ be a vector in $\mathcal{N}_D$, for $\Phi\T \Phi$ to share nullspace with $D$, we must have $\Phi\T\Phi v = 0$ or equivalently $y_t = Ck_t = C(k_t+v_t) \Longrightarrow Cv_t = 0$
+    In the limit $\lambda \rightarrow \infty$ the problem reduces to the LTI problem. The nullspace of the regularization Hessian, which is invariant to $\lambda$, does thus not share any directions with the nullspace of $\tilde{\A}\T \tilde{\A}$ which establishes the equivalence of identifiability between the LTI problem and the LTV problems.
 \end{proof}
 
+For the LTI problem to be well-posed, the system must be identifiable and the input $u$ must be persistently exciting of sufficient order~\cite{johansson1993system}.
+
+% Further insight into the properties
+% Problems \labelcref{eq:slow,eq:smooth} can both be
+% written on the form~\labelcref{eq:closedform}, for orders 2
+% and 1 of the differentiation operator $D$ respectively.
+% The problem of well-posedness thus reduces to determining
+% the rank of the Hessian matrix
+% $\tilde{\A}\T \tilde{\A} + D\T D$, where
+% $\operatorname{rank}(D_2\T D_2) = K(T-2)$ respectively
+% $\operatorname{rank}(D_1\T D_1) = K(T-1)$ . The matrix $\tilde{\A}\T \tilde{\A} + D\T D$
+% will thus be full rank whenever $\tilde{\A}$ has rank
+% $2K$ (K) or more, with a basis spanning the rank-$2K$ (K) null
+% space of $D_2\T D_2$ ($D_1\T D_1$). This will be the case for any non-zero
+% state and input sequences of sufficient length, provided that the input, $u$,
+% is persistently exciting of sufficient order~\cite{johansson1993system} and that the system is observable.
+
+
+
 \section{Example}
 \cmt{Add two link example where a stiff contact is established and there is a sign change in control signal and hence friction. Apply pwconstant method and maybe scrap linear system example. Fig 3 can be reproduced with analytical jacobian as comparison. Mention in abstract that we then evaluate on one non-linear but smooth system (pendcart) and one non-smooth system (twolink).}
 \recmt{Only linear systems considered (pend-cart actually not linear.} \cmt{Highligt that pendcart is nonlinear or introduce simple robotic example. Mention early that an LTV system along a trajectory does not generalize far for a nonlinear system, hence the use of KL constraint.}
@@ -528,13 +530,14 @@ reinforcement learning. The goal of the task is to dampen oscillations of a pend
 cart by means of moving the cart, with bounds on the control signal and a quadratic
 cost on states and control. Due to the nonlinear nature of the pendulum dynamics,
 linear expansions of the dynamics in the upward (initial) position and downward (final) position have poles
-on opposite sides of the imaginary axis. To this end, we perform a series of experiments,
-whereafter each we 1) fit a dynamics model along the last obtained trajectory,
+on opposite sides of the imaginary axis. To this end, we employ a reinforcement-learning framework inspired by~\cite{levine2013guided}, where we perform a series of rollouts whereafter each we
+1) fit a dynamics model along the last obtained trajectory,
 2) optimize the cost function under
-the model using differential dynamic programming,\footnote{Implementation made available at
-\href{github.com/baggepinnen/DifferentialDynamicProgramming.jl}{github.com/baggepinnen/DifferentialDynamicProgramming.jl}}
-with bounds on the deviation between the new trajectory and the last trajectory
-in order to stay close to the validity region of the model. This reinforcement-learning strategy has shown great promise lately and is part of the guided policy search framework \cite{levine2013guided} and has similarities to trust-region policy optimization~\cite{schulman2015trust}. We compare three different models,
+the model using iterative LQG (differential dynamic programming),\footnote{Implementation made available at
+\href{github.com/baggepinnen/DifferentialDynamicProgramming.jl}{github.com/baggepinnen/DifferentialDynamicProgramming.jl}} an algorithm that calculates the value function exactly under the LTV dynamics and a quadratic expansion of the cost function.
+In order to stay close to the validity region of the linear model, we put bounds on the deviation between each new trajectory and the last trajectory.
+% This reinforcement-learning strategy has shown great promise lately and is part of the guided policy search framework \cite{levine2013guided} and has similarities to trust-region policy optimization~\cite{schulman2015trust}.
+We compare three different models,
 the ground truth system model, an LTV model (obtained by solving \labelcref{eq:smooth}) and an LTI model.
 The total cost over $T=500$ time steps is shown as a function of learning iteration
 in~\cref{fig:ilc}. The figure illustrates how the learning procedure reaches the
@@ -548,7 +551,7 @@ the LTV model is fit using a prior (\cref{sec:kalmanmodel}), the learning speed
     \setlength{\figurewidth}{0.99\linewidth}
     \setlength{\figureheight }{5cm}
     \input{figs/ilc.tex}
-    \caption{Reinforcement learning example. Three different model types are used to iteratively optimize the trajectory of a pendulum on a cart. Due to the nonlinear nature of the pendulum dynamics, linear expansions of the dynamics in the upward and downward positions have poles on opposite sides of the imaginary axis.}
+    \caption{Reinforcement learning example. Three different model types are used to iteratively optimize the trajectory of a pendulum on a cart. Due to the nonlinear nature of the pendulum dynamics, linear expansions of the dynamics in the upward and downward positions have poles on opposite sides of the imaginary axis, why the algorithm fails with an LTI model.}
     \label{fig:ilc}
 \end{figure}
 
@@ -579,8 +582,6 @@ the LTV model is fit using a prior (\cref{sec:kalmanmodel}), the learning speed
 
 
 \section{Discussion}
-\recmt{Some paragraphs hard to follow. Linear operator extension can be moved to theory section. } \recmt{Shorten section and reorganize, omit tf eq} \recmt{Rewrite first two sentences}
-
 % This article is written focused on state-space models, where the state sequence
 % is assumed to be known. This may seem like an overly optimistic setting at first,
 % but all methods and algorithm discussed apply equally well to the input-output
@@ -596,7 +597,9 @@ the LTV model is fit using a prior (\cref{sec:kalmanmodel}), the learning speed
 % linear given the input-output sequences $u$ and $y$. Extension to time-varying
 % dynamics is straight forward.
 
-This article presents methods for estimation of linear, time-varying models. When estimating an LTV model from a trajectory obtained from a nonlinear system, one is effectively estimating the linearization of the system around that trajectory. A first order approximation to a nonlinear system is not guaranteed to generalize well as deviations from the trajectory becomes large. For certain methods, such as iterative learning control and trajectory centric reinforcement learning, a first order approximation to the dynamics is used for efficient optimization, while the validity of the approximation is ensured by incorporating penalties or constraints between two consecutive trajectories. The methods presented allow very efficient learning of this first order approximation due to the prior belief over the nature of the change in dynamics parameters, encoded by the regularization terms. By postulating a prior belief that the dynamics parameters change in a certain way, less demand is put on the data required for identification. The identification process will thus not interfere with operation in the same way as if excessive noise would be added to the input for identification purposes. This allows learning of flexible, over-parametrized models that fit available data well. This makes the proposed identification methods attractive in applications such as guided policy search (GPS)~\cite{levine2013guided, levine2015learning} and non-linear iterative learning control (ILC)~\cite{bristow2006survey}, where they can lead to dramatically decreased sample complexity.
+This article presents methods for estimation of linear, time-varying models. When estimating an LTV model from a trajectory obtained from a nonlinear system, one is effectively estimating the linearization of the system around that trajectory. A first order approximation to a nonlinear system is not guaranteed to generalize well as deviations from the trajectory becomes large. Many non-linear systems are, however, approximately \emph{locally} linear, such that they are well described by a linear model in a small neighborhood around the linearization/operating point. For certain methods, such as iterative learning control and trajectory centric reinforcement learning, a first order approximation to the dynamics is used for efficient optimization, while the validity of the approximation is ensured by incorporating penalties or constraints between two consecutive trajectories.
+
+The methods presented allow very efficient learning of this first order approximation due to the prior belief over the nature of the change in dynamics parameters, encoded by the regularization terms. By postulating a prior belief that the dynamics parameters change in a certain way, less demand is put on the data required for identification. The identification process will thus not interfere with normal operation in the same way as if excessive noise would be added to the input for identification purposes. This allows learning of flexible, over-parametrized models that fit available data well. This makes the proposed identification methods attractive in applications such as guided policy search (GPS)~\cite{levine2013guided, levine2015learning} and non-linear iterative learning control (ILC)~\cite{bristow2006survey}, where they can lead to dramatically decreased sample complexity.
 % \addtolength{\textheight}{-7cm}
 
 When faced with a system where time-varying dynamics is suspected and no particular
@@ -625,7 +628,7 @@ This allows incorporation of different scales for different variables with littl
 
 
 
-The drawbacks of LTV models include situations where the observed time variation comes from non-linear effects. Non-linearities in the system may cause an apparent time-variation as the state varies. While an LTV-model can capture this variation, it will fail to generalize well to a subsequent experiment where the states take a significantly different trajectory. Many non-linear systems are, however, approximately \emph{locally} linear, such that they are well described by a linear model in a small neighborhood around the linearization/operating point. LTV models can thus be efficiently used for model-based reinforcement learning with variants of trust-region based optimization~\cite{schulman2015trust}.
+% The drawbacks of LTV models include situations where the observed time variation comes from non-linear effects. Non-linearities in the system may cause an apparent time-variation as the state varies. While an LTV-model can capture this variation, it will fail to generalize well to a subsequent experiment where the states take a significantly different trajectory. Many non-linear systems are, however, approximately \emph{locally} linear, such that they are well described by a linear model in a small neighborhood around the linearization/operating point. LTV models can thus be efficiently used for model-based reinforcement learning with variants of trust-region based optimization~\cite{schulman2015trust}.
 
 
 
-- 
GitLab