From 6c1be1f8a10dcafec1092832370452bad8c1d7bd Mon Sep 17 00:00:00 2001 From: baggepinnen <cont-frb@ulund.org> Date: Wed, 21 Mar 2018 15:21:47 +0100 Subject: [PATCH] work on nn prior --- flux/nn_prior/mutual.jl | 157 ++++++++++++++++++++++++++++++++++++++ flux/nn_prior/nn_prior.jl | 57 +++++++------- jump_lin_id/id_paper.tex | 113 +++++++-------------------- 3 files changed, 214 insertions(+), 113 deletions(-) create mode 100644 flux/nn_prior/mutual.jl diff --git a/flux/nn_prior/mutual.jl b/flux/nn_prior/mutual.jl new file mode 100644 index 0000000..851e5e9 --- /dev/null +++ b/flux/nn_prior/mutual.jl @@ -0,0 +1,157 @@ +cd(@__DIR__) +# import Iterators +using OrdinaryDiffEq, LTVModels +@everywhere begin + using Flux, ValueHistories, IterTools, MLDataUtils, OrdinaryDiffEq, Parameters + using Flux: back!, truncate!, treelike, train!, mse, testmode!, params, jacobian + using Flux.Optimise: Param, optimiser, RMSProp, expdecay + num_params = 20 + stepsize = 0.05 + n_bootstrap = 4 + num_montecarlo = 30 +end + +inoffice() = gethostname() ∈ ["billman", "Battostation"] +inoffice() && !isdefined(:time_derivative) && include(Pkg.dir("DynamicMovementPrimitives","src","two_link.jl")) +if !isdefined(:AbstractEnsembleSystem) + inoffice() || @everywhere include("/var/tmp/fredrikb/v0.6/DynamicMovementPrimitives/src/two_link.jl") + @everywhere include("utilities.jl") + @everywhere include("two_link_sys.jl") + @everywhere include("linear_sys.jl") +end + +sys = TwoLinkSys(N=1000, h=0.02, σ0 = 0.1)#; warn("Added noise") +# sys = LinearSys(1,N=100, h=0.2, σ0 = 1, n = 10, ns = 10) + +@everywhere function fit_model(opt, loss, m, x, y, u, sys, modeltype; + iters = 2000, + doplot = true, + batch_size = 50) + + trace = History(Float64) + batcher = batchview(shuffleobs((x,y)), batch_size) + dataset = ncycle(batcher, iters) + iter = 0 + plt = plot() + phi = linspace(0,2pi,300) + function evalcallback() + iter += 1 + testmode!(m) + l = loss(x, y).tracker.data[1] + push!(trace,iter,l) + if iter % 10 == 0 + if doplot + println("Iter: $iter, Loss: ", l) + # plot(trace,reuse=true,show=false, lab="Train", layout=3, subplot=1, size=(1400,1000)) + # plot!(y', subplot=2:3, lab="y") + # plot!(m(x).data', l=:dash, subplot=2:3, show=true, lab="Pred") + n = 4 + e = eigvals(jacobian(m, x[:,end÷2])[1:n,1:n]) + error("Show true jacobian") + # e = log.(Complex.(e))/sys.h + plot!(real.(exp.(phi.*im)), imag.(exp.(phi.*im)), legend=false, c=:black, l=:dash, xlims=(-5,2)) + scatter!(real.(e), imag.(e), c=:blue, reuse=true) + # histogram(abs.(e), reuse=true, bins=0:0.1:1.1) + update_plot(plt[1], max_history=10, attribute=:markercolor) + end + end + testmode!(m, false) + Flux.reset!(m) + end + train!(loss, dataset, opt, cb = evalcallback) + results = Dict{Symbol, Any}() + @pack results = m, trace, modeltype, sys + results +end + + + +@everywhere function mutual_reg(seed, sys, modeltype; jacprop = 1, doplot=false, wdecay=0, outer_iters=8) + activations = [swish, Flux.sigmoid, tanh, elu] + models = [modeltype(sys, num_params, activations[i]) for i in 1:length(activations)] + + + outer_results = pmap(1:outer_iters) do i + x,y,u = generate_data(sys, modeltype, seed+i) + iters = modeltype==System ? 
2000 : 1000 + + iters = round(Int, iters / sqrt(jacprop+1)) + n,m = sys.ns,sys.n + R1,R2 = 0.01eye(n*(n+m)), 100eye(n) # Fast + P0 = 10000R1 + model = LTVModels.fit_model(LTVModels.KalmanModel, x,u,R1,R2,P0, extend=true) + xt,ut,yt = [x;u], u, y + for i = 1:jacprop + xa = x .+ std(x,2)/10 .* randn(size(x)) + ua = u .+ std(u,2)/10 .* randn(size(u)) + ya = LTVModels.predict(model, xa,ua) + xt = [xt [xa;ua]] + ut = [ut ua] + yt = modeltype ∈ [VelSystem, VelSystemD] ? [yt ya[3:4,:]] : [yt ya] + end + x = [x;u] + + + results = map(1:n_bootstrap) do it + # global x, y, xv, yv + opt = [ADAM(params(models[it]), stepsize, decay=0.005); [expdecay(Param(p), wdecay) for p in params(models[it]) if p isa AbstractMatrix]] + results = fit_model(opt, loss(models[it]), models[it], xt, yt, ut, sys,modeltype, iters=iters, doplot=doplot, batch_size = size(yt,2)) + @pack results = x, u, y + println("Done: ", it) + results + end + Jm, Js, Jtrue = all_jacobians(results) + Jltv = cat(2,model.At, model.Bt) + @pack results = Jm, Js, Jtrue, Jltv + # if doplot + # fig_jac = plot_jacobians(Jm, Js, Jtrue) + # fig_time = plotresults(results) + # fig_eig = plot_eigvals(results) + # fig_jac, fig_time, fig_eig + # @pack results = fig_jac, fig_time, fig_eig + # end + results + end # Outer iter + outer_results[:models] = models +end + + +# Produce figures +# pyplot(reuse=false, show=false) +opts = Dict(:outer_iters=>10, :wdecay=>0.01, :doplot=>true, :jacprop=>1) +figs1 = mutual_reg(1, sys, System; opts...) +figs2 = mutual_reg(1, sys, DiffSystem; opts...) +figs3 = mutual_reg(1, sys, VelSystem; opts...) +# error() + +## + +# +# jacerrtrain = hcat([eval_jac.(get_res(res,i), false) for i ∈ 1:nr]...) +# jacerrval = hcat([eval_jac.(get_res(res,i), true) for i ∈ 1:nr]...) +# violin(log10.(jacerrtrain), side=:left, xticks=(1:nr,labelvec), title="Jacobian error", ylabel="log(norm of Jacobian error)", reuse=true, lab=["Train" "" ""], c=:red) +# violin!((1:nr)',log10.(jacerrval), side=:right, xticks=(1:nr,labelvec), ylabel="log(norm of Jacobian error)", reuse=true, lab=["Validation" "" ""], c=:blue); gui() +# # savefig2("/local/home/fredrikb/papers/nn_prior/figs/jacerr.tex") +## + + +## + +using StatPlots + +sys = res[1][1][1][:sys] +nr = length(res[1]) ÷ 2 +labelvec = ["f" "g" "h"] +infostring = @sprintf("Num hidden: %d, sigma: %2.2f, Montecarlo: %d", num_params, sys.σ0, num_montecarlo) +## +# pgfplots(size=(600,300), show=true) + + +# jacerrtrain = hcat([eval_jac.(get_res(res,i), false) for i ∈ 1:nr]...) +jacerr = hcat([eval_jac.(get_res(res,i), true) for i ∈ 1:nr]...) +jacerrp = hcat([eval_jac.(get_res(res,i), true) for i ∈ nr+1:2nr]...) 
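# ------------------------------------------------------------------
# Editor's note (added comment, not in the original commit): the
# violins below compare, per model class in labelvec, the validation
# Jacobian error of standard training (jacerr, left halves) with
# training under Jacobian propagation (jacerrp, right halves), both
# extracted from the montecarlo results in `res` via `get_res` and
# `eval_jac` defined elsewhere in this repo.
# ------------------------------------------------------------------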
+violin(identity.(jacerr), side=:left, xticks=(1:nr,labelvec), title="Jacobian error (validation data)"*infostring, ylabel="norm of Jacobian error", reuse=true, lab=["Standard" "" ""], c=:red) +violin!((1:nr)',identity.(jacerrp), side=:right, xticks=(1:nr,labelvec), ylabel="Jacobian RMS", reuse=true, lab=["Jacobian Propagation" "" ""], c=:blue); gui() +# savefig2("/local/home/fredrikb/papers/nn_prior/figs/jacerr_mutual.tex") + +## diff --git a/flux/nn_prior/nn_prior.jl b/flux/nn_prior/nn_prior.jl index 9b30691..7c019d9 100644 --- a/flux/nn_prior/nn_prior.jl +++ b/flux/nn_prior/nn_prior.jl @@ -3,7 +3,7 @@ cd(@__DIR__) using OrdinaryDiffEq, LTVModels @everywhere begin using Flux, ValueHistories, IterTools, MLDataUtils, OrdinaryDiffEq, Parameters - using Flux: back!, truncate!, treelike, train!, mse, testmode!, combine, params, jacobian + using Flux: back!, truncate!, treelike, train!, mse, testmode!, params, jacobian using Flux.Optimise: Param, optimiser, RMSProp, expdecay wdecay = 0.0 num_params = 20 @@ -34,23 +34,31 @@ sys = TwoLinkSys(N=1000, h=0.02, σ0 = 0.1)#; warn("Added noise") batcher = batchview(shuffleobs((x,y)), batch_size) dataset = ncycle(batcher, iters) iter = 0 + plot() function evalcallback() iter += 1 testmode!(m) - l = loss(x, y).data[1] + l = loss(x, y).tracker.data[1] push!(trace,iter,l) - push!(vtrace, iter, loss(xv,yv).data[1]) - if iter % 50 == 0 + push!(vtrace, iter, loss(xv,yv).tracker.data[1]) + if iter % 5 == 0 if doplot println("Iter: $iter, Loss: ", l) - plot(trace,reuse=true,show=false, lab="Train", layout=3, subplot=1, size=(1400,1000)) - plot!(vtrace,show=false, lab="Validation", subplot=1, yscale=:log10) - plot!(y', subplot=2:3, lab="y") - plot!(m(x).data', l=:dash, subplot=2:3, show=true, lab="Pred") + # plot(trace,reuse=true,show=false, lab="Train", layout=3, subplot=1, size=(1400,1000)) + # plot!(vtrace,show=false, lab="Validation", subplot=1, yscale=:log10) + # plot!(y', subplot=2:3, lab="y") + # plot!(m(x).data', l=:dash, subplot=2:3, show=true, lab="Pred") + n = 4 + e = eigvals(jacobian(m, x[:,end÷2])[1:n,1:n]) + e = log.(Complex.(e))/sys.h + scatter!(real.(e), imag.(e), c=:blue, reuse=true) + # phi = linspace(0,2pi,300) + # plot!(real.(exp.(phi.*im)), imag.(exp.(phi.*im)), legend=false, c=:black, l=:dash) + # histogram(abs.(e), reuse=true, bins=0:0.1:1.1) end end testmode!(m, false) - truncate!(m) + Flux.reset!(m) end train!(loss, dataset, opt, cb = evalcallback) results = Dict{Symbol, Any}() @@ -61,14 +69,14 @@ end @everywhere function fit_system(seed, sys, modeltype, jacprop = 0; doplot=false) - x,y,u = generate_data(sys, modeltype, seed) + x,y,u = generate_data(sys, modeltype, seed) iters = modeltype==System ? 
2000 : 1000 if jacprop > 0 - iters = round(Int, iters / sqrt(jacprop+1)) - n,m = sys.ns,sys.n - R1,R2 = 0.01eye(n*(n+m)), 100eye(n) # Fast - P0 = 10000R1 - model = LTVModels.fit_model(LTVModels.KalmanModel, x,u,R1,R2,P0, extend=true) + iters = round(Int, iters / sqrt(jacprop+1)) + n,m = sys.ns,sys.n + R1,R2 = 0.01eye(n*(n+m)), 100eye(n) # Fast + P0 = 10000R1 + model = LTVModels.fit_model(LTVModels.KalmanModel, x,u,R1,R2,P0, extend=true) xt,ut,yt = [x;u], u, y for i = 1:jacprop xa = x .+ std(x,2)/10 .* randn(size(x)) @@ -109,19 +117,12 @@ end end -function savefigures(f1,f2,f3,n) - savefig(f1,"/local/home/fredrikb/papers/nn_prior/figs/jacs$(n).pdf") - savefig(f2,"/local/home/fredrikb/papers/nn_prior/figs/timeseries$(n).pdf") - savefig(f3,"/local/home/fredrikb/papers/nn_prior/figs/eigvals$(n).pdf") -end - - # Produce figures # pyplot(reuse=false, show=false) -# figs1 = fit_system(1, sys, SystemD, 4, doplot=false); #savefigures(figs1..., 1) -# figs2 = fit_system(1, sys, DiffSystemD, 2, doplot=true); #savefigures(figs2..., 2) -# figs3 = fit_system(1, sys, VelSystemD, 2, doplot=false); #savefigures(figs3..., 3) -# figs4 = fit_system(1, sys, AffineSystem, doplot=true); #savefigures(figs4..., 4) +# figs1 = fit_system(1, sys, System, 2, doplot=true) +figs2 = fit_system(1, sys, DiffSystemD, 2, doplot=true) +# figs3 = fit_system(1, sys, VelSystem, 2, doplot=false) +# figs4 = fit_system(1, sys, AffineSystem, doplot=true) # error() ## @@ -228,3 +229,7 @@ violin!((1:nr)',identity.(jacerrp), side=:right, xticks=(1:nr,labelvec), ylabel= # Compare true vs ad-hoc bootstrap num_workers = 4; addprocs([(@sprintf("philon-%2.2d",i),num_workers) for i in [2:4; 6:10]]);addprocs([(@sprintf("ktesibios-%2.2d",i),num_workers) for i in 1:9]);addprocs([(@sprintf("heron-%2.2d",i),num_workers) for i in [1,2,3,4,6,11]]) + +Pkg.update() +include(joinpath(JULIA_HOME, Base.DATAROOTDIR, "julia", "build_sysimg.jl")) +build_sysimg(default_sysimg_path(), "native", "userimage.jl", force=true) diff --git a/jump_lin_id/id_paper.tex b/jump_lin_id/id_paper.tex index c0c759a..8c740e4 100644 --- a/jump_lin_id/id_paper.tex +++ b/jump_lin_id/id_paper.tex @@ -103,7 +103,7 @@ \newcommand{\fourier}[2]{\mathcal{F}_{#1}\big(#2\big)} -\linespread{0.994} +% \linespread{0.993} %\addtolength{\abovedisplayskip}{-0.5mm} %\addtolength{\belowdisplayskip}{-0.5mm} @@ -118,7 +118,7 @@ Identification of LTV Dynamical Models with\\ Smooth or Discontinuous Time Evolu \author{ \centering Fredrik Bagge Carlson* \quad Anders Robertsson \quad Rolf Johansson -\thanks{*Open-source implementations of all presented methods and examples in this paper are made available at \href{github.com/baggepinnen/LTVModels.jl}{github.com/baggepinnen/LTVModels.jl} (Will be made available in advance of paper publication). The reported research was supported by the European Commission under the Framework Programme Horizon 2020 under grant agreement 644938 SARAFun. The authors are members of the LCCC Linnaeus Center and the eLLIIT Excellence Center at Lund University, Dept Automatic Control, Lund Sweden.\protect\\ +\thanks{*Open-source implementations of all presented methods and examples in this paper are made available at \href{github.com/baggepinnen/LTVModels.jl}{github.com/baggepinnen/LTVModels.jl}. The reported research was supported by the European Commission under the Framework Programme Horizon 2020 under grant agreement 644938 SARAFun. 
The authors are members of the LCCC Linnaeus Center and the eLLIIT Excellence Center at Lund University, Dept.\ Automatic Control, Lund, Sweden.\protect\\
{Fredrik.Bagge\_Carlson@control.lth.se}}
}
@@ -166,7 +166,7 @@ The difficulty of the task of identifying time-varying dynamical models of syste
 Identification of systems with non-smooth dynamics evolution has been studied extensively. The book~\cite{costa2006discrete} treats the case where the dynamics are known, but the state sequence unknown, i.e., state estimation. In~\cite{nagarajaiah2004time}, the authors examine the residuals from an initial constant dynamics fit to determine regions in time where improved fit is needed by the introduction of additional constant dynamics models. Results on identifiability and observability in jump-linear systems in the non-controlled (autonomous) setting are available in~\cite{vidal2002observability}. The main result on identifiability in \cite{vidal2002observability} was a rank condition on a Hankel matrix constructed from the collected output data, similar to classical results on the least-squares identification of ARX models which appear as rank constraints on the, typically Toeplitz or block-Toeplitz, regressor matrix. Identifiability of the methods proposed in this article is discussed in~\cref{sec:identifiability}.
-An LTV model can be seen as a first-order approximation of the dynamics of a nonlinear system around a trajectory. We emphasize that such an approximation will in general fail to generalize far from the this trajectory, but many methods in reinforcement learning and control make efficient use of the linearized dynamics for optimization, while ensuring validity of the approximation by constraints or penalty terms. An example provided in \cref{sec:rl} highlights such a method.
+An LTV model can be seen as a first-order approximation of the dynamics of a nonlinear system around a trajectory. We emphasize that such an approximation will in general fail to generalize far from this trajectory, but many methods in reinforcement learning and control make efficient use of the linearized dynamics for optimization, while ensuring validity of the approximation by constraints or penalty terms. An example provided in \cref{sec:rl} highlights such a method.
 An important class of identification methods that has been popularized lately is \emph{trend filtering} methods~\cite{kim2009ell_1, tibshirani2014adaptive}. Trend filtering methods work by specifying a \emph{fitness criterion} that determines the goodness of fit, as well as a \emph{regularization} term, often chosen with sparsity promoting qualities. As a simple example, consider the reconstruction $\hat y$ of a noisy signal $y = \{y_t\inspace{}\}_{t=1}^T$ with piecewise constant segments. To this end, we may formulate and solve the convex optimization problem
 \begin{equation} \label{eq:tf}
@@ -222,7 +222,7 @@ where the parameters $\w$ are assumed to evolve according to the dynamical syste
 y_t &= \big(I_n \otimes \bmatrixx{x_t\T & u_t\T}\big) \w_t
 \end{split}
 \end{equation}
-where, if no prior knowledge is available, the dynamics matrix $H_t$ can be taken as the identity matrix; $H = I$ implies that the model coefficients follow a random walk dictated by the properties of $w_t$, i.e., the state transition density function $p_w(k_{t+1}|k_t)$. The emission density function $p_v(x_{t+1} | x_t, u_t, k_t)$ is determining the drift of the state, which for the parameter estimation problem can be seen as the distribution of measurements, given the current state of the system.
+where, if no prior knowledge is available, the dynamics matrix $H_t$ can be taken as the identity matrix; $H = I$ implies that the model coefficients follow a random walk dictated by the properties of $w_t$, i.e., the state transition density function $p_w(k_{t+1}|k_t)$. The emission density function $p_v(x_{t+1} | x_t, u_t, k_t)$ determines the drift of the state, which for the parameter estimation problem can be seen as the distribution of measurements ($y_t = x_{t+1}$), given the current state of the system.
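As an editor's illustration (not from the paper; the function names are hypothetical and Julia 0.6-era Base is assumed), the measurement equation above translates directly into code, with the parameter vector $\w_t$ stacking the rows of $[A_t \; B_t]$:

# Regressor of the measurement equation y_t = (I_n ⊗ [x_tᵀ u_tᵀ]) w_t
regressor(x, u, n) = kron(eye(n), [x' u'])       # size n × n(n+m)
# Predicted output for a given parameter vector w = w_t
prediction(x, u, w, n) = regressor(x, u, n) * w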
 We emphasize here that the state in the parameter evolution model refers to the current parameters $k_t$ and not the system state $x_t$, hence, $p_v$ is called the emission density and not the transition density.
 Particular choices of $p_v$ and $p_w$ emit data likelihoods concave in the parameters and hence amenable to convex optimization.
@@ -281,19 +281,11 @@ If the number of switches in dynamics parameters, $M$, is known in advance, the
 where $\textbf{1}\{\cdot\}$ is the indicator function. This problem is non-convex and we propose solving it using dynamic programming (DP). For this purpose we modify the algorithm developed in \cite{bellman1961approximation}, an algorithm frequently referred to as segmented least-squares~\cite{bellman1969curve}, where a curve is approximated by piecewise linear segments. The modification lies in the association of each segment (set of consecutive time indices during which the parameters are constant) with a dynamics model, as opposed to a simple straight line.\footnote{Indeed, if a simple integrator is chosen as dynamics model and a constant input is assumed, the result of our extended algorithm reduces to the segmented least-squares solution.} Unfortunately, the computational complexity of the dynamic programming solution, $\mathcal{O}(T^2K^3)$, becomes prohibitive for large $T$.\footnote{For details regarding the DP algorithm and implementation, the reader is referred to the source-code repository accompanying this article.}
-\subsection{Piecewise linear time evolution}\label{eq:pwlinear}
+\subsection{Piecewise linear time evolution}\label{sec:pwlinear}
 A piecewise linear signal is characterized by a \emph{sparse second-order time difference}, i.e., it has a small number of changes in the slope. A piecewise linear time-evolution of the dynamics parameters is hence obtained if we solve the optimization problem
 \begin{equation} \label{eq:pwlinear}
 \minimize{\w} \normt{\y-\hat{\y}}^2 + \lambda\sum_t \normt{\w_{t+2} -2 \w_{t+1} + \w_t}
 \end{equation}
-% The only difference between \labelcref{eq:pwlinear} and~\labelcref{eq:smooth} is the square root in the 2-norm of \labelcref{eq:pwlinear}.
-% The two regularizers are illustrated in~\cref{fig:surf}.
-% \begin{figure}[htp]
-% \centering
-% \includegraphics[width=0.9\linewidth]{figs/surf.png}
-% \caption{Comparison between the 2-norm (pointy bottom) and squared 2-norm (round bottom) regularizers. The figure illustrates how the gradient of the 2-norm regularizer remains constant as we move towards the origin, whereas the squared 2-norm regularizer has a vanishing gradient close to the origin. This difference leads to sparse results when applying 2-norm penalties and small results when applying squared 2-norm penalty. \recmt{Figure not needed}}
-% \label{fig:surf}
-% \end{figure}
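A minimal editor's sketch of this family of problems, assuming the Convex.jl and SCS packages of the Julia 0.6 era (not the paper's implementation): a piecewise-constant reconstruction in the spirit of~\labelcref{eq:tf} penalizes the 1-norm of the first difference; swapping in the second difference and a sum of unsquared 2-norms gives~\labelcref{eq:pwlinear}.

using Convex, SCS
T = 300
y = [zeros(100); ones(100); fill(-0.5, 100)] .+ 0.1randn(T)     # noisy piecewise-constant signal
D = spdiagm((fill(-1.0, T-1), fill(1.0, T-1)), (0, 1), T-1, T)  # first-difference operator
ŷ = Variable(T)
λ = 5.0                                                         # e.g. chosen by the L-curve method
problem = minimize(sumsquares(y - ŷ) + λ*norm(D*ŷ, 1))          # 1-norm ⇒ sparse level changes
solve!(problem, SCSSolver(verbose=0))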
\subsection{Summary}
 The proposed optimization problems are summarized in~\cref{tab:opts}. The table illustrates how the choice of regularizer and order of time-differentiation of the parameter vector affects the quality of the resulting solution.
@@ -349,14 +341,14 @@ If all densities in~\labelcref{eq:lcdll} are Gaussian and $\w$ is modeled with t
 \begin{equation}
 \begin{split}\label{eq:prioropt}
 -\log p(\w,y|x,z)_{1:T} &=
- \sum_{t=1}^T \norm{y_t - \hat{y}(\w_t,x_t)}^2_{\Sigma^{-1}_y} \\
- &+ \sum_{t=1}^{T-1} \norm{\w_{t+1} - \w_{t}}^2_{\Sigma^{-1}_\w} \\
+ \sum_{t=1}^T \norm{y_t - \hat{y}(\w_t,x_t)}^2_{\Sigma^{-1}_v} \\
+ &+ \sum_{t=1}^{T-1} \norm{\w_{t+1} - \w_{t}}^2_{\Sigma^{-1}_w} \\
 &+ \sum_{t=1}^{T} \norm{\mu_0(z_t) - \w_{t}}^2_{\Sigma^{-1}_0(z_t)}
 \end{split}
 \end{equation}
-for some function $\mu_0(z_t)$ which produces the prior mean of $\w$ given $z_t$. $\Sigma_y, \Sigma_\w, \Sigma_0(z_t)$ are the covariance matrices of the state-drift, parameter drift and prior respectively and $\norm{x}_{\Sigma^{-1}}^2 = x\T\Sigma^{-1}x$.
+for some function $\mu_0(z_t)$ which produces the prior mean of $\w$ given $z_t$. $\Sigma_v, \Sigma_w, \Sigma_0(z_t)$ are the covariance matrices of the state-drift, parameter drift and prior respectively and $\norm{x}_{\Sigma^{-1}}^2 = x\T\Sigma^{-1}x$.
-In this special case, we introduce a recursive solution given by a modified Kalman smoothing algorithm, where the conditional mean of the state is updated with the prior. Consider the standard Kalman filtering equations, reproduced here to establish the notation
+In this special case, we introduce a recursive solution given by a modified Kalman smoothing algorithm, where the conditional mean of the state is updated with the prior. Consider the standard Kalman filtering equations for a general linear system with estimated state vector $\hat{x}_{i|j}$ at time $i$, given data up to time $j$, reproduced here to establish the notation
 \begin{align}
 \hat{x}_{t|t-1} &= A \hat{x}_{t-1|t-1} + B u_{t-1} \\
 P_{t|t-1} &= A P_{t-1|t-1}A\T + R_1\\
@@ -364,7 +356,7 @@
 \hat{x}_{t|t} &= \hat{x}_{t|t-1} + K_t\big(y_t-C\hat{x}_{t|t-1}\big)\\
 P_{t|t} &= P_{t|t-1} - K_t C P_{t|t-1}
 \end{align}
- where $x$ is the state vector, with state-drift covariance $R_1$ and $C$ is a matrix that relates $x$ to a measurement $y=Cx$ with covariance $R_2$. The first two equations constitute the \emph{prediction} step, and the last two equations incorporate the measurement $y_t$ in the \emph{correction} step.
+ with state-drift covariance $R_1$, and where $C$ is a matrix that relates $x_t$ to a measurement $y_t=Cx_t$ with covariance $R_2$. The first two equations constitute the \emph{prediction} step, and the last two equations incorporate the measurement $y_t$ in the \emph{correction} step.
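In code, the equations above form a single predict/correct cycle. The following is an editor's sketch for known $A, B, C, R_1, R_2$ (a hypothetical helper, not the implementation in LTVModels.jl):

function kalman_step(x̂, P, u, y, A, B, C, R1, R2)
    x̂ = A*x̂ + B*u              # prediction of the mean
    P = A*P*A' + R1            # prediction of the covariance
    K = P*C'/(C*P*C' + R2)     # Kalman gain
    x̂ = x̂ + K*(y - C*x̂)        # correction with the measurement y_t
    P = P - K*C*P              # posterior covariance
    return x̂, P
end

The prior modification introduced next amounts to repeating the correction step with $\mu_0(v_t)$ in place of $y_t$, $I$ in place of $C$ and $\Sigma_0(v_t)$ in place of $R_2$.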
The modification required to incorporate a Gaussian prior on the state variable $p(x_t|v_t) = \N(\mu_0(v_t), \Sigma_0(v_t))$ involves a repeated correction step and takes the form
\begin{align}
\bar{K}_t &= P_{t|t}\big(P_{t|t}+\Sigma_0(v_t)\big)^{-1}\label{eq:postk}\\
@@ -379,7 +371,7 @@ A prior over the output of the system, or a subset thereof, is straight forward
 \subsection{Kalman filter for identification}
 We can employ the Kalman-based algorithm to solve two of the proposed optimization problems:
 \subsubsection{Low frequency}
-The Kalman smoother can be used for solving identification problems like~\labelcref{eq:slow} by noting that~\labelcref{eq:slow} is the negative log-likelihood of the dynamics model~\labelcref{eq:dynsys}. The identification problem is thus reduced to a standard state-estimation problem.
+The Kalman smoother can be used for solving identification problems like~\labelcref{eq:slow} by noting that~\labelcref{eq:slow} is the negative log-likelihood of the dynamics model~\labelcref{eq:dynsys} with state vector $\w_t$. The identification problem is thus reduced to a standard state-estimation problem.
 \subsubsection{Smooth}\label{sec:smoothkalman}
 To develop a Kalman-filter based algorithm for solving~\labelcref{eq:smooth}, we augment the model \labelcref{eq:dynsys} with the state variable $k^\prime_t = k_{t} - k_{t-1}$ and note that $k^\prime_{t+1} - k^\prime_t = k_{t+1} - 2k_t + k_{t-1}$. We thus introduce the augmented-state model
@@ -390,19 +382,13 @@ which is on a form suitable for filtering/smoothing with the machinery developed above.
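One concrete realization of the augmented-state model (an editor's sketch; the names are hypothetical and $n$, $m$, $x$, $u$ are assumed in scope) drives both blocks with the same noise, so that the second difference of $k$ equals the driving noise:

K = n*(n+m)                       # parameter dimension
A_aug = [eye(K)     eye(K);
         zeros(K,K) eye(K)]       # s_{t+1} = A_aug*s_t + [w_t; w_t], with s_t = [k_t; k′_t]
C_t   = kron(eye(n), [x' u'])     # time-varying regressor of the measurement equation
C_aug = [C_t zeros(n, K)]         # only k_t, not k′_t, enters the measurement

With this choice, $k_{t+1} - 2k_t + k_{t-1} = k^\prime_{t+1} - k^\prime_t = w_t$, so the smooth regularizer becomes the state-transition cost of a standard Kalman smoother.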
 \subsubsection{General case}
-The Kalman-filter based identification method can be generalized to solving optimization problems where the argument in the regularizer appearing in \labelcref{eq:smooth} is replaced by a general linear operation on the parameter vector, $P(z)k$, and we have the following proposition
-\begin{proposition}
-    Any optimization problem on the form
-    \begin{equation} \label{eq:generalkalman}
-        \minimize{\w} \normt{\y-\hat{\y}}^2 + \lambda^2\sum_t \normt{P(z)\w_t}^2
-    \end{equation}
-    where $P(z)$ is a polynomial of degree $n>0$ in the time difference operator $z$ with $z^{-n} P(1) = 0$, can be solved with a Kalman smoother employed to an autonomous state-space system.
-\end{proposition}
-\begin{proof}
-    Let $P^*(z^{-1}) = z^{-n} P(z)$. We assume without loss of generality that $P^*(0) = 1$ since any constant $P^*(0)$ can be factored out of the polynomial. $Q(z^{-1}) = P^*(z^{-1})^{-1}$ is a strictly proper transfer function and has a realization as a linear, Gaussian state-space system of degree $n$. Since $Q(z^{-1})$ is strictly proper, the realization has no direct term.
-    The negative data log-likelihood of $Q(z^{-1})$ is equal, up to constants idenpendent of $\w$, to the cost function in~\cref{eq:generalkalman}, hence the Kalman smoother applied to $Q$ optimizes \cref{eq:generalkalman}.
-\end{proof}
-For~\labelcref{eq:smooth} $P(z)$ equals $z^2 - 2z + 1$ and $Q(z^{-1})$ has a realization on the form~\labelcref{eq:dynsys2}.
+The Kalman-filter based identification method can be generalized to solving optimization problems where the argument in the regularizer appearing in \labelcref{eq:smooth} is replaced by a general linear operation on the parameter vector, $P(z)k$
+\begin{equation} \label{eq:generalkalman}
+    \minimize{\w} \normt{\y-\hat{\y}}^2 + \lambda^2\sum_t \normt{P(z)\w_t}^2
+\end{equation}
+where $P(z)$ is a polynomial of degree $n>0$ in the time difference operator $z$.
+For~\labelcref{eq:smooth}, $P(z)$ equals $z^2 - 2z + 1$ and $P^{-1}(z)$ has a realization on the form~\labelcref{eq:dynsys2}.
+
@@ -412,7 +398,7 @@ by noting that the problem of finding $A$ in $x_{t+1} = Ax_t$ given a pair
 $(x_{t+1},x_t)$ is an ill-posed problem in the sense that the solution is non
 unique. If we are given several pairs $(x_{t+1},x_t)$, for different $t$, while
 $A$ remains constant, the problem becomes over-determined and well-posed in the
-least-squares sense, provided that the vectors $\{x_t^{(i)}\}_{t=1}^T$ span $\mathbb{R}^n$. The LTI-case in~\cref{sec:lti} is well posed according to
+least-squares sense, provided that the vectors of state components $\{x_t^{(i)}\}_{t=1}^T$ span $\mathbb{R}^n$. The LTI-case in~\cref{sec:lti} is well posed according to
 classical results, when $\Phi$ has full column rank.
 When we extend our view to LTV models, the number of free
@@ -437,14 +423,14 @@ We formalize the above arguments as
 In the limit $\lambda \rightarrow \infty$ the problem reduces to the LTI problem. The nullspace of the regularization Hessian, which is invariant to $\lambda$, does thus not share any directions with the nullspace of $\tilde{\A}\T \tilde{\A}$ which establishes the equivalence of identifiability between the LTI problem and the LTV problems.
 \end{proof}
 \begin{proposition}
-    Optimization problems \labelcref{eq:smooth,eq:pwlinear} with higher order differentiation in the regularization term have unique global minima for $\lambda > 0$ if and only if there exists a vector $v \inspace{n+m}$ such that
+    Optimization problems \labelcref{eq:smooth,eq:pwlinear} with higher order differentiation in the regularization term have unique global minima for $\lambda > 0$ if and only if there exists no vector $v \neq 0 \inspace{n+m}$ such that
 \begin{equation}
 C^{xu}_t v = \bmatrixx{x_tx_t\T & x_tu_t\T \\ u_tx_t\T & u_tu_t\T}v = 0 \;\forall t
 \end{equation}
 \end{proposition}
 \begin{proof}
 Again, the cost function is a sum of two convex terms and for a global minimum to be non-unique, the Hessians of the two terms must have intersecting nullspaces.
-    In the limit $\lambda \rightarrow \infty$ the regularization term reduces to a linear constraint set, allowing only parameter vectors that lie along a line through time. Let $\tilde{v} \neq 0$ be such a vector, parametrized by $t$ as $\tilde{v} = \bmatrixx{v & 2v & \cdots & Tv}\T \inspace{T\times(n+m)}$ for arbitrary $v\inspace{n+m}$. $\tilde{v} \in \operatorname{null}{(\tilde{\A}\T \tilde{\A})}$ implies that the loss is invariant to the pertubation $\alpha \tilde{v}$ to $\tilde{k}$ for an arbitrary $\alpha \inspace{}$. $(\tilde{\A}\T \tilde{\A})$ is given by $\operatorname{blkdiag}(\left\{C^{xu}_t \right\}_1^T)$ which means that $\tilde{v} \in \operatorname{null}{(\tilde{\A}\T \tilde{\A})} \Longleftrightarrow \alpha t C^{xu}_t v = 0 \; \forall (\alpha,t) \Longleftrightarrow v \in \operatorname{null}{(C^{xu}_t)} \;\forall t$.
+    In the limit $\lambda \rightarrow \infty$ the regularization term reduces to a linear constraint set, allowing only parameter vectors that lie along a line through time. Let $\tilde{v} \neq 0$ be such a vector, parametrized by $t$ as $\tilde{v} = \bmatrixx{\bar{v}\T & 2\bar{v}\T & \cdots & T\bar{v}\T}\T \inspace{TK}$ where $\bar{v} = \operatorname{vec}(\left\{v\right\}_1^n) \inspace{K}$ and $v$ is an arbitrary vector $\inspace{n+m}$. $\tilde{v} \in \operatorname{null}{(\tilde{\A}\T \tilde{\A})}$ implies that the loss is invariant to the perturbation $\alpha \tilde{v}$ to $\tilde{k}$ for an arbitrary $\alpha \inspace{}$. $(\tilde{\A}\T \tilde{\A})$ is given by $\operatorname{blkdiag}(\left\{I_n \otimes C^{xu}_t \right\}_1^T)$ which means that $\tilde{v} \in \operatorname{null}{(\tilde{\A}\T \tilde{\A})} \Longleftrightarrow \alpha t (I_n \otimes C^{xu}_t) \bar{v} = 0 \; \forall (\alpha,t) \Longleftrightarrow \bar{v} \in \operatorname{null}{(I_n \otimes C^{xu}_t)} \;\forall t$, which implies $v \in \operatorname{null}{C^{xu}_t}$ due to the block-diagonal nature of $I_n \otimes C^{xu}_t$
@@ -472,15 +458,6 @@ For the LTI problem to be well-posed, the system must be identifiable and the in
 \section{Example -- Jump-linear system}
 \cmt{Add two link example where a stiff contact is established and there is a sign change in control signal and hence friction. Apply pwconstant method and maybe scrap linear system example. Fig 3 can be reproduced with analytical jacobian as comparison. Mention in abstract that we then evaluate on one non-linear but smooth system (pendcart) and one non-smooth system (twolink).} \recmt{Only linear systems considered (pend-cart actually not linear.} \cmt{Highlight that pendcart is nonlinear or introduce simple robotic example. Mention early that an LTV system along a trajectory does not generalize far for a nonlinear system, hence the use of KL constraint.}
-% \begin{figure}
-% \centering
-% \setlength{\figurewidth}{0.99\linewidth}
-% \setlength{\figureheight }{5cm}
-% \input{figs/discont.tex}
-% \figurecaptionreduction
-% \caption{Piecewise constant MA-dynamics. True/estimated values are shown with dashed/solid lines.}
-% \label{fig:MA}
-% \end{figure}
 We now consider a simulated example. We generate a state sequence from the following LTV system, where the change in dynamics, from
$$A_t = \left[
\right] $$
to $$A_t = \left[
\right] $$
occurred at $t=200$.
-The input was Gaussian noise of zero mean and unit variance, state transition noise and measuremet noise of zero mean and $\sigma = 0.2$ were added.
+The input was Gaussian noise of zero mean and unit variance, state transition noise and measurement noise ($y^m_t = x_{t+1} + v^m_t$) of zero mean and $\sigma_{y_m} = 0.2$ were added.
 \Cref{fig:ss} depicts the estimated coefficients in the dynamics matrices for a value of $\lambda$ chosen using the L-curve method~\cite{hansen1994regularization}.
-% \begin{figure}
-% \centering
-% \setlength{\figurewidth}{0.99\linewidth}
-% \setlength{\figureheight }{5cm}
-% \input{figs/states.tex}
-% \figurecaptionreduction
-% \caption{Piecewise constant state-space dynamics. True state values are shown with dashed lines. Gaussian state-transition and measurement noise with $\sigma = 0.2$ were added. \recmt{Hard to see and understand}}
-% \label{fig:states}
-% \end{figure}
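An editor's sketch of how such an experiment can be set up follows; the dynamics matrices below are hypothetical placeholders (the true values appear only in the paper source), while the fit call mirrors the one used in flux/nn_prior of this patch:

using LTVModels
T, n, m = 400, 2, 1
A1, A2 = [0.95 0.1; 0.0 0.95], [0.5 0.05; 0.0 0.5]  # placeholder dynamics; switch at t = 200
B = [0.2, 1.0]
x = zeros(n, T+1); u = randn(m, T)                  # unit-variance Gaussian input
for t = 1:T
    A = t < 200 ? A1 : A2
    x[:, t+1] = A*x[:, t] + B*u[1, t] + 0.2randn(n) # state-transition noise, σ = 0.2
end
R1, R2 = 0.01eye(n*(n+m)), 100eye(n)                # same tuning as in flux/nn_prior
P0 = 10000R1
model = LTVModels.fit_model(LTVModels.KalmanModel, x[:, 1:T], u, R1, R2, P0, extend=true)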
 \begin{figure}
 \centering
 \setlength{\figurewidth}{0.99\linewidth}
- \setlength{\figureheight }{6cm}
+ \setlength{\figureheight }{5.5cm}
 \input{figs/ss.tex}
 \figurecaptionreduction
 \caption{Piecewise constant state-space dynamics. True values are shown with dashed, black lines. Gaussian state-transition and measurement noise with $\sigma = 0.2$ were added. \recmt{Hard to read, maybe showing error is better.}}
@@ -533,10 +501,10 @@ The input was Gaussian noise of zero mean and unit variance, state transition no
 \section{Example -- Non-smooth robot arm with stiff contact}
 To illustrate the ability of the proposed models to represent the non-smooth dynamics along a trajectory of a robot arm, we simulate a two-link robot with discontinuous Coulomb friction. We also let the robot establish a stiff contact with the environment to illustrate both strengths and weaknesses of the modeling approach.
-The state of the robot arm consists of two joint coordinates, $q$, and their time derivatives, $\dot q$. \Cref{fig:robot_train} illustrates the state trajectories, control torques and simulations of a model estimated by solving~\labelcref{eq:pwconstant}. The figure clearly illustrates that the model is able to capture the dynamics both during the non-smooth sign change of the velocity, but also during establishment of the stiff contact. The learned dynamics of the contact is however time-dependent, which is illustrated in \Cref{fig:robot_val}, where the model is used on a validation trajectory where a different noise sequence was added to the control torque. Due to the novel input signal, the contact is established at a different time-instant and as a consequence, there is an error transient in the simulated data.
+The state of the robot arm consists of two joint coordinates, $q$, and their time derivatives, $\dot q$. \Cref{fig:robot_train} illustrates the state trajectories, control torques and simulations of a model estimated by solving~\labelcref{eq:pwconstant}. The figure clearly illustrates that the model is able to capture the dynamics both during the non-smooth sign change of the velocity, but also during establishment of the stiff contact. The learned contact dynamics are, however, time-dependent, which is in some situations a drawback of the model; this is illustrated in \Cref{fig:robot_val}, where the model is used on a validation trajectory in which a different noise sequence was added to the control torque. Due to the novel input signal, the contact is established at a different time-instant and as a consequence, there is an error transient in the simulated data.
 \begin{figure*}[htp]
 \centering
- \setlength{\figurewidth}{0.5\linewidth}
+ \setlength{\figurewidth}{0.495\linewidth}
 \setlength{\figureheight }{4cm}
 \input{figs/robot_train.tex}
% \includegraphics[width=\linewidth]{figs/robot_train}
 \caption{
\end{figure*}
\begin{figure*}[htp]
 \centering
- \setlength{\figurewidth}{0.5\linewidth}
+ \setlength{\figurewidth}{0.495\linewidth}
 \setlength{\figureheight }{4cm}
 \input{figs/robot_val.tex}
 \caption{Simulation of non-smooth robot dynamics with stiff contact -- validation data vs. sample time index. The dashed lines indicate the event times for the training data, highlighting that the model is able to deal effortlessly with the non-smooth friction, but inaccurately predicts the time evolution around the contact event which now occurs at a slightly different time instant.}
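A related two-link experiment is driven from flux/nn_prior in this patch; an editor's sketch of its entry point (the names TwoLinkSys, fit_system and DiffSystemD are defined there, the exact arguments are assumptions):

sys  = TwoLinkSys(N=1000, h=0.02, σ0 = 0.1)             # two-link robot, as in nn_prior.jl
figs = fit_system(1, sys, DiffSystemD, 2, doplot=true)  # seed 1, two passes of Jacobian propagation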
@@ -669,32 +637,3 @@ We have proposed a framework for identification of linear, time-varying models a
 \bibliography{bibtexfile}{}
 \bibliographystyle{IEEEtran}
 \end{document}
-
-
-
-
-
-
-
-
-
% \subsection{Implementation}
% We intend to provide a dynamic programming algorithm to solve these problems very efficiently, bear with us.
% Viterbi algorithm:
%
% \begin{align}
% e(A,B) &= (y - Ax - Bu)\T (y - Ax - Bu) \\
% ~ &= y\T y + x\T A\T Ax + u\T B\T Bu + 2x\T A\T Bu \\
% ~ &\quad - 2y\T Ax - 2y\T Bu
% \end{align}
%
% \begin{align}
% \dfrac{\partial e(A,B)}{\partial A} &= 2Axx\T + 2x(Bu)\T - 2yx\T \\
% \dfrac{\partial e(A,B)}{\partial B} &= 2Buu\T + 2xAu\T - 2yu\T \\
% \end{align}
x = linspace(-1.3,1.3,100)
z1 = [norm([x,y]) for x in x, y in x]
z2 = [norm([x,y])^2 for x in x, y in x]
contour(x,x,z2, colorbar=false, alpha=0.2, c=:red)
contour!(x,x,z1, fillalpha=1, grid=false, c=:blue)
% savetikz("/work/fredrikb/reinforcementlearning/jump_lin_id/figs/surf.tex")
-- 
GitLab