diff --git a/flux/bayesian_dropout.jl b/flux/bayesian_dropout.jl
index 89c29779712b8f98d8558f6d7067126134b3424d..cb2ff2045002dba67247a1aea2b57468237e6401 100644
--- a/flux/bayesian_dropout.jl
+++ b/flux/bayesian_dropout.jl
@@ -20,32 +20,32 @@ function update_plot(p; max_history = 10, attribute = :markercolor)
 end
-iters = 4000
+iters = 100
 N = 30
-n = 50
-
-function train(m = Chain(Dense(1,n,relu), Dropout(0.01), Dense(n,n,relu), Dropout(0.01), Dense(n,n,relu), Dropout(0.01), Dense(n,n,relu), Dropout(0.01), Dense(n,1)))
-x = linspace(-10,10,N)'
-y = sin.(x)./(x)
-trace = History(Float64)
-batcher = batchview(shuffleobs((x,y)), N)
-dataset = ncycle(batcher, iters)
-opt = ADAM(params(m), 0.01, decay = 0)
-loss(x,y) = sum((m(x).-y).^2)/N
-fig = plot(layout=2, reuse=true)
-function cb()
-    push!(trace, loss(x,y).data[])
-    plot!(vec(x), [vec(y) m(x).data'], ylims=[-0.3,1], c = [:blue :red], subplot=1)
-    plot!(trace, subplot=2, c=:blue, yscale=:log10)
-    update_plot(fig[1], max_history = 10, attribute=:linecolor)
-    update_plot(fig[2], max_history=1)
-    gui()
-end
-Flux.train!(loss, dataset, opt, cb=cb)
+n = 20
+
+function train(m = Chain(Dense(1,n,relu), Dropout(0.001), Dense(n,n,relu), Dropout(0.001), Dense(n,n,relu), Dropout(0.001), Dense(n,n,relu), Dropout(0.001), Dense(n,1)))
+    x = linspace(-10,10,N)'
+    y = sin.(x)./(x)
+    trace = History(Float64)
+    batcher = batchview(shuffleobs((x,y)), N)
+    dataset = ncycle(batcher, iters)
+    opt = ADAM(params(m), 0.01, decay = 0)
+    loss(x,y) = sum((m(x).-y).^2)/N
+    fig = plot(layout=2, reuse=true)
+    function cb()
+        push!(trace, loss(x,y).data[])
+        plot!(vec(x), [vec(y) m(x).data'], ylims=[-0.3,1], c = [:blue :red], subplot=1)
+        plot!(trace, subplot=2, c=:blue, yscale=:log10)
+        update_plot(fig[1], max_history = 10, attribute=:linecolor)
+        update_plot(fig[2], max_history=1)
+        gui()
+    end
+    Flux.train!(loss, dataset, opt, cb=cb)
-m, trace, x, y
+    m, trace, x, y
 end
-m, trace, x, y= train()
+m, trace, x, y = train()
 ##
 mc = 1000
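The `mc = 1000` constant in the trailing context above suggests that a Monte-Carlo dropout evaluation of the trained network follows this hunk. A minimal sketch of such an evaluation, written against the same Flux 0.x-era API as the script (the helper name `mc_predict` is hypothetical and not part of the repository):

# Sketch (same Flux 0.x-era API as the script above): draw `mc` stochastic forward
# passes with the Dropout layers still active and summarize them into a predictive
# mean and standard deviation.
function mc_predict(m, x, mc = 1000)
    samples = [vec(m(x).data) for i in 1:mc]  # each pass draws fresh dropout masks
    S = hcat(samples...)                      # N × mc matrix of sampled predictions
    mean(S, 2), std(S, 2)                     # Julia 0.6-style reductions over dim 2
end

μ, σ = mc_predict(m, x, mc)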
diff --git a/jump_lin_id/id_paper.tex b/jump_lin_id/id_paper.tex
index 860f6c520e0f76d4091ca417ccdd7d92c72bcdaa..5f2896e099ad5c09108c557daace0c827af3edc1 100644
--- a/jump_lin_id/id_paper.tex
+++ b/jump_lin_id/id_paper.tex
@@ -122,9 +122,7 @@ Identification of LTV Dynamical Models with\\ Smooth or Discontinuous Time Evolu
 \author{
 \centering Fredrik Bagge Carlson* \quad Anders Robertsson \quad Rolf Johansson
-\thanks{*Open-source implementations of all presented methods and examples in this paper are made available at \href{github.com/baggepinnen/LTVModels.jl}{github.com/baggepinnen/LTVModels.jl} (Will be made available in advance of paper publication). The reported research was supported by the European Commission under the Framework Programme Horizon 2020 under grant agreement 644938 SARAFun. The authors are members of the LCCC Linnaeus Center and the eLLIIT Excellence Center at Lund University.\protect\\
-Lund University, Dept Automatic Control, PO Box 118\protect\\
-SE22100 Lund Sweden\protect\\
+\thanks{*Open-source implementations of all presented methods and examples in this paper are made available at \href{github.com/baggepinnen/LTVModels.jl}{github.com/baggepinnen/LTVModels.jl} (will be made available in advance of paper publication). The reported research was supported by the European Commission under the Framework Programme Horizon 2020 under grant agreement 644938 SARAFun. The authors are members of the LCCC Linnaeus Center and the eLLIIT Excellence Center at Lund University, Dept Automatic Control, Lund, Sweden.\protect\\
 {Fredrik.Bagge\_Carlson@control.lth.se}}
 }
@@ -172,7 +170,7 @@ The difficulty of the task of identifying time-varying dynamical models of syste
 Identification of systems with non-smooth dynamics evolution has been studied extensively. The book~\cite{costa2006discrete} treats the case where the dynamics are known, but the state sequence unknown, i.e., state estimation. In~\cite{nagarajaiah2004time}, the authors examine the residuals from an initial constant dynamics fit to determine regions in time where improved fit is needed by the introduction of additional constant dynamics models. Results on identifiability and observability in jump-linear systems in the non-controlled (autonomous) setting are available in~\cite{vidal2002observability}. The main result on identifiability in \cite{vidal2002observability} was a rank condition on a Hankel matrix constructed from the collected output data, similar to classical results on the least-squares identification of ARX-models which appears as rank constraints on the, typically Toeplitz or block-Toeplitz, regressor matrix. Identifiability of the methods proposed in this article are discussed in~\cref{sec:identifiability}.
-An LTV model can be seen as a first order approximation of the dynamics of a nonlinear system around a trajectory. We emphasize that such an approximation will in general fail to generalize far from the this trajectory, but many methods in reinforcement learning and control make efficient use of the linearized dynamics for optimization, while ensuring validity of the approximation by constraints or penalty terms. One of the examples provided towards the end of this work highlights such a method.
+An LTV model can be seen as a first-order approximation of the dynamics of a nonlinear system around a trajectory. We emphasize that such an approximation will in general fail to generalize far from this trajectory, but many methods in reinforcement learning and control make efficient use of the linearized dynamics for optimization, while ensuring validity of the approximation by constraints or penalty terms. An example provided towards the end of this work highlights such a method.
 
 An important class of identification methods that have been popularized lately are \emph{trend filtering} methods~\cite{kim2009ell_1, tibshirani2014adaptive}. Trend filtering methods work by specifying a \emph{fitness criterion} that determines the goodness of fit, as well as a \emph{regularization} term, often chosen with sparsity promoting qualities. As a simple example, consider the reconstruction $\hat y$ of a noisy signal $y = \left\{y_t\inspace{}\right\}_{t=1}^T$ with piecewise constant segments. To this end, we might formulate and solve the convex optimization problem
 \begin{equation} \label{eq:tf}
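The trend-filtering example described in the context above leads to problem (eq:tf), whose body lies outside this hunk. A minimal sketch of the piecewise-constant reconstruction, assuming the usual l1-regularized first-difference form and the Convex.jl and SCS packages (modern Julia API, unlike the 0.6-era code elsewhere in this repository); all names are illustrative:

# Sketch: l1 trend filtering for a piecewise-constant signal, assuming the
# fitness criterion is least squares and the regularizer is the l1 norm of
# the first differences of the reconstruction.
using Convex, SCS

function fit_piecewise_constant(y::AbstractVector, λ = 10.0)
    T = length(y)
    ŷ = Variable(T)
    objective = sumsquares(y - ŷ) + λ * sum(abs(ŷ[2:end] - ŷ[1:end-1]))
    problem = minimize(objective)
    solve!(problem, SCS.Optimizer)
    evaluate(ŷ)
end

yhat = fit_piecewise_constant(sign.(sin.(0.1 .* (1:200))) .+ 0.1 .* randn(200))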
@@ -228,14 +226,11 @@ where the parameters $\w$ are assumed to evolve according to the dynamical syste
 y_t &= \big(I_n \otimes \bmatrixx{x\T & u\T}\big) \w_t
 \end{split}
 \end{equation}
-\recmt{This eq not clear. Motivate noise terms better.} \cmt{Connect noise terms to MLE}
-where, if no prior knowledge is available, the dynamics matrix $H_t$ can be taken as the identity matrix, which implies that the model coefficients follow a random walk dictated by $\w$.
-The evolution of the dynamics is thus governed by the emission density function $p_v(x_{t+1} | x_t, u_t, k_t)$ and a state transition density function $p_w(k_{t+1}|k_t)$.
-The density $p_v$ is determining the distribution of measurements, given the current state of the system, whereas $p_w$ governs the transition of the parameter vector between time steps.
+where, if no prior knowledge is available, the dynamics matrix $H_t$ can be taken as the identity matrix. $H = I$ implies that the model coefficients follow a random walk dictated by the properties of $w_t$, i.e., the state transition density function $p_w(k_{t+1}|k_t)$. The emission density function $p_v(x_{t+1} | x_t, u_t, k_t)$ determines the drift of the state, which for the parameter estimation problem can be seen as the distribution of measurements, given the current state of the system. We emphasize here that the state in the parameter evolution model refers to the current parameters $k_t$ and not the system state $x_t$; hence, $p_v$ is called the emission density and not the transition density.
 Particular choices of $p_v$ and $p_w$ emit data likelihoods concave in the parameters and hence amenable to convex optimization.
-The following sections will introduce a number of optimization problems with different regularization functions and regularization arguments, corresponding to different choices of $p_w$, and discuss the quality of the resulting identification.
+The following sections will introduce a number of optimization problems with different regularization functions, corresponding to different choices of $p_w$, and different regularization arguments, corresponding to different choices of $H$. We also discuss the quality of the identification resulting from the different modeling choices.
 
 \subsection{Low frequency time evolution}
 A slowly varying signal is characterized by \emph{small first-order time differences}.
@@ -250,7 +245,7 @@ This optimization problem has a closed form solution given by
 \tilde{\w} &= \operatorname{vec}(\w_1, ...\,, \w_T)\nonumber
 \end{align}
 where $\tilde{\A}$ and $\tilde{Y}$ are appropriately constructed matrices and the first-order differentiation operator matrix $D_1$ is constructed such that $\lambda^2\normt{D_1 \tilde{\w}}^2$ equals the second term in~\labelcref{eq:slow}.
-The computational complexity $\mathcal{O}\big((TK)^3\big)$ of computing $k^*$ using the closed-form solution \labelcref{eq:closedform} becomes prohibitive for all but toy problems. We note that the cost function in \labelcref{eq:slow} is the negative data log-likelihood of a Brownian random-walk parameter model, which motivates us to develop a dynamic programming algorithm based on a Kalman smoother, detailed in~\cref{sec:kalmanmodel}.
+The computational complexity $\mathcal{O}\big((TK)^3\big)$ of computing $k^*$ using the closed-form solution \labelcref{eq:closedform} becomes prohibitive for all but toy problems. We note that the cost function in \labelcref{eq:slow} is the negative data log-likelihood of a Brownian random-walk parameter model with $H=I$, which motivates us to develop a dynamic programming algorithm based on a Kalman smoother, detailed in~\cref{sec:kalmanmodel}.
 
 \subsection{Smooth time evolution}
 A smoothly varying signal is characterized by \emph{small second-order time differences}.
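The closed-form solution (eq:closedform) discussed in the hunk above is small enough to write out directly. A minimal sketch, assuming the stacked regressor matrix Ã and stacked output vector Ỹ have already been assembled; the function name and the Kronecker construction of D1 are illustrative and not taken from LTVModels.jl:

# Sketch of the closed-form solution (eq:closedform): first-difference
# regularized least squares over the stacked parameter vector.
# Ã is the stacked regressor matrix (T*n × T*K), Ỹ the stacked outputs,
# T the number of time steps, K the number of parameters per step.
using LinearAlgebra

function solve_slow(Ã, Ỹ, T, K, λ)
    D = zeros(T - 1, T)                     # first-difference operator in time
    for t in 1:T-1
        D[t, t], D[t, t+1] = -1.0, 1.0
    end
    D₁ = kron(D, Matrix{Float64}(I, K, K))  # acts on vec(k_1, …, k_T)
    k̃ = (Ã'Ã + λ^2 * (D₁'D₁)) \ (Ã'Ỹ)       # direct solve, cost O((TK)^3)
    reshape(k̃, K, T)                        # column t holds the parameters at time t
end

The direct solve makes the O((TK)^3) scaling mentioned in the hunk explicit, which is what motivates the Kalman-smoother reformulation developed later in the paper.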
@@ -258,7 +253,7 @@ To identify smoothly time-varying dynamics parameters, we thus penalize the squa
 \begin{equation} \label{eq:smooth}
 \minimize{\w} \normt{\y-\hat{\y}}^2 + \lambda^2\sum_t \normt{\w_{t+2} -2 \w_{t+1} + \w_t}^2
 \end{equation}
-Also this optimization problem has a closed form solution on the form~\labelcref{eq:closedform} with the corresponding second-order differentiation operator $D_2$. \Cref{eq:smooth} is the negative data log-likelihood of a Brownian random-walk parameter model with added momentum, for which a Kalman smoother with augmented state returns the optimal solution, elaborated upon in~\cref{sec:kalmanmodel} where we extend problem \labelcref{eq:smooth} to more general regularization terms.
+This optimization problem also has a closed-form solution of the form~\labelcref{eq:closedform} with the corresponding second-order differentiation operator $D_2$. \Cref{eq:smooth} is the negative data log-likelihood of a Brownian random-walk parameter model with added momentum and $H$ derived in~\cref{sec:smoothkalman}, where a Kalman smoother with augmented state is developed to find the optimal solution. We also extend problem \labelcref{eq:smooth} to more general regularization terms in \cref{sec:kalmanmodel}.
@@ -387,7 +382,7 @@ A prior over the output of the system, or a subset thereof, is straight forward
 We can employ the Kalman-based algorithm to solve two of the proposed optimization problems:
 \subsubsection{Low frequency}
 The Kalman filter can be used for solving identification problems like~\labelcref{eq:slow} by noting that~\labelcref{eq:slow} is the negative log-likelihood of the dynamics model~\labelcref{eq:dynsys}.
-\subsubsection{Smooth}
+\subsubsection{Smooth}\label{sec:smoothkalman}
 To develop a Kalman-filter based algorithm for solving~\labelcref{eq:smooth}, we augment the model \labelcref{eq:dynsys} with the state variable $k^\prime_t = k_{t} - k_{t-1}$ and note that $k^\prime_{t+1} - k^\prime_t = k_{t+1} - 2k_t + k_{t-1}$. We thus introduce the augmented-state model
 \begin{align}\label{eq:dynsys2}
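One possible augmented-state construction matching the description above (the exact matrices of eq:dynsys2 lie outside this hunk): with z_t = [k_t; k'_t] and the process noise w_t entering through k', the second difference k_{t+1} - 2k_t + k_{t-1} equals w_t, so the quadratic penalty in eq:smooth becomes the negative log-likelihood of Gaussian w_t. A scalar (K = 1) sketch, illustrative only:

# Sketch: scalar-parameter version of one possible augmented-state model for the
# smooth prior. The state is z_t = [k_t, k′_t] and the noise w_t drives k′, so
# that the second difference k_{t+1} - 2k_t + k_{t-1} equals w_t.
function simulate_smooth_walk(T = 200, σw = 0.01)
    A = [1.0 1.0; 0.0 1.0]   # k_{t+1} = k_t + k′_{t+1},  k′_{t+1} = k′_t + w_t
    B = [1.0, 1.0]
    k = zeros(T)
    z = zeros(2)
    for t in 2:T
        z = A*z + B*(σw*randn())
        k[t] = z[1]
    end
    k                        # a smoothly varying parameter trajectory
end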
@@ -403,24 +398,17 @@ The Kalman-filter based identification methods can be generalized to solving opt
 \begin{equation} \label{eq:generalkalman}
 \minimize{\w} \normt{\y-\hat{\y}}^2 + \lambda^2\sum_t \normt{P(z)\w_t}^2
 \end{equation}
-    where $P(z)$ is a polynomial of degree $n>0$ in the time difference operator $z$ with $z^{-n} P(1) = 0$, can be solved with a Kalman smoother employed on to an autonomous state-space system.
+    where $P(z)$ is a polynomial of degree $n>0$ in the time difference operator $z$ with $z^{-n} P(1) = 0$, can be solved with a Kalman smoother applied to an autonomous state-space system.
 \end{proposition}
 \begin{proof}
-    Let $P^*(z^{-1}) = z^{-n} P(z)$. We assume without loss of generality that $P^*(0) = 1$ since any constant $P^*(0)$ can be factored out of the polynomial. $Q(z^{-1}) = P^*(z^{-1})^{-1}$ is a strictly proper transfer function and has a realization as a state-space system of degree $n$. Since $Q(z^{-1})$ is strictly proper, the realization has no direct term.
+    Let $P^*(z^{-1}) = z^{-n} P(z)$. We assume without loss of generality that $P^*(0) = 1$ since any constant $P^*(0)$ can be factored out of the polynomial. $Q(z^{-1}) = P^*(z^{-1})^{-1}$ is a strictly proper transfer function and has a realization as a linear, Gaussian state-space system of degree $n$. Since $Q(z^{-1})$ is strictly proper, the realization has no direct term.
+    The negative data log-likelihood of $Q(z^{-1})$ is equal to the cost function in~\cref{eq:generalkalman}, hence the Kalman smoother applied to $Q$ optimizes \cref{eq:generalkalman}.
 \end{proof}
 For~\labelcref{eq:smooth} $P(z)$ equals $z^2 - 2z + 1 = (z-1)^2$, and augmenting the dynamical model describing the parameter evolution with a state corresponding to the polynomial $Q(z) = (z-1)^{-1}P(z)$.
 
 \section{Well-posedness and identifiability}\label{sec:identifiability}
-\begin{theorem}
-    Optimization problems have unique global minima provided that $A$ and $u$...
-\end{theorem}
-\begin{proof}
-    The cost function is a sum of two convex terms. For the global minimum to be non-unique, the Hessians of the two terms must share nullspace.
-    Let $k$ be a minimizer of $\lambda^2\sum_t \normt{\w_{t+1} - \w_{t}}^2$, then there exist a basis of dimension $K$ spanning the nullspace of $D_1\T D_1$, $\mathcal{N}_D$. Let $v$ be a vector in $\mathcal{N}_D$, for $\Phi\T \Phi$ to share nullspace with $D$, we must have $\Phi\T\Phi v = 0$ or equivalently $y_t = Ck_t = C(k_t+v_t) \Longrightarrow Cv_t = 0$
-
-\end{proof}
 
 To assess the well-posedness of the proposed identification methods, we start by noting that the problem of finding $A$ in $x_{t+1} = Ax_t$ given a pair $(x_{t+1},x_t)$ is an ill-posed problem in the sense that the solution is non
@@ -453,14 +441,23 @@ $\tilde{\A}\T \tilde{\A} + D\T D$, where $\operatorname{rank}(D_2\T D_2) =
 K(T-2)$ respectively $\operatorname{rank}(D_1\T D_1) = K(T-1)$ . The matrix
 $\tilde{\A}\T \tilde{\A} + D\T D$ will thus be full rank whenever $\tilde{\A}$ has rank
-$2K$ or more, with a basis spanning the rank-$2K$ null
-space of $D$. \recmt{Something missing in last sentence.}\cmt{Make these statements into propositions and introduce asymptotic analysis?} This will be the case for any non-zero
+$2K$ ($K$) or more, with a basis spanning the rank-$2K$ ($K$) null
+space of $D_2\T D_2$ ($D_1\T D_1$). This will be the case for any non-zero
 state and input sequences of sufficient length, provided that the input, $u$,
-is persistently exciting of sufficient order~\cite{johansson1993system}.
+is persistently exciting of sufficient order~\cite{johansson1993system} and that the system is observable.
+\cmt{If $v$ is in both the nullspace of $D_1\T D_1$, and the nullspace of $\Phi\T\Phi$ we must have that the original system is invariant to an offset $k^+ + v = H(k+v), \quad y = C(k+v)$, what does this say about $C$?}
 
 Another strategy is to analyze the observability of the dynamical systems~\labelcref{eq:dynsys,eq:dynsys2}, which will yield the desired results for problems~\labelcref{eq:slow,eq:smooth}.
 
+We formalize the above arguments as
+\begin{proposition}
+    Optimization problems have unique global minima provided that $A$ and $u$...
+\end{proposition}
+\begin{proof}
+    The cost function is a sum of two convex terms. For the global minimum to be non-unique, the Hessians of the two terms must share nullspace.
+    Let $k$ be a minimizer of $\lambda^2\sum_t \normt{\w_{t+1} - \w_{t}}^2$; then there exists a basis of dimension $K$ spanning the nullspace $\mathcal{N}_D$ of $D_1\T D_1$. Let $v$ be a vector in $\mathcal{N}_D$; for $\Phi\T \Phi$ to share nullspace with $D$, we must have $\Phi\T\Phi v = 0$, or equivalently $y_t = Ck_t = C(k_t+v_t) \Longrightarrow Cv_t = 0$.
+\end{proof}
 
 \section{Example}
 \cmt{Add two link example where a stiff contact is established and there is a sign change in control signal and hence friction. Apply pwconstant method and maybe scrap linear system example. Fig 3 can be reproduced with analytical jacobian as comparison. Mention in abstract that we then evaluate on one non-linear but smooth system (pendcart) and one non-smooth system (twolink).}