diff --git a/gridworld.jl b/gridworld.jl
new file mode 100644
index 0000000000000000000000000000000000000000..d4344259b36900c41f5225483c59f3ca41b808ea
--- /dev/null
+++ b/gridworld.jl
@@ -0,0 +1,57 @@
+using Plots
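+
+# Simple tabular value iteration on a 10x10 gridworld with four move actions:
+# Q[a,x,y] holds the value of taking action a in cell (x,y); each sweep backs up
+# the best successor value and draws the running value function as a heatmap.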
+function gridworld()
+    grid_size = (10,10)
+    iters = 10
+    A = [1,2,3,4] # actions: 1 = y+1 (up), 2 = x+1 (right), 3 = y-1 (down), 4 = x-1 (left)
+    running_cost = 0.1
+    grid = zeros(grid_size)
+    Q = zeros(length(A), grid_size...) # one value per (action, x, y)
+    goal = (5,5)
+    grid[goal...] = 5
+
+    function transition(x,y,a)
+        if a == 1
+            y = min(y+1, grid_size[2])
+        elseif a == 2
+            x = min(x+1, grid_size[1])
+        elseif a == 3
+            y = max(y-1, 1)
+        else
+            x = max(x-1, 1)
+        end
+        x,y
+    end
+
+    reward(x,y,a) = -running_cost #+ grid[x,y]
+
+    for i = 1:iters
+        for y = 1:grid_size[2]
+            for x = 1:grid_size[1]
+                if (x,y) == goal
+                    Q[:,x,y] = grid[goal...]
+                    continue
+                end
+                for a = A
+                    xn,yn = transition(x,y,a)
+                    r = reward(xn,yn,a)
+                    Q[a,x,y] = r + maximum(Q[:,xn,yn])
+                end
+            end
+        end
+        heatmap(squeeze(maximum(Q,1),1)); gui()
+        # println(i)
+        # sleep(0.01)
+    end
+    println("done")
+    Q
+end
+@time Q = gridworld()
+
+# Overlay the greedy policy: one triangle per cell, pointing in the best action's direction.
+# grid_size is local to gridworld(), so the grid dimensions are taken from Q instead.
+for y = 1:size(Q,3)
+    for x = 1:size(Q,2)
+        a = indmax(Q[:,x,y])
+        sym = a == 1 ? :utriangle : a == 2 ? :rtriangle : a == 3 ? :dtriangle : :ltriangle
+        scatter!([x],[y],m=(sym,5), lab="", legend=false)
+    end
+end
+gui()
diff --git a/jump_lin_id/bibtexfile.bib b/jump_lin_id/bibtexfile.bib
index eaf5cc552a18ad77622bc8cca38d4c2e22e9de45..76d5e8defbbe865f1bb7541692a8023d271b9b72 100644
--- a/jump_lin_id/bibtexfile.bib
+++ b/jump_lin_id/bibtexfile.bib
@@ -46,3 +46,21 @@
   year={1961},
   publisher={ACM}
 }
+
+@book{johansson1993system,
+  title={System modeling \& identification},
+  author={Johansson, Rolf},
+  year={1993},
+  publisher={Prentice-Hall},
+  address={Englewood Cliffs, NJ}
+}
+
+@article{hansen1994regularization,
+  title={Regularization tools: A Matlab package for analysis and solution of discrete ill-posed problems},
+  author={Hansen, Per Christian},
+  journal={Numerical Algorithms},
+  volume={6},
+  number={1},
+  pages={1--35},
+  year={1994},
+  publisher={Springer}
+}
diff --git a/jump_lin_id/fit.jl b/jump_lin_id/fit.jl
deleted file mode 100644
index 92f9f949061d9e8c45cf738d4bc6b9a7241c8a7e..0000000000000000000000000000000000000000
--- a/jump_lin_id/fit.jl
+++ /dev/null
@@ -1,489 +0,0 @@
-# using DSP
-#
-# if !isdefined(:AbstractModel)
-#     include(Pkg.dir("GuidedPolicySearch","src","kalmanmodel.jl"))
-# end
-# using ReverseDiff: GradientTape, GradientConfig, gradient, gradient!, compile
-
-decayfun(iters, reduction) = reduction^(1/iters)
-
-function toeplitz{T}(c::Array{T},r::Array{T})
-    nc = length(c)
-    nr = length(r)
-    A = zeros(T, nc, nr)
-    A[:,1] = c
-    A[1,:] = r
-    for i in 2:nr
-        A[2:end,i] = A[1:end-1,i-1]
-    end
-    A
-end
-
-function getD(D,T)
-    if D == 3
-        return sparse(toeplitz([-1; zeros(T-4)],[-1 3 -3 1 zeros(1,T-4)]))
-    elseif D == 2
-        return sparse(toeplitz([1; zeros(T-3)],[1 -2 1 zeros(1,T-3)]))
-    elseif D == 1
-        return sparse(toeplitz([-1; zeros(T-2)],[-1 1 zeros(1,T-2)]))
-    end
-    error("Can not handle your choice of D: $D")
-end
-
-function matrices(x,u)
-    T,n = size(x)
-    T -= 1
-    m = size(u,2)
-    A = spzeros(T*n, n^2+n*m)
-    y = zeros(T*n)
-    I = speye(n)
-    for i = 1:T
-        ii = (i-1)*n+1
-        ii2 = ii+n-1
-        A[ii:ii2,1:n^2] = kron(I,x[i,:]')
-        A[ii:ii2,n^2+1:end] = kron(I,u[i,:]')
-        y[ii:ii2] = (x[i+1,:])
-    end
-    y,A
-end
-flatten(A) = reshape(A,prod(size(A,1,2)),size(A,3))'
-
-function fit_statespace(x,u,lambda; normType = 2, D = 2, solver=:ECOS, kwargs...)
-    y,A  = matrices(x,u)
-    T,n  = size(x)
-    T   -= 1
-    m    = size(u,2)
-
-    Dx   = getD(D,T)
-    k    = Convex.Variable(T,n^2+n*m)
-
-    loss = 0
-    for i = 1:T
-        ii = (i-1)*n+1
-        ii2 = ii+n-1
-        loss += Convex.sumsquares((y[ii:ii2,:]) - A[ii:ii2,:]*k[i,:]')
-    end
-    loss += lambda*Convex.norm(Dx*k,normType)
-    loss += Convex.sumsquares(k)
-    problem = Convex.minimize(loss)
-    if solver == :ECOS
-        Convex.solve!(problem, ECOS.ECOSSolver(maxit=100, feastol=1e-10, feastol_inacc=1e-5, reltol=1e-09, abstol=1e-09, nitref=50))
-    else
-        Convex.solve!(problem, SCS.SCSSolver(max_iters=100000, normalize=0, eps=1e-8))
-    end
-
-    At = reshape(k.value[:,1:n^2]',n,n,T)
-    At = permutedims(At, [2,1,3])
-    Bt = reshape(k.value[:,n^2+1:end]',n,m,T)
-    Bt = permutedims(Bt, [2,1,3])
-    At,Bt
-end
-
-
-signfunc(x) = x <= 0 ? -1 : 1
-S(β, γ) = γ >= abs(β) ? 0. : β > 0 ? β-γ : β+γ
-@inline function derivative(t, βt, beta, y, λ)
-    retval = -(y[t]-βt)
-    if t != length(beta)
-        retval -= λ*signfunc(beta[t+1]-βt)
-    end
-    if t != 1
-        retval += λ*signfunc(βt-beta[t-1])
-    end
-    retval
-end
-@inline function derivativeγ(t, γ, beta, y, λ)
-    retval = -(y[t-1]-γ) - (y[t]-γ)
-    if t != length(beta)
-        retval -= λ*signfunc(beta[t+1]-γ)
-    end
-    if t > 2
-        retval += λ*signfunc(γ-beta[t-2])
-    end
-    retval
-end
-
-signchange(t, β, beta, y, λ) = derivative(t, β-eps(), beta, y, λ)*derivative(t, β+eps(), beta, y, λ) < 0
-
-@inline function costfun(t, βt, beta, y, λ)
-    retval = (y[t]-βt)^2
-    if t != length(beta)
-        retval += λ*abs(beta[t+1]-βt)
-    end
-    if t != 1
-        retval += λ*abs(beta[t-1]-βt)
-    end
-    retval
-end
-
-@inline @inbounds function switch(p,i1,i2)
-    temp = p[i1]
-    p[i1] = p[i2]
-    p[i2] = temp
-end
-
-fastsort!(points::Number) = points
-# @inline fastsort!(points) = fastsort2!(points)
-
-@inbounds function fastsort3!(points) # Tested
-    if points[3] < points[2]
-        if points[3] < points[1]
-            switch(points,1,3)
-            if points[3] < points[2]
-                switch(points,2,3)
-            end
-            return points
-        else
-            switch(points,2,3)
-            return points # Done
-        end
-    end
-    if points[2] < points[1]
-        switch(points,1,2)
-    end
-    if points[3] < points[2]
-        switch(points,2,3)
-    end
-    return points
-end
-
-@inbounds function fastsort!(points)
-    if points[2] < points[1]
-        switch(points,2,1)
-    end
-    return points
-end
-
-function findsmallest(t, y, beta, λ)
-    t == 1 && (return beta[t+1])
-    t == length(beta) && (return beta[t-1])
-    return costfun(t, beta[t-1], beta, y, λ) < costfun(t, beta[t+1], beta, y, λ) ? beta[t-1] : beta[t+1]
-end
-function updateβ(t, points, derfun, y, beta, λ)
-    fastsort!(points)
-    d = derfun(t, points[1], beta, y, λ)
-    d > 0 && (return updateβ(t, [points[1]-1, points[1]], derfun, y, beta, λ))
-    abs(d) < 1e-10 || signchange(t, points[1], beta, y, λ) && (return points[1])
-
-    d = derfun(t, points[2], beta, y, λ)
-    if abs(d) < 1e-10 || signchange(t, points[2], beta, y, λ) # Minimum is at discontinuity
-        return points[2]
-    end
-    d = derfun(t, points[2]-eps(), beta, y, λ)
-    if d > 0 # We have passed the minimum and interpolate to find zero
-        width = points[2]-points[1]
-        dlast = derfun(t, points[1]+eps(), beta, y, λ)
-        height = d-dlast
-        return points[1] - dlast*width/height # dlast is negative
-    end
-
-    return updateβ(t, [points[2]+1, points[2]], derfun, y, beta, λ)
-end
-
-function fusedlasso_cd(y,λ; iters=20, λ_increase_factor = 2, kwargs...)
-    T           = size(y,1)
-    beta        = copy(y)
-    costs       = zeros(y)#fill(Inf,size(y))
-    lastcosts   = zeros(y)#fill(Inf,size(y))
-    costhistory = zeros(iters)
-    λvec = logspace(λ/100,λ,10)
-    # for (outer_iter, λ) = enumerate(λvec)
-    for i = 1:iters
-
-        # Descent cycle ========================================================
-        beta[1] = updateβ(1, [beta[1], beta[2]], derivative, y, beta, λ)
-        costs[1] = costfun(1, beta[1], beta, y, λ)
-        for t = 2:T-1 # Coordinate descent for each parameter
-            # if beta[t] == beta[t-1] || beta[t] == beta[t+1]
-            #     # do something else
-            #     continue
-            # end
-            beta[t] = updateβ(t, [beta[t-1], beta[t+1]], derivative, y, beta, λ)
-            costs[t] = costfun(t, beta[t], beta, y, λ)
-        end
-        beta[T] = updateβ(1, [beta[T-1], beta[T]], derivative, y, beta, λ)
-        costs[T] = costfun(T, beta[T], beta, y, λ)
-
-        # Fusion cycle =========================================================
-        for t = 3:T-1 # TODO: fix number 3
-            if costs[t] == lastcosts[t]
-                # println("Fusing t=", t)
-                beta[t] = beta[t-1] = updateβ(t, [beta[t-1], beta[t+1]], derivativeγ, y, beta, λ)
-            end
-        end
-        # Smoothing cycle ======================================================
-        # if any(costs .>= lastcosts)
-        #     continue # The smoothing cycle
-        # end
-        copy!(lastcosts, costs)
-        costhistory[i] = sum(costs)
-    end
-    # costhistory[outer_iter] = sum(costs)
-    # end
-    return beta,costhistory
-end
-
-function test_fusedlasso_cd(N; λ = 1, iters = 100)
-    y = [-10+randn(N); 10+randn(N)]
-
-    @time yh, cost = fusedlasso_cd(y,λ, iters = iters)
-    plot([y yh], layout=2, subplot=1)
-    plot!(cost, subplot=2, yscale=:log10, xscale=:log10)
-end
-
-
-function test_derivative()
-    y = randn(3)
-    beta = copy(y)
-    λ = 1
-    t = 2
-    betagrid = linspace(-3,3,2000)
-    derfun = derivative
-    derivs = [derfun(t, βt, beta, y, λ) for βt in betagrid]
-    costs = [costfun(t, βt, beta, y, λ) for βt in betagrid]
-    points = [beta[1], beta[3]]
-    newbeta = updateβ(t, points, derfun, y, beta, λ)
-    d = derfun(t, newbeta, beta, y, λ)
-    @assert abs(d) < 1e-9  || derfun(t, newbeta-1e-10, beta, y, λ)*derfun(t, newbeta+1e-10, beta, y, λ) < 0
-
-    plot(betagrid,[derivs costs])
-    plot!(points'.*ones(2), [-5, 5])
-    plot!(newbeta*ones(2), [-5, 5], l=:dash)
-end
-
-function test_fit_statespace(lambda)
-    # Generate data
-    srand(1)
-    D        = 1
-    normType = 1
-    T_       = 400
-    n        = 2
-    At_      = [0.95 0.1; 0 0.95]
-    Bt_      = [0.2; 1]''
-    u        = randn(T_)
-    x        = zeros(T_,n)
-    for t = 1:T_-1
-        if t == 200
-            At_ = [0.5 0.05; 0 0.5]
-        end
-        x[t+1,:] = At_*x[t,:] + Bt_*u[t,:] + 0.2randn(n)
-    end
-    xm = x + 0.2randn(size(x));
-    At, Bt, cost, steps = fit_statespace_gd(xm,u,lambda, normType = normType, D = D, step=1e-4, momentum=0.992, iters=4000, reduction=0.1, adaptive=true);
-    y = predict(x,u,At,Bt);
-    e = x[2:end,:] - y;
-    println("RMS error: ",rms(e)^2)
-
-    plot(flatten(At), l=(2,:auto), xlabel="Time index", ylabel="Model coefficients")
-    plot!([1,199], [0.95 0.1; 0 0.95][:]'.*ones(2), ylims=(-0.1,1), l=(:dash,:black, 1))
-    plot!([200,400], [0.5 0.05; 0 0.5][:]'.*ones(2), l=(:dash,:black, 1), grid=false)
-    # # savetikz("figs/ss.tex", PyPlot.gcf())#, [" axis lines = middle,enlargelimits = true,"])
-    #
-    # plot(y, lab="Estimated state values", l=(:solid,), xlabel="Time index", ylabel="State value", grid=false, layout=2)
-    # plot!(x[2:end,:], lab="True state values", l=(:dash,))
-    # plot!(xm[2:end,:], lab="Measured state values", l=(:dot,))
-    # # savetikz("figs/states.tex", PyPlot.gcf())#, [" axis lines = middle,enlargelimits = true,"])
-
-    # plot([cost[1:end-1] steps], lab=["Cost" "Stepsize"],  xscale=:log10, yscale=:log10)
-
-
-    activation = segment(At,Bt)
-    changepoints = [findmax(activation)[2]]
-    Ai,Bi = fit_statespace_constrained(x,u,changepoints)
-
-    rms(e)
-end
-
-function Lcurve()
-    lambdas = logspace(-2,6,24)
-    errors = pmap(test_fit_statespace, lambdas)
-    plot(lambdas, errors, xscale=:log10, m=(:cross,))
-    errors, lambdas
-end
-
-
-# using Optim
-# using CatViews
-
-function fit_statespace_gd(x,u,lambda, initializer::Symbol=:kalman ;kwargs...)
-    y,A     = matrices(x,u)
-    T,n     = size(x)
-    m       = size(u,2)
-    T      -= 1
-    if initializer != :kalman
-        k = A\y
-        k = repmat(k',T,1)
-        k .+= 0.00001randn(size(k))
-    else
-        model = fit_model(KalmanModel, x[1:end-1,:],u[1:end-1,:],x[2:end,:],0.00001*eye(n^2+n*m),eye(n), false)
-        k = [flatten(model.A) flatten(model.B)]
-    end
-    fit_statespace_gd(x,u,lambda, k; kwargs...)
-end
-function fit_statespace_gd(x,u,lambda, k; normType = 1, D = 2, step=0.001, iters=10000, decay_rate=0.999, momentum=0.9, reduction=0, adaptive=true, kwargs...)
-    if reduction > 0
-        decay_rate = decayfun(iters, reduction)
-    end
-    y,A     = matrices(x,u)
-    nparams = size(A,2)
-    T,n     = size(x)
-    T      -= 1
-    m       = size(u,2)
-    bestk   = copy(k)
-    diff_fun = D == 2 ? x-> diff(diff(x,1),1) : x-> diff(x,1)
-    function lossfun(k2)
-        loss    = 0.
-        for i = 1:T
-            ii = (i-1)*n+1
-            ii2 = ii+n-1
-            loss += mean((y[ii:ii2,:] - A[ii:ii2,:]*k2[i,:]).^2)
-        end
-        NK = length(k2)
-        if normType == 1
-            loss += lambda/NK*sum( sqrt(sum(diff_fun(k2).^2, 2)) )
-        else
-            loss += lambda/NK*sum(diff_fun(k2).^2)
-        end
-        loss
-    end
-    loss_tape   = ReverseDiff.GradientTape(lossfun, (k,))
-    inputs      = (k,)
-    results     = (similar(k),)
-    all_results = map(DiffBase.GradientResult, results)
-    mom         = zeros(k)
-    bestcost    = lossfun(k)
-    costs       = Inf*ones(iters+1); costs[1] = bestcost
-    steps       = zeros(iters)
-    for iter = 1:iters
-        steps[iter] = step
-        ReverseDiff.gradient!(all_results, loss_tape, inputs)
-        mom .= step.*all_results[1].derivs[1] + momentum*mom
-        k .-= mom
-        costs[iter+1] = lossfun(k) #all_results[1].value
-        iter % 100 == 0 && println("Iteration: ", iter, " cost: ", round(costs[iter],6), " stepsize: ", step)
-        if costs[iter+1] <= bestcost
-            # bestk .= k
-            bestcost = costs[iter+1]
-            step *= (adaptive ? 10^(1/200) : decay_rate)
-        elseif iter > 10 && costs[iter+1] >= costs[iter-10]
-            step /= (adaptive ? 10^(1/200) : decay_rate)
-            # k .= 0.5.*bestk .+ 0.5.*k; mom .*= 0
-        end
-    end
-    At = reshape(k[:,1:n^2]',n,n,T)
-    At = permutedims(At, [2,1,3])
-    Bt = reshape(k[:,n^2+1:end]',m,n,T)
-    Bt = permutedims(Bt, [2,1,3])
-    At,Bt,costs, steps
-end
-
-function fit_statespace_constrained(x,u,changepoints::AbstractVector)
-    y,A           = matrices(x,u)
-    n             = size(x,2)
-    m             = size(u,2)
-    nc            = length(changepoints)
-    changepointse = [1; changepoints; size(x,1)-1]
-    Ai            = zeros(n,n,nc+1)
-    Bi            = zeros(n,m,nc+1)
-    for i = 1:nc+1
-        ii = (changepointse[i]-1)*n+1
-        ii2 = changepointse[i+1]*n-1
-        inds = ii:ii2
-        k = A[inds,:]\y[inds]
-        Ai[:,:,i] = reshape(k[1:n^2]',n,n)'
-        Bi[:,:,i] = reshape(k[n^2+1:end]',m,n)'
-    end
-    return Ai,Bi
-end
-
-
-
-function predict(x,u,At,Bt)
-    n,T = size(At,2,3)
-    y   = zeros(T,n)
-    for i = 1:T
-        y[i,:] = At[:,:,i]*x[i,:] + Bt[:,:,i]*u[i,:]
-    end
-    y
-end
-
-
-segment(res) = segment(res...)
-function segment(At,Bt, args...)
-    diffparams = (diff([flatten(At) flatten(Bt)],1)).^2
-    # diffparams .-= minimum(diffparams,1)
-    # diffparams ./= maximum(diffparams,1)
-    activation = sqrt.(sum(diffparams,2)[:])
-    activation
-end
-
-function segmentplot(activation, state; filterlength=10, doplot=false, kwargs...)
-    plot(activation, lab="Activation")
-    ds = diff(state)
-    ma = mean(activation)
-    ds[ds.==0] = NaN
-    segments = findsegments(activation; filterlength=filterlength, doplot=doplot, kwargs...)
-    doplot || scatter!(segments,[3ma], lab="Automatic segments", m=(10,:xcross))
-    scatter!(2ma*ds, lab="Manual segments", xlabel="Time index", m=(10,:cross))
-end
-
-# filterlength=2; minh=5; threshold=-Inf; minw=18; maxw=Inf; doplot=true
-# plot(activationf)
-# scatter!(peaks,activationf[peaks])
-function findsegments(activation; filterlength=10, minh=0, threshold=-Inf, minw=0, maxw=Inf, doplot=false)
-    activationf = filtfilt(ones(filterlength),[filterlength],activation)
-    doplot && plot!(activationf, lab="Filtered Activation")
-
-    peaks = allpeaks(activationf)
-    doplot &&  scatter!(peaks, activationf[peaks], lab="All peaks")
-
-    peaks = peaks[activationf[peaks] .>= minh] # Remove too small peaks
-    doplot &&  scatter!(peaks, activationf[peaks]*1.1, lab="Below minh removed")
-
-    peaks = remove_threshold(peaks,activationf,threshold)
-    peaks = remove_width(peaks,activationf,minw,maxw)
-    doplot &&  scatter!(peaks, activationf[peaks]*1.2, lab="Too wide or narrow removed" , m=(10,:xcross))
-    return peaks
-end
-
-function remove_threshold(peaks,activation,threshold)
-    base = max.(activation[peaks-1],activation[peaks+1])
-    peaks = peaks[activation[peaks]-base .>= threshold]
-end
-
-function remove_width(peaks,activation,minw,maxw)
-    if minw <= 0 && maxw == Inf
-        return peaks # Nothing needs to be done
-    end
-    # widths = diff(peaks)
-    i = 1
-    while i < length(peaks)
-        speaks = sortperm(activation[peaks], rev=true) # sorted peak locations (in peaks vector)
-
-        if speaks[i] > 1
-            wl = abs(peaks[speaks[i]] - peaks[speaks[i]-1]) # highest peak location - location to the left
-            if wl < minw || wl > maxw
-                deleteat!(peaks,speaks[i]-1)
-                continue
-            end
-        end
-        if speaks[i] < length(peaks)
-            wr = abs(peaks[speaks[i]] - peaks[speaks[i]+1])
-            if wr < minw || wr > maxw
-                deleteat!(peaks,speaks[i]+1)
-                continue
-            end
-        end
-
-        i += 1
-
-    end
-    @show peaks
-    @assert minimum(diff(peaks)) >= minw
-    peaks
-end
-
-function allpeaks(activation)
-    da = diff(activation)
-    peaks = find((diff(da) .< 0) & (da[1:end-1] .> 0) & (da[2:end] .< 0)) + 1
-end
diff --git a/jump_lin_id/id.jl b/jump_lin_id/id.jl
deleted file mode 100644
index d5714b805814f091fd5bd806859bee23423c0217..0000000000000000000000000000000000000000
--- a/jump_lin_id/id.jl
+++ /dev/null
@@ -1,68 +0,0 @@
-# This script produces the simple MA example plot
-
-using ControlSystems
-using Convex
-using SCS
-default(reuse=false)
-N  = 200
-m  = 3
-n  = 3
-B1 = randn(m)
-A1 = randn(n)
-
-B2 = randn(m)
-A2 = randn(n)
-u1 = randn(N)
-u2 = randn(N)
-u  = [u1;u2]
-
-y1 = filt(B1,[1],u1)
-y2 = filt(B2,[1],u2)
-y  = [y1;y2]
-
-function toeplitz{T}(c::Array{T},r::Array{T})
-    nc = length(c)
-    nr = length(r)
-    A = zeros(T, nc, nr)
-    A[:,1] = c
-    A[1,:] = r
-    @views for i in 2:nr
-        A[2:end,i] = A[1:end-1,i-1]
-    end
-    A
-end
-
-A  = toeplitz(u,u[1:m])
-BB = [repmat(B1',N,1);repmat(B2',N,1)]
-yo = sum(A.*BB,2)
-yo .+= 0.01randn(size(yo))
-Nn = size(A,1)
-# plot(yo); gui()
-
-
-x = Variable(Nn,m)
-loss = sumsquares(yo - sum(A.*x,2))
-
-for i = 1:Nn-1
-    loss += 10*sum(abs(x[i,:]-x[i+1,:]))
-end
-# loss += 0.01*sumsquares(x)
-
-problem = minimize(loss)
-
-solve!(problem, SCSSolver(max_iters=100000, normalize=0, eps=1e-6))
-# Solve the problem by calling solve!
-plot(x.value, c=[:red :green :blue])
-plot!(1:N, [B1';B1'], c=[:red :green :blue], l=:dash)
-plot!(N+1:2N, [B2';B2'], c=[:red :green :blue], l=:dash)
-plot!(legend=false, grid=false, xlabel="Time index", ylabel="Model coefficients")
-gui()
-# savetikz("figs/discont.tex", PyPlot.gcf())#, [" axis lines = middle,enlargelimits = true,"])
-
-# plot([yo sum(A.*x.value,2)], lab=["y" "yhat"])
-# gui()
-# Check the status of the problem
-problem.status # :Optimal, :Infeasible, :Unbounded etc.
-
-# Get the optimal value
-problem.optval
diff --git a/jump_lin_id/id_paper.tex b/jump_lin_id/id_paper.tex
index f96bf1a8b25672b37f41f9cc091e40c82173e1b9..1abf2058b63ee0874e4219f91aceb608d5a317ac 100644
--- a/jump_lin_id/id_paper.tex
+++ b/jump_lin_id/id_paper.tex
@@ -22,7 +22,14 @@
 %\usepackage[utf8x]{inputenc}
 %\usepackage{multirow}
 %\usepackage{natbib}
-\usepackage{url}
+\usepackage[hidelinks]{hyperref}
+\hypersetup{
+    colorlinks,
+    linkcolor={black},
+    citecolor={black},
+    urlcolor={blue!90!black}
+}
+\urlstyle{tt}
 \usepackage{listings}
 %\usepackage[]{algorithm}
 \usepackage{algpseudocode}
@@ -115,7 +122,7 @@ Identification of LTV Dynamical Models with\\ Smooth or Discontinuous Time Evolu
 
 \author{
 \centering Fredrik Bagge Carlson* \quad Anders Robertsson \quad Rolf Johansson
-\thanks{*Open-source implementations of all presented methods and examples marked with a star ($\star$) in this paper are made available at \url{github.com/baggepinnen/LTVModels.jl}. The reported research was supported by the European Commission under the Framework Programme Horizon 2020 under grant agreement 644938 SARAFun. The authors are members of the LCCC Linnaeus Center and the eLLIIT Excellence Center at Lund University.\protect\\
+\thanks{*Open-source implementations of all presented methods and examples marked with a star ($\star$) in this paper are made available at \href{https://github.com/baggepinnen/LTVModels.jl}{github.com/baggepinnen/LTVModels.jl}. The reported research was supported by the European Commission under the Framework Programme Horizon 2020 under grant agreement 644938 SARAFun. The authors are members of the LCCC Linnaeus Center and the eLLIIT Excellence Center at Lund University.\protect\\
 Lund University, Dept Automatic Control, PO Box 118\protect\\
 SE22100 Lund Sweden\protect\\
 {Fredrik.Bagge\_Carlson@control.lth.se}}
@@ -231,7 +238,7 @@ This optimization problem has a closed form solution\footnote{The computational
 where $\tilde{\A}$ and $\tilde{y}$ are appropriately constructed matrices and the second-order differentiation operator matrix $D_2$ is constructed such that $\lambda\normt{D_2 \tilde{\w}}^2$ equals the second term in~\labelcref{eq:smooth}.
 \Cref{eq:smooth} is the negative data log-likelihood of a Brownian random-walk parameter model, for which a Kalman smoother also returns the optimal solution; see~\cref{sec:kalmanmodel} for details and extensions.
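+As a minimal sketch (variable names chosen for illustration only), the closed-form
+solution can be evaluated directly from the corresponding normal equations once
+$\tilde{\A}$, $\tilde{y}$ and $D_2$ have been formed:
+\begin{lstlisting}
+# Atilde, ytilde, D2 and lambda assumed given
+w = (Atilde'*Atilde + lambda*(D2'*D2)) \ (Atilde'*ytilde)
+\end{lstlisting}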
 
-This identification method is a reasonable first choice if no particular knowledge regarding the dynamics evolution is available, or when the dynamics are known to vary slowly. It is also by far the fastest of the proposed methods due to the Kalman-filter implementation of~\cref{sec:kalmanmodel}.\footnote{The Kalman-filter implementation is often several orders of magnitude faster than solving the optimization problems with an iterative solver.} Example use cases include when dynamics are changing with a continuous auxiliary variable, such as temperature, altitude or velocity. If a smooth parameter drift is found to correlate with an auxiliary variable, LPV-methodology can be employed to model the dependence explicitly.
+
 
 \subsection{Piecewise constant time evolution $\star$}\label{sec:pwconstant}
 In the presence of discontinuous or abrupt changes in the dynamics, estimation method~\labelcref{eq:smooth} might perform poorly. A signal which is mostly flat, with a low number of distinct level changes, is characterized by a \emph{sparse first-order time difference}. To detect sudden changes in dynamics, we thus formulate and solve the optimization problem
@@ -405,7 +412,8 @@ $K(T-1)$. The matrix $\tilde{\A}\T \tilde{\A} + D\T D$
 will thus be full rank whenever $\tilde{\A}$ has rank
 $2K$ or more, with a basis spanning the $2K$-dimensional null
 space of $D$. This will be the case for any non-zero
-state- and input sequences off sufficient length.
+state and input sequences of sufficient length, provided that the input, $u$,
+is persistently exciting of sufficient order~\cite{johansson1993system}.
 
 
 \section{Example $\star$}
@@ -447,7 +455,7 @@ to $$A_t = \left[
 $$
 occurred at $t=200$.
 The input was Gaussian noise of zero mean and unit variance; state-transition noise and measurement noise of zero mean and $\sigma = 0.2$ were added. The generated and filtered signals are shown in~\cref{fig:states}.
-\Cref{fig:ss} depicts the estimated coefficients in the dynamics matrices for a value of $\lambda$ chosen using the L-curve method~\cite{someone}.
+\Cref{fig:ss} depicts the estimated coefficients in the dynamics matrices for a value of $\lambda$ chosen using the L-curve method~\cite{hansen1994regularization}.
 \begin{figure}
     \centering
     \setlength{\figurewidth}{0.55\linewidth}
@@ -472,7 +480,7 @@ The input was Gaussian noise of zero mean and unit variance, state transition no
 
 \section{Applications}
 \subsection{Reinforcement learning $\star$}
-In this example, we use the proposed methods to identify LTV dynamics models for reinforcement learning. The task is to stabilize a pendulum attached to a moving cart by means of moving the cart, with bounds on the control signal and a quadratic cost on states and control. To this end, we perform a series of experiments, whereafter each we 1) fit a dynamics model, 2) optimize the cost function under the model using differential dynamic programming,\footnote{Implementation made available at \url{github.com/baggepinnen/DifferentialDynamicProgramming.jl}} with bounds on the deviation between the new trajectory and the last trajectory in order to stay close to the validity region of the model. We compare three different models, the ground truth system model, an LTV model (\labelcref{eq:smooth}) and an LTI model. The total cost over $T=500$ time steps is shown as a function of learning iteration in~\cref{fig:ilc}. The figure illustrates how the learning procedure reaches the optimal cost of the ground truth model when an LTV model is used, whereas when using an LTI model, the learning diverges. The figure further illustrates that if the LTV model is fit using a prior (\cref{sec:kalmanmodel}), the learning speed is increased.
+In this example, we use the proposed methods to identify LTV dynamics models for reinforcement learning. The task is to stabilize a pendulum attached to a moving cart by moving the cart, with bounds on the control signal and a quadratic cost on states and control. To this end, we perform a series of experiments, after each of which we 1) fit a dynamics model and 2) optimize the cost function under the model using differential dynamic programming,\footnote{Implementation made available at \href{https://github.com/baggepinnen/DifferentialDynamicProgramming.jl}{github.com/baggepinnen/DifferentialDynamicProgramming.jl}} with bounds on the deviation between the new trajectory and the last trajectory in order to stay close to the validity region of the model. We compare three different models: the ground-truth system model, an LTV model (\labelcref{eq:smooth}) and an LTI model. The total cost over $T=500$ time steps is shown as a function of learning iteration in~\cref{fig:ilc}. The figure illustrates how the learning procedure reaches the optimal cost of the ground-truth model when an LTV model is used, whereas with an LTI model the learning diverges. The figure further illustrates that if the LTV model is fit using a prior (\cref{sec:kalmanmodel}), the learning speed is increased.
 
 \cmt{Mention where prior came from. For what noise level does method break down with/without prior? Fix NN implementation and get prior from there.}
 
@@ -511,9 +519,46 @@ The application we consider is the mating of two plastic pieces, described in de
 \cmt{Maybe include an example with a disturbance}
 
 \section{Discussion}
-All terms $\lambda\normt{\Delta k}^2 = (\Delta k)\T (\lambda I) (\Delta k)$
-can be generalized to $(\Delta k)\T \Lambda (\Delta k)$
-where $\Lambda$ is an arbitrary positive definite matrix. This allows incorporation of different scales for different variables.
+
+This article focuses on state-space models, where the state sequence
+is assumed to be known. This may seem like an overly optimistic setting at first,
+but all methods and algorithms discussed apply equally well to the input-output
+setting, where only inputs and outputs are available. Indeed, any
+state-space system
+\begin{align}
+    x_{t+1} &= Ax_t + Bu_t\\
+    y_t &= Cx_t + Du_t
+\end{align}
+can be interpreted as a discrete-time transfer function in the
+$z$-domain,
+\begin{equation}
+    \dfrac{Y(z)}{U(z)} = \big(C(zI-A)^{-1}B + D\big) = \dfrac{B(z)}{A(z)}.
+\end{equation}
+The problem of estimating the coefficients in the polynomials $B$ and $A$ remains
+linear given the input-output sequences $u$ and $y$. Extension to time-varying
+dynamics is straightforward.
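+As an illustration (with model orders $n$ and $m$ assumed known), the corresponding
+difference-equation (ARX) form is a linear regression,
+\begin{equation}
+    y_t = -\sum_{i=1}^{n} a_i y_{t-i} + \sum_{j=0}^{m} b_j u_{t-j} = \varphi_t\T \theta,
+\end{equation}
+where $\varphi_t$ collects delayed outputs and inputs and $\theta$ collects the
+coefficients of $A(z)$ and $B(z)$; letting $\theta$ vary with $t$, the time-evolution
+penalties considered in this article carry over unchanged.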
+
+When faced with a system where time-varying dynamics are suspected and no particular
+knowledge regarding the dynamics evolution is available, or when the dynamics are known to vary slowly,
+a reasonable first choice of algorithm is~\labelcref{eq:smooth}. It is also by
+far the fastest of the proposed methods due to the Kalman-filter implementation
+of~\cref{sec:kalmanmodel}.\footnote{The Kalman-filter implementation is often
+several orders of magnitude faster than solving the optimization problems with
+an iterative solver.} Example use cases include when dynamics are changing with a
+continuous auxiliary variable, such as temperature, altitude or velocity.
+If a smooth parameter drift is found to correlate with an auxiliary variable,
+LPV-methodology can be employed to model the dependence explicitly.
+
+Identification of systems known to contain gain scheduling may benefit from formulation~\labelcref{eq:pwlinear}.
+Gain scheduling is usually implemented with overlapping regions, in between which linear interpolation is used.
+
+
+For simplicity, the regularization weights were kept as simple scalars in this article.
+However, all terms $\lambda\normt{\Delta k}^2 = (\Delta k)\T (\lambda I) (\Delta k)$
+can be generalized to $(\Delta k)\T \Lambda (\Delta k)$, where $\Lambda$ is an arbitrary positive definite matrix.
+This allows incorporation of different scales for different variables with little added implementation complexity.
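+For instance (a hypothetical choice, for illustration), $\Lambda = \operatorname{diag}(\lambda_A I_{n^2},\, \lambda_B I_{nm})$ penalizes changes in the coefficients of the dynamics matrix and the input matrix with separate weights $\lambda_A$ and $\lambda_B$.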
+
+
 
 % \printbibliography
 \bibliography{bibtexfile}{}