# adaptive_width.jl
# include(Pkg.dir("ExperimentalAnalysis","src","ExperimentalAnalysis.jl"))
using TensorFlow, DeterministicPolicyGradient, ValueHistories, JLD, DataFrames
using Plots
default(size=(1600,900), show=false, legend=false, lab="")
const input_size  = 1                            # scalar input
const output_size = 1                            # scalar output
const neurons     = [200,200]                    # hidden-layer widths
const dir         = "adaptive_width_experiments" # directory for saved results
const N_burnin    = 2000                         # epochs on the initial data distribution
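
# Glorot-style uniform initializer: entries drawn from U(-s, s) with s = √(6/sum(dims))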
function fanin(T,dims...)
    s = √(6/sum(dims))
    2s*rand(T,dims...) - s
end
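
# He-style initializer for relu layers: entries drawn from N(0, 2/fan_in)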
function fanin2(T,dims...)
    s = √(2/dims[1])
    s*randn(T,dims...)
end
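
# Build a 1-200-200-1 relu network and train it in two phases: first on a narrow
# input distribution, then on a wider one. An L1 penalty on a slice (the "gate")
# of W1 or W3 (selected by gate12) encourages part of the network width to stay
# unused, presumably so that spare capacity remains when the distribution widens.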
function runstuff(stepsize=2e-4, gate_ratio=0.5, gate12=2, gate_cost=0.001, weight_cost=0.000001, plotflag=true, verbose=false)
    gate_size = round(Int, gate_ratio*neurons) # alternative: neurons .÷ 2
    session = Session(Graph())
    # TODO: check if these must be moved outside the function because they cause allocations
    x_ = placeholder(Float32, shape=[-1,input_size])
    y_ = placeholder(Float32, shape=[-1,output_size])
    W1 = Variable(fanin2(Float32, input_size, neurons[1]),  name="weights1")
    W2 = Variable(fanin2(Float32, neurons[1], neurons[2]),  name="weights2")
    W3 = Variable(fanin2(Float32, neurons[2], output_size), name="weights3")
    B1 = Variable(0.002*rand(Float32,neurons[1])-0.001, name="bias1")
    B2 = Variable(0.002*rand(Float32,neurons[2])-0.001, name="bias2")
    B3 = Variable(zeros(Float32,output_size), name="bias3")
    l1 = x_*W1 + B1 |> nn.relu
    l2 = l1*W2 + B2 |> nn.relu
    q  = l2*W3 + B3
    # The gate is a slice of columns of W1 or rows of W3; penalizing it pushes
    # the corresponding hidden units towards being unused.
    # gate = [slice(l1,[0,0],[-1,gate_size[1]-1]), slice(l2,[0,0],[-1,gate_size[2]-1])] # for l1, l2
    gate = [slice(W1,[0,0],[-1,gate_size[1]-1]), slice(W3,[0,0],[gate_size[2]-1,-1])]   # for W1, W3
    loss = reduce_mean((y_ - q).^2)
    weight_decay = weight_cost*(reduce_sum(W1.^2) + reduce_sum(W2.^2) + reduce_sum(W3.^2))
    gate_penalty = gate_cost*reduce_sum(abs(gate[gate12]))/loss # TODO: divided by loss
    cost = loss + weight_decay + gate_penalty
    train_step = train.minimize(train.AdamOptimizer(stepsize), cost)
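    # Piecewise target: quadratic x + x^2 - 5 for x < 3, linear -x for x >= 3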
    fun(x) = (x + x^2 - 5)*(x < 3) - x*(x >= 3)
    N_initial = 50
    initial_x = 6rand(N_initial)-3 |> sort # 50 points on [-3, 3]
    initial_x = initial_x''                # '' turns the vector into an N×1 matrix
    initial_y = fun.(initial_x)
    N_second = 100
    second_x = 9rand(N_second)-3 |> sort   # 100 points on the wider interval [-3, 6]
    second_x = second_x''
    second_y = fun.(second_x)
    if plotflag
        gr()
        fig = scatter(second_x, second_y, layout=(1,4))
    end
    run(session, initialize_all_variables())
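    # vh tracks the loss on the initial data, vh2 the loss on the shifted data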
    vh  = History(Float32)
    vh2 = History(Float32)
    # Train using the initial data distribution
    i = 0 # declared outside the loop so the second training phase continues the epoch count
    for i in 1:N_burnin
        yh,l,_,L1,L2 = run(session, [q,loss,train_step,l1,l2], Dict(y_ => initial_y, x_ => initial_x))
        push!(vh,i,l)
        verbose && println("Epoch: ", i, " loss: ", l)
        plotflag || continue
        scatter!(second_x, second_y, c=:blue, subplot=1)
        scatter!(initial_x, yh, c=:red, subplot=1)
        scatter!(vh, yscale=:log10, subplot=2)
        update_plot!(fig[1], max_history=4)
        update_plot!(fig[2], max_history=1)
        if i % 5 == 0
            heatmap!(L1, subplot=3)
            heatmap!(L2, subplot=4)
            update_plot!(fig[3], max_history=1)
            update_plot!(fig[4], max_history=1)
        end
        # sleep(0.01)
        gui(fig)
    end
    # Train using the modified (wider) data distribution
    for i in i+1:i+1000
        yh,second_l,_,L1,L2 = run(session, [q,loss,train_step,l1,l2], Dict(y_ => second_y, x_ => second_x))
        initial_l = run(session, loss, Dict(y_ => initial_y, x_ => initial_x))
        verbose && println("Epoch: ", i, " loss: ", initial_l)
        push!(vh,i,initial_l)
        push!(vh2,i,second_l)
        plotflag || continue
        scatter!(second_x, second_y, subplot=1)
        scatter!(second_x, yh, c=:red, subplot=1)
        scatter!(vh, yscale=:log10, subplot=2)
        scatter!(vh2, yscale=:log10, subplot=2)
        update_plot!(fig[1], max_history=2)
        update_plot!(fig[2], max_history=2)
        if i % 5 == 0
            heatmap!(L1, subplot=3)
            heatmap!(L2, subplot=4)
            update_plot!(fig[3], max_history=1)
            update_plot!(fig[4], max_history=1)
        end
        gui(fig)
    end
    # JLD's @save stores plain variables, so bind the history values to names first
    vh_values, vh2_values = vh.values, vh2.values
    @save(joinpath(dir,"adaptive_width_experiment$(length(readdir(dir))).jld"), stepsize, vh_values, vh2_values, gate_ratio, gate12, gate_cost, weight_cost)
    vh, vh2, gate_ratio, gate12, gate_cost
end
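
# Random hyperparameter search: draw stepsize, gate_ratio, gate12, gate_cost and
# weight_cost at random and run the experiment with plotting disabled.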
function runmany(howmany)
    stepsizes    = logspace(-2.5,-1,20)
    gate_costs   = logspace(-6,5,20)
    weight_costs = logspace(-7,1,20)
    for i = 1:howmany
        stepsize    = stepsizes[rand(1:length(stepsizes))]
        gate_ratio  = 0.8rand()
        gate12      = rand(1:2)
        gate_cost   = gate_costs[rand(1:length(gate_costs))]
        weight_cost = weight_costs[rand(1:length(weight_costs))]
        runstuff(stepsize, gate_ratio, gate12, gate_cost, weight_cost, false)
    end
end
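
# Example usage (a sketch; assumes the packages above are installed and that the
# results directory exists, since the save path counts files in it):
# isdir(dir) || mkdir(dir)
# runstuff()    # one run with default hyperparameters and live plots
# runmany(100)  # 100 random hyperparameter configurations, no plotting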