Machine learning in Julia: Digit recognizer
Published:
In this blog article I will process the CSV files from the Kaggle “Digit recognizer” competition. The first challenge is to wrangle the data from the CSV file into a format that can be used as input for a Flux.jl neural network. The neural network is the LeNet example from the Flux.jl website, and it seems to work quite well.
The overall conclusion is that machine learning on image data is fairly straightforward using Julia. If you have a supported GPU (for example an Nvidia card with CUDA), it is straightforward to load and run the neural network on the GPU. You add “|> gpu” to move data or a model to the GPU and “|> cpu” to bring it back to the CPU. Not all operations are supported on the GPU, so if you want to do things like indexing on the data, you should first move the data back to the CPU.
using CSV,DataFrames, Flux, JLD2, CUDA, MLUtils, Statistics
"""
    Dataset(x, y)

A labelled image set: `x` holds the images as a 4-dimensional `Float64` array and
`y` holds one integer label per image. The default constructor converts its
arguments to the declared field types.
"""
struct Dataset
    x::Array{Float64,4}
    y::Vector{Int64}
end
# Location of the Kaggle "Digit recognizer" files.
# A raw string is required here: in an ordinary string literal sequences such as
# `\G` or `\D` are invalid escapes (parse error) and `\t` would silently become a TAB.
data_dir = raw"C:\Git\juliacode\Data\digit-recognizer"
train_file = joinpath(data_dir, "train.csv")    # labelled training images
test_file = joinpath(data_dir, "test.csv")      # unlabelled images to classify
output_file = joinpath(data_dir, "output.csv")  # predictions written for submission
# Read the labelled CSV `file` and return `(train, test)` as two `Dataset`s,
# with the first 80 % of the observations in `train` and the rest in `test`.
# Column 1 of the CSV is taken as the label, columns 2:785 as the 784 pixel values.
function load_train_data(file)
    table = Tables.matrix(CSV.File(file))
    # splitobs is given the transposed matrix so that each CSV row is one observation
    train_part, test_part = splitobs(table', at=0.8)

    # Convert a block of CSV rows (one image per row) into a Dataset.
    function to_dataset(rows)
        labels = rows[:, 1]
        pixels = rows[:, 2:785] ./ 256
        # the input data for the Flux.jl network should have the dimensions
        # (28, 28, 1, number_of_rows); after permutedims every column is one image,
        # so reshaping fills each 28x28 slice exactly as reshape(row, 28, 28) would
        images = reshape(permutedims(pixels), 28, 28, 1, :)
        return Dataset(images, labels)
    end

    return to_dataset(train_part'), to_dataset(test_part')
end
# Read the unlabelled CSV `file` (every column is a pixel value) and return the
# images as an array of size (28, 28, 1, number_of_rows), scaled by 1/256.
function load_kaggle_test_data(file)
    pixels = Tables.matrix(CSV.File(file)) ./ 256
    # after permutedims every column is one image, so reshaping fills each
    # 28x28 slice exactly as reshape(row, 28, 28) would for that row
    return reshape(permutedims(pixels), 28, 28, 1, :)
end
# Load the labelled data once; data_train is used for fitting, data_test for validation.
(data_train, data_test) = load_train_data(train_file);
After loading the data from the CSV file, we define a neural network that we will train with the MNIST data. In Python there are several solutions for neural networks, such as PyTorch and TensorFlow. In Julia the most commonly used package for neural networks is Flux.jl. We first create a loader that can load batches of data for training the neural network. I also print one image to validate that the data wrangling from CSV file to image was successful.
# Build a shuffled mini-batch iterator over `data` (defaults to the global
# `data_train`) and pass it through `gpu`. Labels are one-hot encoded over 0:9.
function loader(data=data_train; batchsize::Int=64)
    onehot_labels = Flux.onehotbatch(data.y, 0:9)  # OneHotMatrix, one column per image
    batches = Flux.DataLoader((data.x, onehot_labels); batchsize=batchsize, shuffle=true)
    return gpu(batches)
end
# Grab one mini-batch to sanity-check the CSV -> image conversion.
x1, y1 = first(loader());
println(size(x1))

using ImageCore, ImageInTerminal

#let's first print the value the image represents and then show the image
imageY = cpu(y1)[:, 1]
# onecold returns the 1-based position of the hot entry; digits start at 0
println(Flux.onecold(imageY) - 1)

# first image of the batch, shown transposed as a grayscale picture
image = x1[:, :, 1, 1]
image .|> Gray |> transpose |> cpu
(28, 28, 1, 64)
0
#define the Flux.jl network
# LeNet-style CNN; the sizes in the comments assume 28x28x1 input images.
lenet = gpu(Chain(
    Conv((5, 5), 1 => 6, relu),     # 28x28x1 -> 24x24x6
    MaxPool((2, 2)),                #         -> 12x12x6
    Conv((5, 5), 6 => 16, relu),    #         -> 8x8x16
    MaxPool((2, 2)),                #         -> 4x4x16
    Flux.flatten,                   #         -> 256 features per image
    Dense(256 => 120, relu),
    Dense(120 => 84, relu),
    Dense(84 => 10),                # one logit per digit 0:9
))
Chain(
Conv((5, 5), 1 => 6, relu), # 156 parameters
MaxPool((2, 2)),
Conv((5, 5), 6 => 16, relu), # 2_416 parameters
MaxPool((2, 2)),
Flux.flatten,
Dense(256 => 120, relu), # 30_840 parameters
Dense(120 => 84, relu), # 10_164 parameters
Dense(84 => 10), # 850 parameters
) # Total: 10 arrays, 44_426 parameters, 2.086 KiB.
# Evaluate `model` on all of `data` (defaults to the global `data_train`) in one
# batch and return `(; loss, acc)`: the logit cross-entropy and the accuracy in
# percent, rounded to two digits.
function loss_and_accuracy(model, data=data_train)
    n_obs = size(data.x, 4)
    # a batch as large as the data set means the loader yields exactly one element
    inputs, targets = only(loader(data, batchsize=n_obs))
    logits = model(inputs)
    loss = Flux.logitcrossentropy(logits, targets)
    hits = Flux.onecold(logits) .== Flux.onecold(targets)
    acc = round(100 * mean(hits); digits=2)
    return (; loss, acc)
end
@show loss_and_accuracy(lenet);
loss_and_accuracy(lenet) = (loss = 2.3034132f0, acc = 15.6)
# Hyper-parameters for training.
settings = (;
    eta = 3e-4,        # learning rate passed to Adam
    lambda = 1e-2,     # coefficient passed to WeightDecay
    batchsize = 128,
    epochs = 30,
)
# One NamedTuple of metrics is appended for every evaluated epoch.
train_log = NamedTuple[]

opt_rule = OptimiserChain(WeightDecay(settings.lambda), Adam(settings.eta))
opt_state = Flux.setup(opt_rule, lenet);

for epoch in 1:settings.epochs
    for (x, y) in loader(batchsize=settings.batchsize)
        grads = Flux.gradient(m -> Flux.logitcrossentropy(m(x), y), lenet)
        Flux.update!(opt_state, lenet, grads[1])
    end
    # Metrics are evaluated on epochs 1, 6, 11, ... and additionally on the last
    # epoch, so the final state of the model is always reported.
    if epoch % 5 == 1 || epoch == settings.epochs
        loss, acc = loss_and_accuracy(lenet)
        test_loss, test_acc = loss_and_accuracy(lenet, data_test)
        @info "logging:" epoch loss acc test_loss test_acc
        push!(train_log, (; epoch, loss, acc, test_loss, test_acc))
    end
end
#let's compare the output from the trained network with ground truth
# x1/y1 are the sample batch taken from the loader earlier; y1hat holds the logits for that batch
y1hat = lenet(x1)
# column 1: predicted digit, column 2: true digit (onecold maps the arg-max position onto 0:9)
@show hcat(Flux.onecold(y1hat, 0:9), Flux.onecold(y1, 0:9))
hcat(Flux.onecold(y1hat, 0:9), Flux.onecold(y1, 0:9)) = [0 0; 8 8; 5 5; 2 2; 4 4; 8 8; 2 2; 6 6; 7 7
; 9 9; 5 5; 5 5; 9 9; 1 1; 8 8; 9 4; 3 3; 5 5; 2 7; 8 8; 1 1; 9 9; 6 6; 5 5; 7 7; 4 4; 2 2; 2 2; 1 1
; 5 5; 7 7; 9 4; 0 0; 6 6; 4 4; 6 6; 6 6; 5 5; 0 0; 3 3; 4 4; 9 5; 8 8; 7 7; 7 7; 9 9; 8 2; 4 4; 6 6
; 4 4; 0 0; 3 3; 0 0; 1 1; 3 3; 4 4; 9 9; 8 8; 3 3; 9 9; 7 7; 5 5; 8 8; 5 5]
64×2 CUDA.CuArray{Int64, 2, CUDA.Mem.DeviceBuffer}:
0 0
8 8
5 5
2 2
4 4
8 8
2 2
6 6
7 7
9 9
⋮
4 4
9 9
8 8
3 3
9 9
7 7
5 5
8 8
5 5
In order to check whether our trained network is any good, we can have it process the test data provided by Kaggle and submit the estimated classification to the Kaggle website. For this we need to save the labels together with the image IDs in a CSV file.
#compute classification for Kaggle test set and save it to CSV file so it can be submitted to the Kaggle website
x_kaggle = load_kaggle_test_data(test_file);
# the images are converted to Float32 before being moved to the GPU
x_kaggle = gpu(Float32.(x_kaggle))
y_kaggle = lenet(x_kaggle);
# predicted digit per image, brought back to the CPU for the DataFrame
column2 = cpu(Flux.onecold(y_kaggle, 0:9))
# image ids are simply 1, 2, ..., number of predictions
column1 = collect(1:length(column2))
output = hcat(column1, column2)
df = DataFrame(output, ["ImageId", "Label"])
CSV.write(output_file, df)
"C:\Git\juliacode\Data\\digit-recognizer\output.csv"