When I try to train a simple ResNet on the CIFAR10 dataset, FastAI.jl seems very slow compared to plain Flux (≈ 9-19 times slower).
It seems it could be a garbage-collection problem: with Flux I can use a batch size of 512, while with FastAI I can't exceed 128 without hitting an out-of-memory error.
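As a first sanity check of the GC hypothesis (a minimal sketch, assuming CUDA.jl is installed), one can force a full collection and release cached device buffers before retrying a larger batch; if the OOM then goes away, unreclaimed Julia objects were holding on to GPU memory:

```julia
using CUDA

GC.gc(true)          # full collection of Julia-side objects
CUDA.reclaim()       # hand freed device buffers back to the CUDA pool
CUDA.memory_status() # print how much GPU memory is currently in use
```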
FastAI code:
```julia
using FastAI
using Flux
using ResNet9 # Pkg.add(url = "https://github.com/a-r-n-o-l-d/ResNet9.jl", rev = "v0.1.1")

data, blocks = loaddataset("cifar10", (Image, Label))
method = ImageClassificationSingle(blocks)
model = resnet9(inchannels = 3, nclasses = 10, dropout = 0.0)
learner = methodlearner(method, data;
                        lossfn = Flux.crossentropy,
                        callbacks = [ToGPU()],
                        batchsize = 16,
                        model = model,
                        optimizer = Descent())
@time fitonecycle!(learner, 5, 1f-3, pct_start = 0.5, divfinal = 100, div = 100)
```
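On Julia ≥ 1.8 the GC pauses themselves can be logged, which shows how often collection interrupts the FastAI training loop; a sketch (running a single epoch here is just for illustration):

```julia
GC.enable_logging(true)        # print a line for every GC pause (Julia ≥ 1.8)
fitonecycle!(learner, 1, 1f-3) # one epoch is enough to see the pattern
GC.enable_logging(false)
```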
Flux code:
```julia
using Flux
using Flux: DataLoader, onehotbatch
using Augmentor
using MLDatasets
using ParameterSchedulers
using ParameterSchedulers: Scheduler
using ResNet9 # Pkg.add(url = "https://github.com/a-r-n-o-l-d/ResNet9.jl", rev = "v0.1.1")

# Augmentor pipeline: RGB images -> WHC Float32 arrays.
normpip = SplitChannels() |> PermuteDims(3, 2, 1) |> ConvertEltype(Float32)

labels = CIFAR10.classnames() .|> Symbol

function datasets(batchsize)
    train = let
        x = CIFAR10.traintensor() |> CIFAR10.convert2image
        y = map(i -> labels[i + 1], CIFAR10.trainlabels())
        DataLoader((x, y), batchsize = batchsize, shuffle = true, partial = false)
    end
    test = let
        x = CIFAR10.testtensor() |> CIFAR10.convert2image
        y = map(i -> labels[i + 1], CIFAR10.testlabels())
        DataLoader((x, y), batchsize = batchsize)
    end
    train, test
end

# Turn a batch of images and labels into network-ready arrays.
function minibatch(x, y)
    h, w, n = size(x)
    xb = Array{Float32}(undef, w, h, 3, n)
    augmentbatch!(CPUThreads(), xb, x, normpip) # multi-threaded augmentation
    yb = onehotbatch(y, labels)
    xb, yb
end

# Uses the global `train` and `test` DataLoaders defined below.
function train!(model, optimiser, nepochs)
    loss_hist = []
    loss(x, y) = Flux.crossentropy(model(x), y)
    ps = params(model)
    for e in 1:nepochs
        # Training phase
        tloss = 0
        trainmode!(model)
        for (x, y) ∈ train
            x, y = minibatch(x, y) |> gpu
            gs = gradient(ps) do
                l = loss(x, y)
                tloss += l
                l
            end
            Flux.Optimise.update!(optimiser, ps, gs)
        end
        tloss /= length(train)
        # Validation phase
        testmode!(model)
        vloss = 0
        for (x, y) ∈ test
            x, y = minibatch(x, y) |> gpu
            vloss += loss(x, y)
        end
        vloss /= length(test)
        push!(loss_hist, (tloss, vloss))
    end
    loss_hist
end

train, test = datasets(16)
nepochs = 5
# Triangular (one-cycle-style) learning-rate schedule over all batches.
s = Triangle(λ0 = 1f-5, λ1 = 1f-3, period = nepochs * length(train))
opt = Scheduler(s, Descent())
model = resnet9(inchannels = 3, nclasses = 10, dropout = 0.0) |> gpu
@time train!(model, opt, nepochs)
```
Results on an RTX 2080 Ti:
FastAI:
```
1841.008685 seconds (3.92 G allocations: 212.561 GiB, 59.59% gc time, 0.00% compilation time)
```
Flux:
```
98.444806 seconds (106.49 M allocations: 16.643 GiB, 3.58% gc time, 2.58% compilation time)
```
Results on a Quadro P5000:
FastAI:
```
1574.714976 seconds (3.92 G allocations: 212.473 GiB, 11.08% gc time)
```
Flux:
```
177.416636 seconds (105.55 M allocations: 16.639 GiB, 2.05% gc time, 1.42% compilation time)
```
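For reference, the ratios behind the ≈ 9-19× figure follow directly from the timings above:

```julia
1841.008685 / 98.444806  # ≈ 18.7× slower on the RTX 2080 Ti
1574.714976 / 177.416636 # ≈ 8.9× slower on the Quadro P5000
```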