LSTM for character-level language detection

This model uses an LSTM for character-level language detection. Given a sentence of text, each character is fed into the model and the final output determines which of five languages the sentence was written in.

First, we run the following code to download a Wikipedia dataset.

using Cascadia, Gumbo, HTTP

# Wikipedia article titles to download, grouped by language code.
pages = Dict{Symbol,Vector{String}}(
    :en => [
        "Wikipedia",
        "Osama_bin_Laden_(elephant)",
        "List_of_lists_of_lists",
        "Josephine_Butler",
        "Canadian_football",
        "Judaism",
    ],
    :it => [
        "Wikipedia",
        "Ludovico_Einaudi",
        "Filosofia_della_scienza",
        "Pizza",
        "Effie_Gray",
        "Galeazzo_Maria_Sforza",
        "Ebraismo",
    ],
    :fr => [
        "Wikipedia",
        "Philosophie_des_sciences",
        "Seconde_Guerre_mondiale",
        "Eric_Hakonsson",
    ],
    :es => [
        "Wikipedia",
        "Chorizo",
        "Historia_de_Barcelona",
        "Espania",
        "Las_Vegas_Strip",
        "Judaismo",
    ],
    :da => [
        "Wikipedia",
        "H.C._Andersen",
        "L.A._Ring",
        "Jiangxi",
        "NATO",
        "Thomas_Edison",
        "Bangladesh",
    ],
)

"""
    rawpage(url)

Fetch `url` with `HTTP.get`, parse the response body as HTML, and return the
root node of the parsed document.
"""
function rawpage(url)
    response = HTTP.get(url)
    html = String(response.body)
    document = parsehtml(html)
    return document.root
end

"""
    content(url)

Return the text of every node matching the CSS selector `.mw-parser-output > p`
in the page at `url`, joined with newlines.
"""
function content(url)
    paragraphs = eachmatch(sel".mw-parser-output > p", rawpage(url))
    texts = [nodeText(node) for node in paragraphs]
    return join(texts, "\n")
end


# Download every listed article and write one corpus file per language.
# The output directory is created first so `open` does not fail when it is missing.
mkpath("corpus")

for (lang, ps) in pages
    open("corpus/$lang.txt", "w") do io
        for p in ps
            # NOTE(review): the host and path were lost from the original literal;
            # reconstructed as the per-language Wikipedia article URL — confirm.
            write(io, content("https://$(lang).wikipedia.org/wiki/$(p)"))
        end
    end
end

Now, we define the actual model and training code.

Load the necessary packages.

using Flux
using Flux: onehot, onehotbatch, logitcrossentropy, reset!, throttle
using Statistics: mean
using Random
using Unicode
using Parameters: @with_kw

Define the hyperparameters.

# Hyperparameters and data-derived sizes for the language-detection model.
# The struct is mutable because `get_processed_data` fills in `langs_len` and
# `alphabet_len` after the corpora have been read.
@with_kw mutable struct Args
    lr::Float64 = 1e-3     # learning rate
    N::Int = 15            # hidden size used by the Dense and LSTM layers
    test_len::Int = 100    # number of samples held out as test data
    langs_len::Int = 0     # number of languages in the corpora (set when loading data)
    alphabet_len::Int = 0  # number of characters in the alphabet (set when loading data)
    throttle::Int = 10     # timeout passed to `throttle` for the evaluation callback
end

"""
    get_processed_data(args)

Read every `<lang>.txt` file in the `corpus` directory, split it into sentences
on `.`, normalize each sentence (case-folded, diacritics stripped, surrounding
whitespace removed) and one-hot encode it character by character.

Sets `args.langs_len` and `args.alphabet_len` as a side effect.

Returns `(train, test)`: two vectors of `(sentence, language)` pairs, where the
last `args.test_len` elements of the shuffled dataset form `test`.
"""
function get_processed_data(args)
    corpora = Dict{Symbol,Vector{SubString{String}}}()

    for file in readdir("corpus")
        # Ignore entries that are not corpus files instead of failing on `nothing`.
        m = match(r"(.*)\.txt", file)
        m === nothing && continue
        lang = Symbol(m.captures[1])
        corpus = split(read(joinpath("corpus", file), String), ".")
        corpus = strip.(Unicode.normalize.(corpus, casefold=true, stripmark=true))
        corpus = filter(!isempty, corpus)
        corpora[lang] = corpus
    end

    langs = collect(keys(corpora))
    args.langs_len = length(langs)
    alphabet = ['a':'z'; '0':'9'; ' '; '\n'; '_']
    args.alphabet_len = length(alphabet)

    # Diagnostic only: characters outside `alphabet` are encoded as '_' below.
    # The scan is skipped unless debug logging is enabled.
    @debug "Characters represented as unknown" chars = unique(
        filter(x -> x ∉ alphabet, join(Iterators.flatten(values(corpora))))
    )

    dataset = [
        (onehotbatch(s, alphabet, '_'), onehot(l, langs)) for l in langs for s in corpora[l]
    ] |> shuffle

    train, test = dataset[1:end-args.test_len], dataset[end-args.test_len+1:end]
    return train, test
end

Build the model.

"""
    build_model(args)

Construct the two parts of the network and return them as `(scanner, encoder)`.

`scanner` is a `Chain` of a sigmoid `Dense` layer (`args.alphabet_len` inputs,
`args.N` outputs) followed by an `LSTM` of size `args.N`. `encoder` is a `Dense`
layer mapping `args.N` features to `args.langs_len` outputs.
"""
function build_model(args)
    scanner = Chain(Dense(args.alphabet_len, args.N, σ), LSTM(args.N, args.N))
    encoder = Dense(args.N, args.langs_len)
    return scanner, encoder
end
"""
    model(x, scanner, encoder)

Run `scanner` over the elements of `x` in order, keep the output for the last
element, reset the recurrent state of `scanner`, and return `encoder` applied to
that last output.
"""
function model(x, scanner, encoder)
    # NOTE(review): the argument of the broadcast was lost from the original text.
    # `x.data` assumes `x` is a one-hot matrix whose `data` field holds one
    # one-hot vector per character — confirm against the Flux version in use.
    state = scanner.(x.data)[end]
    # Clear the LSTM state so the next sentence starts fresh.
    reset!(scanner)
    return encoder(state)
end

Define the train function.

"""
    train(; kws...)

Load the corpora, build the model and train it for one pass over the training
data. Keyword arguments are forwarded to `Args` to override hyperparameters.

While training, the mean loss on the test data is shown at most once per
`args.throttle` timeout.
"""
function train(; kws...)
    # Initialize Hyperparameters
    args = Args(; kws...)
    # Load Data
    train_data, test_data = get_processed_data(args)

    @info("Constructing Model...")
    scanner, encoder = build_model(args)

    loss(x, y) = logitcrossentropy(model(x, scanner, encoder), y)
    testloss() = mean(loss(t...) for t in test_data)

    opt = ADAM(args.lr)
    ps = params(scanner, encoder)
    evalcb = () -> @show testloss()
    @info("Training...")
    Flux.train!(loss, ps, train_data, opt, cb = throttle(evalcb, args.throttle))
end

Train the model by calling `train()`. Output similar to the following is printed:



[ Info: Constructing Model...
[ Info: Training...
testloss() = 1.6677606f0
testloss() = 1.5074775f0
testloss() = 1.1638744f0
testloss() = 1.2398477f0

