This example demonstrates the use of fasttext for text classification

Based on Joulin et al’s paper:

Bags of Tricks for Efficient Text Classification https://arxiv.org/abs/1607.01759

Results on IMDB datasets with uni and bi-gram embeddings: Uni-gram: 0.8813 test accuracy after 5 epochs. 8s/epoch on i7 cpu. Bi-gram : 0.9056 test accuracy after 5 epochs. 2s/epoch on GTx 980M gpu.

library(keras)
library(purrr)

# Function definition -----------------------------------------------------

create_ngram_set <- function(input_list, ngram_value = 2){
  indices <- map(0:(length(input_list) - ngram_value), ~1:ngram_value + .x)
  indices %>%
    map_chr(~input_list[.x] %>% paste(collapse = "|")) %>%
    unique()
}

add_ngram <- function(sequences, token_indice, ngram_range = 2){
  ngrams <- map(
    sequences, 
    create_ngram_set, ngram_value = ngram_range
  )
  
  seqs <- map2(sequences, ngrams, function(x, y){
    tokens <- token_indice$token[token_indice$ngrams %in% y]  
    c(x, tokens)
  })
  
  seqs
}


# Parameters --------------------------------------------------------------

# ngram_range = 2 will add bi-grams features
ngram_range <- 2
max_features <- 20000
maxlen <- 400
batch_size <- 32
embedding_dims <- 50
epochs <- 5


# Data preparation --------------------------------------------------------

imdb_data <- dataset_imdb(num_words = max_features)

print(length(imdb_data$train$x)) # train sequences
print(length(imdb_data$test$x)) # test sequences
print(sprintf("Average train sequence length: %f", mean(map_int(imdb_data$train$x, length))))
print(sprintf("Average test sequence length: %f", mean(map_int(imdb_data$test$x, length))))

if(ngram_range > 1) {
  
  # Create set of unique n-gram from the training set.
  ngrams <- imdb_data$train$x %>% 
    map(create_ngram_set) %>%
    unlist() %>%
    unique()

  # Dictionary mapping n-gram token to a unique integer.
  # Integer values are greater than max_features in order
  # to avoid collision with existing features.
  token_indice <- data.frame(
    ngrams = ngrams,
    token  = 1:length(ngrams) + (max_features), 
    stringsAsFactors = FALSE
  )
  
  # max_features is the highest integer that could be found in the dataset.
  max_features <- max(token_indice$token) + 1
  
  # Augmenting x_train and x_test with n-grams features
  imdb_data$train$x <- add_ngram(imdb_data$train$x, token_indice, ngram_range)
  imdb_data$test$x <- add_ngram(imdb_data$test$x, token_indice, ngram_range)
}

# pad sequences
imdb_data$train$x <- pad_sequences(imdb_data$train$x, maxlen = maxlen)
imdb_data$test$x <- pad_sequences(imdb_data$test$x, maxlen = maxlen)


# Model definition --------------------------------------------------------

model <- keras_model_sequential()

model %>%
  layer_embedding(
    input_dim = max_features, output_dim = embedding_dims, 
    input_length = maxlen
    ) %>%
  layer_global_average_pooling_1d() %>%
  layer_dense(1, activation = "sigmoid")

model %>% compile(
  loss = "binary_crossentropy",
  optimizer = "adam",
  metrics = "accuracy"
)


# Fitting -----------------------------------------------------------------

model %>% fit(
  imdb_data$train$x, imdb_data$train$y, 
  batch_size = batch_size,
  epochs = epochs,
  validation_data = list(imdb_data$test$x, imdb_data$test$y)
)