Supervised learning

M. Benesty

2019-10-27

library(fastrtext)

data("train_sentences")
data("test_sentences")

# prepare data
# Temporary paths: one for the trained model, one for the training text file.
tmp_file_model <- tempfile()
train_tmp_file_txt <- tempfile()

# Training set: every line is "__label__<class>" followed by the lower-cased
# sentence, written to disk so the training command can read it.
train_labels <- paste0("__label__", train_sentences[, "class.text"])
train_texts <- tolower(train_sentences[, "text"])
train_to_write <- paste(train_labels, train_texts, sep = " ")
writeLines(train_to_write, con = train_tmp_file_txt)

# Test set: same line layout; the unprefixed labels are kept separately so
# they can be compared with the predicted label names later on.
test_labels_without_prefix <- test_sentences[, "class.text"]
test_labels <- paste0("__label__", test_labels_without_prefix)
test_texts <- tolower(test_sentences[, "text"])
test_to_write <- paste(test_labels, test_texts, sep = " ")

# learn model
# Train a supervised classifier; each option name is followed by its value.
execute(
  commands = c(
    "supervised",
    "-input", train_tmp_file_txt,
    "-output", tmp_file_model,
    "-dim", 20,
    "-lr", 1,
    "-epoch", 20,
    "-wordNgrams", 2,
    "-verbose", 1
  )
)
## Read 0M words
## Number of words:  5060
## Number of labels: 15
## Progress: 100.0% words/sec/thread: 1105085 lr:  0.000000 avg.loss:  0.351986 ETA:   0h 0m 0s
# load model
# Read the trained model back from the path given to "-output" above.
model <- load_model(path = tmp_file_model)
## add .bin extension to the path
# predictions are returned as a list with one element per sentence: a numeric
# vector holding the probability, named after the predicted label
predictions <- predict(object = model, sentences = test_to_write)
print(head(predictions, n = 5))
## [[1]]
##     OWNX 
## 0.999851 
## 
## [[2]]
##      MISC 
## 0.9858458 
## 
## [[3]]
##      MISC 
## 0.9926952 
## 
## [[4]]
##      OWNX 
## 0.9089149 
## 
## [[5]]
##     AIMX 
## 0.991272
# Compute accuracy: share of sentences whose predicted label name matches
# the true (unprefixed) label
mean(
  test_labels_without_prefix == names(unlist(predictions))
)
## [1] 0.83
# because there is only one category per observation, the value returned by
# get_hamming_loss() is the same as the accuracy
get_hamming_loss(
  labels = as.list(test_labels_without_prefix),
  predictions = predictions
)
## [1] 0.83
# test predictions: one named probability per sentence, first five shown
predictions <- predict(object = model, sentences = test_to_write)
print(head(predictions, n = 5))
## [[1]]
##     OWNX 
## 0.999851 
## 
## [[2]]
##      MISC 
## 0.9858458 
## 
## [[3]]
##      MISC 
## 0.9926952 
## 
## [[4]]
##      OWNX 
## 0.9089149 
## 
## [[5]]
##     AIMX 
## 0.991272
# when only one label is retrieved per observation, `simplify = TRUE` returns
# a flat named vector instead of a list
print(head(
  predict(model, sentences = test_to_write, simplify = TRUE)
))
##      OWNX      MISC      MISC      OWNX      AIMX MISC--the 
## 0.9998510 0.9858458 0.9926952 0.9089149 0.9912720 0.4533829
# free memory
# Remove the training text file.
unlink(train_tmp_file_txt)
# The model is not stored at the bare temp path: the loader reports adding
# ".bin" to it, and fastText may also write a ".vec" file next to it
# (presumably version-dependent). Remove all of them; unlink() silently
# skips paths that do not exist.
unlink(paste0(tmp_file_model, c("", ".bin", ".vec")))
# Drop the in-memory model and trigger a garbage collection.
rm(model)
gc()
##           used (Mb) gc trigger (Mb) max used (Mb)
## Ncells  601082 32.2    1222758 65.4  1222758 65.4
## Vcells 1257842  9.6    8388608 64.0  1802458 13.8