Unsupervised learning

M. Benesty

2019-10-27

 library(fastrtext)

    # Temporary paths: one for the training corpus, one used as the output
    # path handed to the training command.
    tmp_file_txt <- tempfile()
    tmp_file_model <- tempfile()

    # Load the example datasets by name.
    data("train_sentences")
    data("test_sentences")

    # Lower-case the training sentences and write them out, one element per line.
    texts <- tolower(train_sentences[, "text"])
    writeLines(texts, tmp_file_txt)

    # Run the "skipgram" command on the corpus file.
    execute(
      commands = c(
        "skipgram",
        "-input", tmp_file_txt,
        "-output", tmp_file_model,
        "-verbose", 1
      )
    )
## Read 0M words
## Number of words:  2061
## Number of labels: 0
## Progress: 100.0% words/sec/thread:   44650 lr:  0.000000 avg.loss:  2.704427 ETA:   0h 0m 0s
    # Load the trained model from the output path; the message printed below
    # shows load_model() adding the ".bin" extension to the path.
    model <- load_model(tmp_file_model)
## add .bin extension to the path
    # Extract the dictionary from the model and show its first five entries.
    dict <- get_dictionary(model)
    print(head(dict, n = 5L))
## [1] "the"  "</s>" "of"   "to"   "and"
  # Print the vectors of two related words; the output is a matrix with one
  # row per word and 100 columns.
  print(get_word_vectors(model, c("time", "timing")))
##                 [,1]       [,2]        [,3]       [,4]      [,5]
## time   -0.1083451435 -0.1903037 -0.02330873 -0.1466811 0.1008591
## timing  0.0005007559 -0.2070167 -0.06165484 -0.2116760 0.0987759
##               [,6]       [,7]       [,8]       [,9]     [,10]      [,11]
## time    0.02795185 -0.3365301 -0.2749992 -0.3800159 0.2364395 -0.3579780
## timing -0.02718949 -0.3657075 -0.2856203 -0.4265947 0.3240046 -0.2640628
##             [,12]       [,13]       [,14]        [,15]     [,16]
## time   -0.2728569 -0.06378686 -0.09227623 -0.029650018 0.3622415
## timing -0.3143646 -0.03094283 -0.05996651  0.005705716 0.3154159
##             [,17]      [,18]      [,19]      [,20]      [,21]      [,22]
## time   -0.3053523 -0.1081002 -0.1998572 -0.1763009 -0.1973865 -0.3375100
## timing -0.2591803 -0.1612753 -0.1694797 -0.1665424 -0.1355038 -0.3938975
##              [,23]     [,24]       [,25]       [,26]     [,27]      [,28]
## time    0.03022043 0.1175378 -0.11570091 -0.02312460 0.1980775 -0.1459006
## timing -0.06107417 0.1199433 -0.08328937  0.09613875 0.1597574 -0.1230217
##              [,29]       [,30]      [,31]      [,32]       [,33]
## time   -0.09981743  0.02249773 0.02343624 0.15272696  0.02101585
## timing -0.09172077 -0.04449952 0.07853226 0.09061474 -0.08497150
##              [,34]       [,35]      [,36]      [,37]      [,38]      [,39]
## time   -0.07655367 0.073332705 0.06483959 0.01729319 -0.1984900 0.03948210
## timing -0.05537284 0.004482132 0.01026972 0.07896709 -0.1690778 0.05373015
##            [,40]      [,41]      [,42]      [,43]     [,44]       [,45]
## time   0.1315392 -0.3725052 0.08204433 -0.2699454 0.3774788 -0.10433733
## timing 0.1526500 -0.3871703 0.11078099 -0.2121572 0.4011536 -0.09585017
##              [,46]      [,47]       [,48]     [,49]      [,50]
## time   -0.09187908 0.08221020 -0.00773465 0.2233987 0.02696453
## timing -0.10056116 0.05336803 -0.08192983 0.1796382 0.04999176
##               [,51]      [,52]      [,53]     [,54]       [,55]
## time   -0.001078238 0.01273843 -0.3302557 0.1121921 -0.03059405
## timing -0.008135454 0.05191760 -0.2652633 0.1773732 -0.06725810
##              [,56]     [,57]     [,58]      [,59]        [,60]      [,61]
## time    0.01164111 0.1641806 0.2698481 0.03715590  0.004131594 0.05241371
## timing -0.05284570 0.1693291 0.2131423 0.05148774 -0.002195647 0.07033599
##             [,62]       [,63]     [,64]     [,65]      [,66]      [,67]
## time   0.06606552 -0.02488677 0.3051692 0.1311519 -0.1928752 -0.1783033
## timing 0.09959690 -0.01495479 0.2865503 0.1950594 -0.2278213 -0.1816446
##            [,68]      [,69]        [,70]       [,71]     [,72]      [,73]
## time   0.1526355 0.08558945 -0.001226741 -0.04638821 0.1256661 -0.1121099
## timing 0.1789374 0.07952510  0.006552925 -0.04017229 0.1582013 -0.1218574
##             [,74]       [,75]      [,76]       [,77]      [,78]      [,79]
## time   -0.1522974 -0.04702128 -0.3628783  0.02273799 0.02705385 0.06117994
## timing -0.2310218 -0.14709373 -0.4647493 -0.01359840 0.03340237 0.06515899
##              [,80]     [,81]     [,82]     [,83]        [,84]      [,85]
## time   -0.02846841 0.3246377 0.1278958 0.1342877 -0.001412821 -0.3427064
## timing -0.04665632 0.2706675 0.1632123 0.1141842 -0.037994694 -0.3239506
##              [,86]      [,87]     [,88]       [,89]      [,90]       [,91]
## time   -0.08055878 -0.1629244 0.1985570 -0.07048327 0.05328575 -0.03453431
## timing -0.06347448 -0.1709164 0.1520826 -0.16586256 0.13097736 -0.02235916
##             [,92]     [,93]       [,94]       [,95]      [,96]
## time   -0.2081913 0.1446141 -0.03336290 -0.03442269 0.08051111
## timing -0.1646082 0.1740896 -0.01219319 -0.08424617 0.14640304
##               [,97]      [,98]       [,99]      [,100]
## time   -0.004512462 0.06665278 -0.08772212 -0.09487572
## timing -0.064253002 0.08243029 -0.11470184 -0.12634690
  # Distance between the vectors of the same two words; the call is not
  # wrapped in print(), so the result is auto-printed as a 1x1 matrix.
  get_word_distance(model, "time", "timing")
##            [,1]
## [1,] 0.03901661
  # Clean up: delete the temporary files, then release the model.
  # tempfile() only reserves a name, and training writes its artifacts next to
  # `tmp_file_model` under suffixed names (load_model() reported adding ".bin"
  # to the path). Unlinking the bare path alone would leave those files in the
  # temp directory, so the suffixed paths are removed as well. unlink() ignores
  # paths that do not exist.
  unlink(tmp_file_txt)
  unlink(paste0(tmp_file_model, c("", ".bin", ".vec")))
  # Drop the model reference and trigger a garbage collection to free memory.
  rm(model)
  gc()
##           used (Mb) gc trigger (Mb) max used (Mb)
## Ncells  603450 32.3    1222758 65.4  1222758 65.4
## Vcells 1264799  9.7    8388608 64.0  1802458 13.8