Unsupervised learning

M. Benesty

2017-09-15

 library(fastrtext)

    # Load the example datasets used in this vignette.
    data("train_sentences")
    data("test_sentences")

    # Lower-case the training text before writing it out as the corpus.
    texts <- tolower(train_sentences[, "text"])

    # tmp_file_txt holds the corpus; tmp_file_model is the output path given
    # to the training command.
    tmp_file_txt <- tempfile()
    tmp_file_model <- tempfile()
    writeLines(text = texts, con = tmp_file_txt)

    # Train unsupervised word embeddings with the "skipgram" command.
    # c() coerces the numeric verbosity level to the string "1".
    execute(commands = c(
      "skipgram",
      "-input", tmp_file_txt,
      "-output", tmp_file_model,
      "-verbose", 1
    ))
## Read 0M words
## Number of words:  2061
## Number of labels: 0
## Progress: 100.0%  words/sec/thread: 18724  lr: 0.000000  loss: 2.461299  eta: 0h0m
    # Load the trained model from the output path used for training; the
    # message printed by this call reports that ".bin" is appended to the path.
    model <- load_model(tmp_file_model)
## add .bin extension to the path
    # Extract the vocabulary known to the model and show its first five
    # entries.
    dict <- get_dictionary(model)
    print(head(dict, n = 5L))
## [1] "the"  "</s>" "of"   "to"   "and"
  # Look up the embedding vectors of two related words and print them.
  query_words <- c("time", "timing")
  query_vectors <- get_word_vectors(model, query_words)
  print(query_vectors)
##             [,1]       [,2]         [,3]       [,4]      [,5]        [,6]
## time   0.1051282 0.12150212  0.005264201 0.01955215 0.1759570 -0.07116566
## timing 0.1462128 0.08484089 -0.003055850 0.05790246 0.1756263 -0.08370596
##              [,7]         [,8]      [,9]       [,10]       [,11]
## time   0.02374416 -0.016329534 0.1098138 -0.06714629 -0.05566642
## timing 0.03188968 -0.005233053 0.1142955 -0.07134001 -0.09649358
##             [,12]      [,13]       [,14]         [,15]     [,16]
## time   -0.1005409 -0.1614572 -0.08737041 -0.0551756620 0.3023179
## timing -0.1045317 -0.1315412 -0.06607364  0.0005156712 0.2729409
##              [,17]       [,18]       [,19]     [,20]      [,21]      [,22]
## time   -0.03827165 -0.09630886 -0.06744245 0.2874633 -0.1316409 0.08701636
## timing  0.01492595 -0.07921946 -0.07997738 0.2815897 -0.1422282 0.12186342
##              [,23]     [,24]      [,25]       [,26]     [,27]       [,28]
## time   -0.07737042 0.1144370 0.01231114 0.022084633 0.2550842 -0.07494630
## timing -0.08972809 0.1423234 0.03694145 0.001559853 0.2445612 -0.07919279
##           [,29]     [,30]      [,31]     [,32]      [,33]      [,34]
## time   0.183799 0.1565165 -0.1971460 0.1906143 -0.2779255 0.09122123
## timing 0.178959 0.1732142 -0.2147351 0.1713849 -0.3568698 0.15957016
##             [,35]       [,36]      [,37]      [,38]      [,39]      [,40]
## time   0.10281681 -0.10707969 -0.2297566 -0.1279059 -0.1924931 -0.2512977
## timing 0.03638727 -0.08419075 -0.2362274 -0.1197964 -0.1748915 -0.2812563
##              [,41]       [,42]     [,43]      [,44]      [,45]       [,46]
## time   -0.10382315 -0.01779083 0.4809303 0.08179988 -0.1925488 -0.13714731
## timing -0.08868551  0.02378480 0.4893288 0.09510561 -0.2041904 -0.09322326
##              [,47]     [,48]       [,49]      [,50]      [,51]       [,52]
## time   0.010338202 0.2487794 -0.06054345 0.03952054 0.09150095 -0.09987199
## timing 0.009296207 0.2033333 -0.10158393 0.06320308 0.09833961 -0.10175142
##            [,53]      [,54]      [,55]        [,56]       [,57]      [,58]
## time   0.1699462 0.08838340 -0.2158270 -0.008011733 -0.08851113 -0.3705585
## timing 0.2526238 0.06378453 -0.2260311 -0.018755538 -0.10650443 -0.3642533
##             [,59]       [,60]      [,61]      [,62]      [,63]     [,64]
## time   0.04588198 -0.01582638 -0.1702349 0.04900502 -0.1353486 0.4526037
## timing 0.02069537 -0.01828872 -0.1766143 0.02959018 -0.1763782 0.4583665
##            [,65]      [,66]        [,67]      [,68]       [,69]      [,70]
## time   0.1785897 -0.1104875  0.001167171 -0.2017725 -0.09577073 0.04960832
## timing 0.1521341 -0.1233444 -0.014244801 -0.1996749 -0.13736786 0.03397766
##               [,71]     [,72]       [,73]      [,74]       [,75]
## time    0.002605274 0.1097845 -0.06954949 -0.2513508 -0.06048902
## timing -0.001418267 0.1040714 -0.05710314 -0.2230393 -0.08617350
##             [,76]       [,77]       [,78]      [,79]      [,80]
## time   0.03735455 0.005735692  0.01011794 0.08748085 0.02329874
## timing 0.07941721 0.012174603 -0.04015623 0.06538074 0.01149520
##              [,81]     [,82]       [,83]     [,84]       [,85]      [,86]
## time   -0.05500054 0.1812908 -0.07001343 0.1544676 -0.09727329 -0.1552624
## timing -0.02101557 0.1562615 -0.06231448 0.1751388 -0.11129720 -0.1879628
##               [,87]     [,88]       [,89]       [,90]     [,91]     [,92]
## time    0.013606191 0.1476225 -0.10499682 0.005484045 0.2271290 0.1721083
## timing -0.009402145 0.1339587 -0.09369148 0.015345438 0.2563034 0.1663923
##            [,93]      [,94]      [,95]     [,96]      [,97]      [,98]
## time   0.1915366 -0.1987056 -0.1243258 0.1142971 -0.1431325 -0.2081325
## timing 0.1433011 -0.1725464 -0.1056151 0.1517264 -0.1042065 -0.2216748
##             [,99]       [,100]
## time   -0.1489273 -0.024113297
## timing -0.1327253  0.003199333
  # test word distance
  # Distance between the vectors of the two words, auto-printed at top level.
  # NOTE(review): presumably a small value means the words are close in the
  # embedding space; confirm the metric in the get_word_distance() docs.
  get_word_distance(model, "time", "timing")
##            [,1]
## [1,] 0.01655479
  # free memory and remove temporary files
  unlink(tmp_file_txt)
  # Training writes the model to "<output>.bin" and "<output>.vec", never to
  # the bare output path, so unlinking tmp_file_model alone would leave both
  # files behind in the temp directory.
  unlink(paste0(tmp_file_model, c(".bin", ".vec")))
  # Drop the model object and trigger a garbage collection so its memory can
  # be reclaimed.
  rm(model)
  gc()
##           used (Mb) gc trigger (Mb) max used (Mb)
## Ncells  555738 29.7     940480 50.3   940480 50.3
## Vcells 1157172  8.9    1946338 14.9  1548136 11.9