Unsupervised learning

M. Benesty

2017-11-06

 library(fastrtext)
    
    # Load the example datasets shipped with the package.
    data("train_sentences")
    data("test_sentences")

    # Reserve two temporary paths: one for the training corpus and one used
    # as the output path handed to the training command.
    tmp_file_txt <- tempfile()
    tmp_file_model <- tempfile()

    # Lower-case the "text" column and write it to disk, one element per line.
    texts <- tolower(train_sentences[, "text"])
    writeLines(text = texts, con = tmp_file_txt)

    # Train with the "skipgram" command; only raw text is supplied
    # (the training log reports 0 labels).
    execute(
      commands = c(
        "skipgram",
        "-input", tmp_file_txt,
        "-output", tmp_file_model,
        "-verbose", 1
      )
    )
## Read 0M words
## Number of words:  2061
## Number of labels: 0
## Progress: 100.0%  words/sec/thread: 17850  lr: 0.000000  loss: 2.565117  eta: 0h0m
    # Load the trained model. The path is passed without an extension;
    # load_model() reports that it adds ".bin" to the path itself.
    model <- load_model(tmp_file_model)
## add .bin extension to the path
    # Extract the vocabulary known to the model and show its first
    # five entries.
    dict <- get_dictionary(model)
    print(head(dict, n = 5))
## [1] "the"  "</s>" "of"   "to"   "and"
  # Fetch the embeddings of two related words and print them; the result
  # is a matrix with one row per requested word.
  word_vectors <- get_word_vectors(model, c("time", "timing"))
  print(word_vectors)
##              [,1]      [,2]       [,3]       [,4]      [,5]       [,6]
## time   0.03395659 0.1470706 -0.1058364 0.06878183 0.2309285 0.08670054
## timing 0.07405758 0.1085439 -0.1618748 0.13510625 0.2442455 0.08400857
##              [,7]        [,8]        [,9]       [,10]      [,11]
## time   0.02896640 0.005708979 -0.05033836 -0.05676518 0.08089076
## timing 0.05488665 0.011414430 -0.02430609 -0.07843672 0.08025268
##             [,12]       [,13]       [,14]      [,15]     [,16]       [,17]
## time   -0.1100482 -0.08806141 -0.03016183 0.01264069 0.1562749 -0.07370672
## timing -0.1148377 -0.03641872 -0.01254931 0.05481669 0.1380973 -0.01840515
##              [,18]       [,19]        [,20]      [,21]       [,22]
## time   -0.00132384 -0.07566715  0.006581096 -0.2997985 -0.02544638
## timing  0.02809528 -0.09056149 -0.008341677 -0.3440099 -0.04793579
##             [,23]      [,24]       [,25]      [,26]     [,27]      [,28]
## time   0.03822010 0.05653326 -0.09892341 -0.1603222 0.2401310 -0.3608408
## timing 0.03535323 0.06697033 -0.05893456 -0.2078563 0.2188988 -0.4095490
##             [,29]       [,30]       [,31]      [,32]      [,33]     [,34]
## time   0.06792628 -0.03036106 0.001180083 0.08089674 -0.2813881 0.3012297
## timing 0.09410859 -0.01473445 0.010983766 0.07254659 -0.3645603 0.3640393
##               [,35]       [,36]      [,37]       [,38]      [,39]
## time   -0.005681373 0.007189576 -0.4793869 -0.00365201 -0.1695314
## timing -0.072563231 0.039331511 -0.4984366  0.03357036 -0.1592357
##             [,40]      [,41]     [,42]     [,43]      [,44]      [,45]
## time   -0.1274402 -0.2056829 0.2732867 0.3806124 0.04418882 -0.1379555
## timing -0.1524590 -0.2046957 0.3321827 0.3837048 0.04897582 -0.1643560
##             [,46]      [,47]       [,48]      [,49]     [,50]      [,51]
## time   -0.2657112 0.09444665  0.04182133 -0.2980613 0.1187810 0.05404853
## timing -0.2357648 0.09773789 -0.02136410 -0.3200164 0.1316597 0.01960943
##             [,52]     [,53]       [,54]      [,55]      [,56]      [,57]
## time   -0.2663261 0.1983692  0.04460498 -0.3296352 -0.1995742 -0.1101850
## timing -0.2323197 0.2697556 -0.01013977 -0.3450947 -0.1700735 -0.1021873
##             [,58]     [,59]       [,60]      [,61]       [,62]      [,63]
## time   -0.2103852 0.1975938 -0.06060537 -0.2789482  0.01219837 -0.1785361
## timing -0.1650001 0.1324280 -0.07333889 -0.2980339 -0.01236813 -0.1698531
##             [,64]       [,65]      [,66]      [,67]      [,68]      [,69]
## time   0.16413416  0.02298515 -0.1908897 -0.1314048 -0.3613428 0.07670279
## timing 0.09903549 -0.03361671 -0.2101583 -0.1608602 -0.3201561 0.05508030
##             [,70]      [,71]        [,72]      [,73]       [,74]
## time   0.08774014 0.06846005  0.014074524 -0.2255118 -0.01239411
## timing 0.07171863 0.08590530 -0.001346468 -0.1820332  0.01407885
##              [,75]      [,76]       [,77]     [,78]       [,79]      [,80]
## time   -0.01097302 -0.1626983 0.006974617 0.2626570  0.03390564 0.07947004
## timing -0.02266649 -0.1332968 0.054836825 0.2141165 -0.01603279 0.04765026
##              [,81]     [,82]       [,83]      [,84]      [,85]      [,86]
## time   -0.16737209 0.1290229 -0.02671744 0.04563151 0.03909177 -0.1143934
## timing -0.09552445 0.0981615 -0.05118698 0.06089164 0.02142411 -0.1404413
##           [,87]     [,88]       [,89]       [,90]     [,91]      [,92]
## time   0.242874 0.1472319 -0.10438239 -0.05576196 0.2115069 0.03797328
## timing 0.227627 0.1721801 -0.07830971 -0.04612612 0.2292118 0.02529217
##             [,93]       [,94]      [,95]     [,96]      [,97]       [,98]
## time   0.08971343 -0.05291241 -0.2635795 0.1595984 -0.2170034 -0.07207751
## timing 0.02344621 -0.01787464 -0.2559279 0.2059559 -0.1996885 -0.04828427
##             [,99]      [,100]
## time   0.02693086 -0.07465150
## timing 0.07960488 -0.06226416
  # test word distance
  # Distance between the vectors of "time" and "timing"; the value is
  # auto-printed as a 1 x 1 matrix.
  # NOTE(review): presumably a cosine distance -- confirm against the
  # fastrtext documentation.
  get_word_distance(model, "time", "timing")
##            [,1]
## [1,] 0.02274952
  # free memory and remove the temporary files
  unlink(tmp_file_txt)
  # tmp_file_model is only a path prefix: tempfile() creates no file, and the
  # model is loaded from "<prefix>.bin". fastText also writes the word vectors
  # to "<prefix>.vec", so those are the files that must be deleted. The bare
  # prefix is kept in the list for backward compatibility (a no-op if absent).
  unlink(c(tmp_file_model, paste0(tmp_file_model, c(".bin", ".vec"))))
  # Drop the model object, then run the garbage collector; gc() stays last so
  # its summary is still the value printed by this snippet.
  rm(model)
  gc()
##           used (Mb) gc trigger (Mb) max used (Mb)
## Ncells  556591 29.8     940480 50.3   940480 50.3
## Vcells 1156124  8.9    1943012 14.9  1551572 11.9