Unsupervised learning

M. Benesty

2019-04-14

 library(fastrtext)

    # Load the example datasets used in this walkthrough.
    data("train_sentences")
    data("test_sentences")

    # Temporary paths: one for the training text, one used as the model output path.
    tmp_file_txt <- tempfile()
    tmp_file_model <- tempfile()

    # Lower-case the "text" column and write it to disk, one element per line.
    texts <- tolower(train_sentences[, "text"])
    writeLines(texts, con = tmp_file_txt)

    # Train with the "skipgram" command; no labels are involved (unsupervised).
    skipgram_args <- c(
      "skipgram",
      "-input", tmp_file_txt,
      "-output", tmp_file_model,
      "-verbose", 1
    )
    execute(commands = skipgram_args)
## Read 0M words
## Number of words:  2061
## Number of labels: 0
## Progress: 100.0% words/sec/thread:   26804 lr:  0.000000 loss:  2.612961 ETA:   0h 0m
    # Load the trained model. The path is passed without an extension;
    # load_model() reports that it appends ".bin" itself.
    model <- load_model(path = tmp_file_model)
## add .bin extension to the path
    # Extract the model's dictionary and show its first five entries.
    dict <- get_dictionary(model)
    print(head(dict, n = 5L))
## [1] "the"  "</s>" "of"   "to"   "and"
  # Look up the embedding vectors of two related words and print them
  # (one row per word).
  query_words <- c("time", "timing")
  query_vectors <- get_word_vectors(model, query_words)
  print(query_vectors)
##              [,1]        [,2]        [,3]       [,4]       [,5]       [,6]
## time   0.01004267 -0.00978735 -0.02632043 0.01043909 0.14357023 0.05902158
## timing 0.04440358 -0.09404790 -0.10122213 0.05119922 0.09301324 0.05918713
##              [,7]        [,8]         [,9]      [,10]       [,11]
## time   0.07371495 -0.11321857 -0.057459913 -0.1446028 -0.08348616
## timing 0.09317038 -0.08922739 -0.003908521 -0.1611945 -0.12542203
##              [,12]       [,13]       [,14]     [,15]     [,16]      [,17]
## time   -0.10549711 -0.08317305 -0.07013574 0.1141702 0.2290001 0.02704064
## timing -0.07846112  0.00969881 -0.04332303 0.1590887 0.1596066 0.11701519
##             [,18]      [,19]     [,20]      [,21]      [,22]      [,23]
## time   -0.2145353 -0.2862271 0.1336932 -0.2072568 -0.1440565 0.07360851
## timing -0.1693782 -0.2980982 0.1093466 -0.2079391 -0.1500264 0.04556768
##             [,24]      [,25]       [,26]     [,27]      [,28]     [,29]
## time   0.08057357 -0.2049112 -0.09975825 0.3907839 -0.2674007 0.1394143
## timing 0.10099000 -0.1373469 -0.10787729 0.3692110 -0.2687091 0.1098244
##               [,30]      [,31]      [,32]      [,33]     [,34]      [,35]
## time   -0.038374759 -0.2749258 0.07758265 -0.3626939 0.2664215 0.15163673
## timing  0.002495926 -0.2831641 0.03869082 -0.4248018 0.3485428 0.06422526
##             [,36]      [,37]      [,38]        [,39]      [,40]      [,41]
## time   -0.1627192 -0.2540028 -0.1495155 -0.093088806 -0.2564609 -0.1857021
## timing -0.1213544 -0.2431337 -0.1320437 -0.005504473 -0.2423026 -0.2162778
##            [,42]     [,43]     [,44]      [,45]      [,46]       [,47]
## time   0.2541378 0.3338462 0.1519625 -0.1629103 -0.3351033 -0.05172665
## timing 0.3349099 0.2840048 0.1522866 -0.1792853 -0.2645139 -0.07678813
##             [,48]      [,49]      [,50]       [,51]       [,52]     [,53]
## time   0.13128591 -0.1623360 0.08138186 -0.02016281 -0.04892532 0.3365192
## timing 0.07899003 -0.2105293 0.07066419 -0.01687796 -0.01109806 0.4132735
##             [,54]      [,55]      [,56]        [,57]      [,58]      [,59]
## time   0.08412295 -0.2208018 -0.2274560 -0.025774356 -0.3632224 0.04836709
## timing 0.04173822 -0.2155417 -0.1869687 -0.007443829 -0.3043914 0.01552445
##            [,60]      [,61]      [,62]      [,63]     [,64]       [,65]
## time   0.1988651 -0.1411520 -0.1704554 0.03502641 0.2495242 -0.02895846
## timing 0.2231452 -0.1688523 -0.1878276 0.03517740 0.1714370 -0.06323074
##             [,66]      [,67]      [,68]       [,69]       [,70]
## time   -0.2858789 0.03527451 -0.2233494 -0.08666036  0.02355599
## timing -0.2827499 0.04000413 -0.1508684 -0.09320504 -0.02431842
##              [,71]     [,72]       [,73]      [,74]        [,75]
## time   -0.06684174 0.1703863 -0.03173536 -0.1775711 -0.009198606
## timing -0.06111128 0.1666779 -0.04293436 -0.1898222 -0.018346613
##               [,76]      [,77]     [,78]      [,79]     [,80]       [,81]
## time   -0.009345339 0.03558833 0.2540376 -0.0687244 0.2833966 -0.10932555
## timing -0.036286339 0.04411973 0.2291139 -0.1488639 0.2587606 -0.03632433
##             [,82]      [,83]       [,84]       [,85]      [,86]     [,87]
## time   0.08877143 -0.1706144 -0.01047280 -0.08134419 -0.1255308 0.1784413
## timing 0.07476990 -0.2091434 -0.03658024 -0.08305854 -0.1244449 0.1929368
##               [,88]      [,89]      [,90]     [,91]      [,92]     [,93]
## time   -0.008239874 -0.2267378 0.04326093 0.2765167 0.08179552 0.1399342
## timing -0.092158504 -0.2460469 0.04100705 0.3120714 0.09002918 0.1123436
##              [,94]      [,95]      [,96]      [,97]      [,98]       [,99]
## time   -0.04776959 -0.1867126 0.03033837 -0.2451774 -0.1586638 -0.06205603
## timing  0.01738757 -0.2083907 0.10873853 -0.2278002 -0.1555471 -0.04585168
##            [,100]
## time   -0.1142258
## timing -0.1245154
  # Distance between the embeddings of the two words (auto-printed).
  get_word_distance(model, w1 = "time", w2 = "timing")
##            [,1]
## [1,] 0.03164777
  # Clean up: delete the temporary files, drop the model, and run the garbage
  # collector.
  unlink(tmp_file_txt)
  # The model is not stored at the bare `tmp_file_model` path: load_model()
  # had to append ".bin" to find it, and fastText also writes a ".vec" file
  # next to it. Remove those as well, otherwise they stay in the temp
  # directory. unlink() silently ignores paths that do not exist.
  unlink(paste0(tmp_file_model, c("", ".bin", ".vec")))
  rm(model)
  gc()
##           used (Mb) gc trigger (Mb) max used (Mb)
## Ncells  580392 31.0    1173920 62.7  1173920 62.7
## Vcells 1251614  9.6    8388608 64.0  1758476 13.5