Unsupervised learning

M. Benesty

2017-09-18

 library(fastrtext)
    
    # Load the train_sentences and test_sentences data sets.
    data("train_sentences")
    data("test_sentences")

    # Lower-case the training text and write it, one element per line, to a
    # temporary file that is passed to the trainer as its input.
    texts <- tolower(train_sentences[, "text"])
    tmp_file_txt <- tempfile()
    tmp_file_model <- tempfile()
    writeLines(texts, tmp_file_txt)

    # Train a skipgram model. The verbosity level is written as a string:
    # c() would coerce a bare 1 to "1" anyway, since every other element
    # of the vector is character.
    skipgram_args <- c(
      "skipgram",
      "-input", tmp_file_txt,
      "-output", tmp_file_model,
      "-verbose", "1"
    )
    execute(commands = skipgram_args)
## Read 0M words
## Number of words:  2061
## Number of labels: 0
## Progress: 100.0%  words/sec/thread: 18593  lr: 0.000000  loss: 2.373291  eta: 0h0m
    # Load the trained model from the output path given to the trainer; the
    # loader reports that it appends the ".bin" extension itself.
    model <- load_model(path = tmp_file_model)
## add .bin extension to the path
    # Retrieve the model's dictionary and show its first five entries.
    dict <- get_dictionary(model)
    first_entries <- head(dict, n = 5L)
    print(first_entries)
## [1] "the"  "</s>" "of"   "to"   "and"
  # Look up the embeddings of two related words and print them
  # (one row per word, one column per embedding dimension).
  query_words <- c("time", "timing")
  word_vectors <- get_word_vectors(model, query_words)
  print(word_vectors)
##             [,1]        [,2]        [,3]      [,4]        [,5]       [,6]
## time   0.1810962 0.091241419 -0.03272095 0.1411836  0.04008620 0.04082145
## timing 0.2088803 0.006214355 -0.08493112 0.2028206 -0.01282202 0.02843329
##             [,7]         [,8]        [,9]      [,10]       [,11]
## time   0.1173773 -0.006488502 -0.08286844 -0.4089664 -0.09718096
## timing 0.1315300  0.009406463 -0.05926144 -0.4115326 -0.10840150
##              [,12]       [,13]      [,14]     [,15]     [,16]        [,17]
## time   -0.08043761 -0.12648851 -0.1792859 0.1501502 0.2399595 -0.017405044
## timing -0.07798404 -0.07228893 -0.1559062 0.1958283 0.2047626 -0.003985534
##              [,18]       [,19]      [,20]       [,21]      [,22]
## time   -0.11286198 -0.05555766 0.06132117 -0.09349498 0.06450543
## timing -0.06139282 -0.07624672 0.06129446 -0.12990449 0.04641122
##              [,23]     [,24]     [,25]      [,26]      [,27]      [,28]
## time   -0.01703859 0.2523808 0.1551545 -0.1912673 0.06897227 -0.1221295
## timing -0.02958799 0.2944891 0.1853036 -0.2466569 0.04911768 -0.1635324
##              [,29]       [,30]      [,31]      [,32]      [,33]     [,34]
## time   -0.03134793 -0.06415915 0.03088924 0.05158316 -0.4464382 0.2515295
## timing -0.01831852 -0.05166331 0.04040765 0.04673322 -0.5376940 0.3356575
##             [,35]      [,36]      [,37]        [,38]      [,39]
## time   -0.1111264 0.03204155 -0.3211517 0.0001543164 -0.2078307
## timing -0.1817773 0.06042979 -0.3446439 0.0603916459 -0.1599268
##              [,40]        [,41]     [,42]     [,43]       [,44]      [,45]
## time   -0.03765364 -0.002702016 0.3641257 0.2519707 -0.08028603 -0.2670054
## timing -0.04089422  0.017883314 0.4110973 0.2057763 -0.07353166 -0.2764683
##             [,46]      [,47]      [,48]      [,49]     [,50]       [,51]
## time   -0.1647325 0.02423937 -0.1501303 -0.1554566 0.1231574 -0.07282387
## timing -0.1568905 0.04621249 -0.2406959 -0.2102277 0.1406883 -0.09688005
##             [,52]     [,53]      [,54]       [,55]      [,56]      [,57]
## time   -0.1775701 0.1618737 0.06732297 -0.04623672 -0.1711912 -0.1153811
## timing -0.1829287 0.2544785 0.02287204 -0.05464637 -0.1499546 -0.1125418
##             [,58]      [,59]       [,60]       [,61]       [,62]
## time   -0.2052858 0.09949223 -0.07293445 -0.08733374 -0.06538917
## timing -0.1554236 0.04544684 -0.08035001 -0.07825585 -0.08396165
##             [,63]     [,64]        [,65]      [,66]      [,67]      [,68]
## time   -0.2541366 0.2872204  0.004539874 -0.2434946 -0.1632082 -0.3228838
## timing -0.2538847 0.2316661 -0.064338997 -0.2582107 -0.2079943 -0.3043610
##              [,69]     [,70]       [,71]      [,72]      [,73]      [,74]
## time   -0.01972223 -0.160838 0.006598057 0.05379357 -0.1623145 0.02683534
## timing -0.05862435 -0.178684 0.030743094 0.04776795 -0.1461382 0.05982563
##             [,75]     [,76]       [,77]     [,78]      [,79]     [,80]
## time   0.05596030 0.1946809 -0.06664713 0.1740213 -0.1227354 0.2281437
## timing 0.04829667 0.1922634 -0.01243966 0.1359084 -0.2028181 0.1990591
##             [,81]       [,82]       [,83]      [,84]     [,85]
## time   0.04972129 0.020158198  0.02750161 0.03340785 0.1036480
## timing 0.12683350 0.005008661 -0.02642920 0.02673675 0.1098221
##               [,86]      [,87]     [,88]      [,89]      [,90]     [,91]
## time   -0.003992621 0.03330673 0.3244937 -0.1897279 -0.1270247 0.3045833
## timing -0.034443688 0.03253958 0.3478365 -0.1766882 -0.1198853 0.3332068
##             [,92]      [,93]       [,94]      [,95]     [,96]      [,97]
## time   0.06891263 0.05959697 -0.05578373 -0.1272105 0.1327141 -0.1079597
## timing 0.05631062 0.01493023 -0.01612334 -0.1422063 0.1822213 -0.1058144
##              [,98]       [,99]     [,100]
## time   -0.08175591 -0.08468916 0.09661242
## timing -0.08469449 -0.03905548 0.14269169
  # Distance between the embeddings of the two words; the call is left
  # unassigned so the result is auto-printed.
  get_word_distance(
    model,
    "time",
    "timing"
  )
##           [,1]
## [1,] 0.0258146
  # Clean up: delete the temporary files, drop the model, and run the
  # garbage collector.
  unlink(tmp_file_txt)
  # The trainer does not write to the bare output path: it appends ".bin"
  # (the path load_model() reads) and ".vec". Remove those as well as the
  # bare path, otherwise the model files stay behind in the temp directory.
  # unlink() silently ignores paths that do not exist.
  unlink(paste0(tmp_file_model, c("", ".bin", ".vec")))
  rm(model)
  gc()
##           used (Mb) gc trigger (Mb) max used (Mb)
## Ncells  556989 29.8     940480 50.3   940480 50.3
## Vcells 1159542  8.9    1946338 14.9  1554547 11.9