Unsupervised learning
M. Benesty
2017-09-15
library(fastrtext)
# Load the example sentence datasets (train and test) in one call.
data("train_sentences", "test_sentences")

# Lower-cased training text: one sentence per element of the "text" column.
texts <- tolower(train_sentences[, "text"])

# Scratch paths: one for the plain-text corpus, one used as the model output
# path handed to fastText.
tmp_file_txt <- tempfile()
tmp_file_model <- tempfile()

# Dump the corpus to disk, one sentence per line.
writeLines(texts, tmp_file_txt)

# Run the fastText "skipgram" command on the corpus (no labels involved).
execute(
  commands = c(
    "skipgram",
    "-input", tmp_file_txt,
    "-output", tmp_file_model,
    "-verbose", "1"
  )
)
## Read 0M words
## Number of words: 2061
## Number of labels: 0
## Progress: 100.0% words/sec/thread: 18724 lr: 0.000000 loss: 2.461299 eta: 0h0m
# Load the trained model back into R from the "-output" path used for training.
# The call prints a message saying it adds a ".bin" extension to the given path.
model <- load_model(tmp_file_model)
## add .bin extension to the path
# Fetch the model's dictionary and display its first five entries.
dict <- get_dictionary(model)
print(head(x = dict, n = 5L))
## [1] "the" "</s>" "of" "to" "and"
# Print the vectors the model holds for the words "time" and "timing".
# The printed result is a matrix with one row per word and 100 columns.
print(get_word_vectors(model, c("time", "timing")))
## [,1] [,2] [,3] [,4] [,5] [,6]
## time 0.1051282 0.12150212 0.005264201 0.01955215 0.1759570 -0.07116566
## timing 0.1462128 0.08484089 -0.003055850 0.05790246 0.1756263 -0.08370596
## [,7] [,8] [,9] [,10] [,11]
## time 0.02374416 -0.016329534 0.1098138 -0.06714629 -0.05566642
## timing 0.03188968 -0.005233053 0.1142955 -0.07134001 -0.09649358
## [,12] [,13] [,14] [,15] [,16]
## time -0.1005409 -0.1614572 -0.08737041 -0.0551756620 0.3023179
## timing -0.1045317 -0.1315412 -0.06607364 0.0005156712 0.2729409
## [,17] [,18] [,19] [,20] [,21] [,22]
## time -0.03827165 -0.09630886 -0.06744245 0.2874633 -0.1316409 0.08701636
## timing 0.01492595 -0.07921946 -0.07997738 0.2815897 -0.1422282 0.12186342
## [,23] [,24] [,25] [,26] [,27] [,28]
## time -0.07737042 0.1144370 0.01231114 0.022084633 0.2550842 -0.07494630
## timing -0.08972809 0.1423234 0.03694145 0.001559853 0.2445612 -0.07919279
## [,29] [,30] [,31] [,32] [,33] [,34]
## time 0.183799 0.1565165 -0.1971460 0.1906143 -0.2779255 0.09122123
## timing 0.178959 0.1732142 -0.2147351 0.1713849 -0.3568698 0.15957016
## [,35] [,36] [,37] [,38] [,39] [,40]
## time 0.10281681 -0.10707969 -0.2297566 -0.1279059 -0.1924931 -0.2512977
## timing 0.03638727 -0.08419075 -0.2362274 -0.1197964 -0.1748915 -0.2812563
## [,41] [,42] [,43] [,44] [,45] [,46]
## time -0.10382315 -0.01779083 0.4809303 0.08179988 -0.1925488 -0.13714731
## timing -0.08868551 0.02378480 0.4893288 0.09510561 -0.2041904 -0.09322326
## [,47] [,48] [,49] [,50] [,51] [,52]
## time 0.010338202 0.2487794 -0.06054345 0.03952054 0.09150095 -0.09987199
## timing 0.009296207 0.2033333 -0.10158393 0.06320308 0.09833961 -0.10175142
## [,53] [,54] [,55] [,56] [,57] [,58]
## time 0.1699462 0.08838340 -0.2158270 -0.008011733 -0.08851113 -0.3705585
## timing 0.2526238 0.06378453 -0.2260311 -0.018755538 -0.10650443 -0.3642533
## [,59] [,60] [,61] [,62] [,63] [,64]
## time 0.04588198 -0.01582638 -0.1702349 0.04900502 -0.1353486 0.4526037
## timing 0.02069537 -0.01828872 -0.1766143 0.02959018 -0.1763782 0.4583665
## [,65] [,66] [,67] [,68] [,69] [,70]
## time 0.1785897 -0.1104875 0.001167171 -0.2017725 -0.09577073 0.04960832
## timing 0.1521341 -0.1233444 -0.014244801 -0.1996749 -0.13736786 0.03397766
## [,71] [,72] [,73] [,74] [,75]
## time 0.002605274 0.1097845 -0.06954949 -0.2513508 -0.06048902
## timing -0.001418267 0.1040714 -0.05710314 -0.2230393 -0.08617350
## [,76] [,77] [,78] [,79] [,80]
## time 0.03735455 0.005735692 0.01011794 0.08748085 0.02329874
## timing 0.07941721 0.012174603 -0.04015623 0.06538074 0.01149520
## [,81] [,82] [,83] [,84] [,85] [,86]
## time -0.05500054 0.1812908 -0.07001343 0.1544676 -0.09727329 -0.1552624
## timing -0.02101557 0.1562615 -0.06231448 0.1751388 -0.11129720 -0.1879628
## [,87] [,88] [,89] [,90] [,91] [,92]
## time 0.013606191 0.1476225 -0.10499682 0.005484045 0.2271290 0.1721083
## timing -0.009402145 0.1339587 -0.09369148 0.015345438 0.2563034 0.1663923
## [,93] [,94] [,95] [,96] [,97] [,98]
## time 0.1915366 -0.1987056 -0.1243258 0.1142971 -0.1431325 -0.2081325
## timing 0.1433011 -0.1725464 -0.1056151 0.1517264 -0.1042065 -0.2216748
## [,99] [,100]
## time -0.1489273 -0.024113297
## timing -0.1327253 0.003199333
# Distance between the vectors of "time" and "timing"; the result is
# auto-printed as a 1x1 matrix.
# NOTE(review): the metric is presumably cosine distance -- confirm in the
# fastrtext documentation.
get_word_distance(model, "time", "timing")
## [,1]
## [1,] 0.01655479
# Remove temporary files and free memory.
#
# tmp_file_model is only the "-output" prefix handed to fastText: the model is
# written to "<prefix>.bin" (the path load_model() reports it opens) and the
# text vectors to "<prefix>.vec". Unlinking the bare prefix alone would leave
# those files behind in the temp directory, so remove the suffixed paths too.
# unlink() silently ignores paths that do not exist.
unlink(tmp_file_txt)
unlink(paste0(tmp_file_model, c("", ".bin", ".vec")))

# Drop the model object and trigger garbage collection; the gc() summary is
# printed because the call is made at top level.
rm(model)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 555738 29.7 940480 50.3 940480 50.3
## Vcells 1157172 8.9 1946338 14.9 1548136 11.9