Unsupervised learning
M. Benesty
2018-01-04
library(fastrtext)
# Load the example sentence datasets.
data("train_sentences")
data("test_sentences")

# Paths for the training corpus and for the model output.
tmp_file_txt <- tempfile()
tmp_file_model <- tempfile()

# Lower-case the training sentences and write them one per line to disk.
texts <- tolower(train_sentences[, "text"])
writeLines(texts, tmp_file_txt)

# Train an unsupervised skipgram model on the corpus file.
execute(
  commands = c(
    "skipgram",
    "-input", tmp_file_txt,
    "-output", tmp_file_model,
    "-verbose", "1"
  )
)
## Read 0M words
## Number of words: 2061
## Number of labels: 0
## Progress: 100.0% words/sec/thread: 18692 lr: 0.000000 loss: 2.815933 ETA: 0h 0m
# Load the trained model from the output path given to the training command.
model <- load_model(tmp_file_model)
## add .bin extension to the path
# Extract the words known to the model and show the first few.
dict <- get_dictionary(model)
print(head(dict, n = 5L))
## [1] "the" "</s>" "of" "to" "and"
# Show the learned embedding of two related words (one matrix row per word).
query_words <- c("time", "timing")
print(get_word_vectors(model, query_words))
## [,1] [,2] [,3] [,4] [,5] [,6]
## time 0.10612551 0.07148871 -0.01400366 0.08170515 0.2107516 -0.0856003
## timing 0.09404954 0.01110726 -0.07381661 0.14503674 0.1630365 -0.0743561
## [,7] [,8] [,9] [,10] [,11] [,12]
## time 0.06452757 -0.09309069 0.122324 -0.1623787 -0.1547531 0.004745292
## timing 0.07033922 -0.08068457 0.136004 -0.1814308 -0.1790269 0.007925374
## [,13] [,14] [,15] [,16] [,17]
## time -0.12772675 -0.037346292 0.01983134 0.2867689 0.06779584
## timing -0.05574913 -0.002046481 0.07241297 0.2812291 0.12423315
## [,18] [,19] [,20] [,21] [,22] [,23]
## time -0.12172756 -0.1413808 0.294580 -0.1018153 -0.1055674 -0.04908620
## timing -0.09207927 -0.1609025 0.343444 -0.1130185 -0.1195261 -0.06930766
## [,24] [,25] [,26] [,27] [,28] [,29]
## time 0.1014775 -0.06203054 0.024576681 0.3084509 -0.1508369 0.2751614
## timing 0.1310402 -0.04119042 -0.005296993 0.2946844 -0.1915578 0.3021637
## [,30] [,31] [,32] [,33] [,34] [,35]
## time 0.2015223 -0.1567805 0.1789271 -0.2121336 0.1109511 0.17252681
## timing 0.2049126 -0.1890568 0.1500673 -0.2708564 0.1567058 0.09886294
## [,36] [,37] [,38] [,39] [,40]
## time -0.049850523 -0.2123505 -0.13707161 -0.1758844 -0.3161319
## timing 0.005593059 -0.2113849 -0.09012935 -0.1475967 -0.3382239
## [,41] [,42] [,43] [,44] [,45] [,46]
## time -0.08585676 0.06876358 0.3835841 0.03887715 -0.1559960 -0.09014536
## timing -0.06691644 0.13491063 0.3886381 0.05718888 -0.1793154 -0.06344484
## [,47] [,48] [,49] [,50] [,51] [,52]
## time -0.004909037 0.2260344 -0.1501742 0.1084805 0.1115248 -0.001711588
## timing -0.021227960 0.1787634 -0.1597113 0.1651420 0.1235560 0.015674848
## [,53] [,54] [,55] [,56] [,57] [,58]
## time 0.2276037 0.1644684 -0.3088567 0.03679692 -0.04613351 -0.3499355
## timing 0.3194697 0.1295134 -0.3159224 0.05223772 -0.03112178 -0.3311987
## [,59] [,60] [,61] [,62] [,63] [,64]
## time 0.07351421 0.08378514 -0.1161545 0.06963819 -0.1031450 0.3446571
## timing 0.04180297 0.08577771 -0.1244836 0.06173485 -0.1001682 0.3035475
## [,65] [,66] [,67] [,68] [,69]
## time 0.14940004 -0.1716473 0.01368067 -0.08428457 -0.009838303
## timing 0.07556404 -0.1996288 -0.01908610 -0.05758634 -0.037646051
## [,70] [,71] [,72] [,73] [,74]
## time 0.08910245 -0.0124581112 0.1475237 -0.005907365 -0.3198989
## timing 0.04353718 0.0006954131 0.1313596 0.016816046 -0.2922211
## [,75] [,76] [,77] [,78] [,79]
## time 0.03249507 0.01471741 0.01742277 0.001484244 0.0426888950
## timing 0.01686049 0.05143612 0.06074193 -0.084721111 -0.0004637837
## [,80] [,81] [,82] [,83] [,84] [,85]
## time 0.04099551 -0.09909890 0.039027784 -0.2211936 0.2067799 -0.07377546
## timing 0.02473989 -0.02969332 0.005983395 -0.2555504 0.2272858 -0.10093042
## [,86] [,87] [,88] [,89] [,90] [,91]
## time -0.1851192 0.10437235 0.1726995 -0.1654970 -0.01782419 0.2476783
## timing -0.2301955 0.09857988 0.1706954 -0.1782061 0.01217376 0.2932041
## [,92] [,93] [,94] [,95] [,96] [,97]
## time 0.2673696 0.1590543 -0.1627807 -0.2106414 0.02332520 -0.12616825
## timing 0.2699354 0.1204549 -0.1377168 -0.2183254 0.06358713 -0.09300115
## [,98] [,99] [,100]
## time -0.1453499 -0.1403460 0.004437986
## timing -0.1514001 -0.1157987 0.027878668
# Distance between the embeddings of two related words.
word_pair <- c("time", "timing")
get_word_distance(model, word_pair[1], word_pair[2])
## [,1]
## [1,] 0.02508389
# Free memory and remove the temporary files.
unlink(tmp_file_txt)
# `tmp_file_model` is only the output *prefix*: load_model() reported having
# to add ".bin" to it, so the model is stored under suffixed names. Remove the
# prefix and the ".bin" / ".vec" files written by fastText; unlink() silently
# skips any path that does not exist.
unlink(paste0(tmp_file_model, c("", ".bin", ".vec")))
rm(model)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 559133 29.9 940480 50.3 940480 50.3
## Vcells 1160819 8.9 1943194 14.9 1548727 11.9