Unsupervised learning
M. Benesty
2017-11-06
library(fastrtext)
# Load the train/test sentence datasets.
data("train_sentences")
data("test_sentences")

# Temporary paths: one for the training text, one passed to fastText as
# the `-output` location.
tmp_file_txt <- tempfile()
tmp_file_model <- tempfile()

# Lower-case the "text" column and write it out, one entry per line.
texts <- tolower(train_sentences[, "text"])
writeLines(text = texts, con = tmp_file_txt)

# Run the fastText `skipgram` command on the text file.
execute(
  commands = c(
    "skipgram",
    "-input", tmp_file_txt,
    "-output", tmp_file_model,
    "-verbose", 1
  )
)
##
## Read 0M words
## Number of words: 2061
## Number of labels: 0
##
## Progress: 100.0% words/sec/thread: 17850 lr: 0.000000 loss: 2.565117 eta: 0h0m
# Load the model from the path that was given to `-output` during training.
model <- load_model(path = tmp_file_model)
## add .bin extension to the path
# Word extraction: fetch the dictionary and show its first five entries.
dict <- get_dictionary(model)
print(head(dict, n = 5))
## [1] "the" "</s>" "of" "to" "and"
# Print the vectors of two words.
print(get_word_vectors(model, words = c("time", "timing")))
## [,1] [,2] [,3] [,4] [,5] [,6]
## time 0.03395659 0.1470706 -0.1058364 0.06878183 0.2309285 0.08670054
## timing 0.07405758 0.1085439 -0.1618748 0.13510625 0.2442455 0.08400857
## [,7] [,8] [,9] [,10] [,11]
## time 0.02896640 0.005708979 -0.05033836 -0.05676518 0.08089076
## timing 0.05488665 0.011414430 -0.02430609 -0.07843672 0.08025268
## [,12] [,13] [,14] [,15] [,16] [,17]
## time -0.1100482 -0.08806141 -0.03016183 0.01264069 0.1562749 -0.07370672
## timing -0.1148377 -0.03641872 -0.01254931 0.05481669 0.1380973 -0.01840515
## [,18] [,19] [,20] [,21] [,22]
## time -0.00132384 -0.07566715 0.006581096 -0.2997985 -0.02544638
## timing 0.02809528 -0.09056149 -0.008341677 -0.3440099 -0.04793579
## [,23] [,24] [,25] [,26] [,27] [,28]
## time 0.03822010 0.05653326 -0.09892341 -0.1603222 0.2401310 -0.3608408
## timing 0.03535323 0.06697033 -0.05893456 -0.2078563 0.2188988 -0.4095490
## [,29] [,30] [,31] [,32] [,33] [,34]
## time 0.06792628 -0.03036106 0.001180083 0.08089674 -0.2813881 0.3012297
## timing 0.09410859 -0.01473445 0.010983766 0.07254659 -0.3645603 0.3640393
## [,35] [,36] [,37] [,38] [,39]
## time -0.005681373 0.007189576 -0.4793869 -0.00365201 -0.1695314
## timing -0.072563231 0.039331511 -0.4984366 0.03357036 -0.1592357
## [,40] [,41] [,42] [,43] [,44] [,45]
## time -0.1274402 -0.2056829 0.2732867 0.3806124 0.04418882 -0.1379555
## timing -0.1524590 -0.2046957 0.3321827 0.3837048 0.04897582 -0.1643560
## [,46] [,47] [,48] [,49] [,50] [,51]
## time -0.2657112 0.09444665 0.04182133 -0.2980613 0.1187810 0.05404853
## timing -0.2357648 0.09773789 -0.02136410 -0.3200164 0.1316597 0.01960943
## [,52] [,53] [,54] [,55] [,56] [,57]
## time -0.2663261 0.1983692 0.04460498 -0.3296352 -0.1995742 -0.1101850
## timing -0.2323197 0.2697556 -0.01013977 -0.3450947 -0.1700735 -0.1021873
## [,58] [,59] [,60] [,61] [,62] [,63]
## time -0.2103852 0.1975938 -0.06060537 -0.2789482 0.01219837 -0.1785361
## timing -0.1650001 0.1324280 -0.07333889 -0.2980339 -0.01236813 -0.1698531
## [,64] [,65] [,66] [,67] [,68] [,69]
## time 0.16413416 0.02298515 -0.1908897 -0.1314048 -0.3613428 0.07670279
## timing 0.09903549 -0.03361671 -0.2101583 -0.1608602 -0.3201561 0.05508030
## [,70] [,71] [,72] [,73] [,74]
## time 0.08774014 0.06846005 0.014074524 -0.2255118 -0.01239411
## timing 0.07171863 0.08590530 -0.001346468 -0.1820332 0.01407885
## [,75] [,76] [,77] [,78] [,79] [,80]
## time -0.01097302 -0.1626983 0.006974617 0.2626570 0.03390564 0.07947004
## timing -0.02266649 -0.1332968 0.054836825 0.2141165 -0.01603279 0.04765026
## [,81] [,82] [,83] [,84] [,85] [,86]
## time -0.16737209 0.1290229 -0.02671744 0.04563151 0.03909177 -0.1143934
## timing -0.09552445 0.0981615 -0.05118698 0.06089164 0.02142411 -0.1404413
## [,87] [,88] [,89] [,90] [,91] [,92]
## time 0.242874 0.1472319 -0.10438239 -0.05576196 0.2115069 0.03797328
## timing 0.227627 0.1721801 -0.07830971 -0.04612612 0.2292118 0.02529217
## [,93] [,94] [,95] [,96] [,97] [,98]
## time 0.08971343 -0.05291241 -0.2635795 0.1595984 -0.2170034 -0.07207751
## timing 0.02344621 -0.01787464 -0.2559279 0.2059559 -0.1996885 -0.04828427
## [,99] [,100]
## time 0.02693086 -0.07465150
## timing 0.07960488 -0.06226416
# Distance between the vectors of two words.
get_word_distance(model, w1 = "time", w2 = "timing")
## [,1]
## [1,] 0.02274952
# Free memory and remove temporary files.
# fastText does not write to the bare `-output` path: it appends its own
# extensions (load_model() reports adding ".bin"; the text vectors go to
# ".vec"). Unlink those suffixed paths too, otherwise the model files are
# left behind in the temp directory. unlink() silently ignores paths that
# do not exist, so listing all candidates is safe.
unlink(tmp_file_txt)
unlink(paste0(tmp_file_model, c("", ".bin", ".vec")))
rm(model)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 556591 29.8 940480 50.3 940480 50.3
## Vcells 1156124 8.9 1943012 14.9 1551572 11.9