Unsupervised learning
M. Benesty
2017-09-18
library(fastrtext)
# Load the two example corpora used by this walkthrough.
data("train_sentences", "test_sentences")

# Lower-case the "text" column of the training set; these sentences are the
# unlabeled training corpus.
texts <- tolower(train_sentences[, "text"])

# Reserve two temporary paths: one for the corpus written to disk, one passed
# to the trainer as its "-output" path.
tmp_file_txt <- tempfile()
tmp_file_model <- tempfile()

# The trainer reads its corpus from a file, one sentence per line.
writeLines(texts, tmp_file_txt)

# Run the "skipgram" command on the corpus file (unsupervised training).
execute(
  commands = c(
    "skipgram",
    "-input", tmp_file_txt,
    "-output", tmp_file_model,
    "-verbose", "1"
  )
)
## Read 0M words
## Number of words: 2061
## Number of labels: 0
## Progress: 100.0% words/sec/thread: 18593 lr: 0.000000 loss: 2.373291 eta: 0h0m
# Load the trained model back from the path given to "-output".
model <- load_model(tmp_file_model)
## add .bin extension to the path
# Fetch the model's dictionary and show its first five entries.
dict <- get_dictionary(model)
print(head(dict, n = 5L))
## [1] "the" "</s>" "of" "to" "and"
# Print the embedding vectors of two related words (one row per word).
print(
  get_word_vectors(
    model,
    c("time", "timing")
  )
)
## [,1] [,2] [,3] [,4] [,5] [,6]
## time 0.1810962 0.091241419 -0.03272095 0.1411836 0.04008620 0.04082145
## timing 0.2088803 0.006214355 -0.08493112 0.2028206 -0.01282202 0.02843329
## [,7] [,8] [,9] [,10] [,11]
## time 0.1173773 -0.006488502 -0.08286844 -0.4089664 -0.09718096
## timing 0.1315300 0.009406463 -0.05926144 -0.4115326 -0.10840150
## [,12] [,13] [,14] [,15] [,16] [,17]
## time -0.08043761 -0.12648851 -0.1792859 0.1501502 0.2399595 -0.017405044
## timing -0.07798404 -0.07228893 -0.1559062 0.1958283 0.2047626 -0.003985534
## [,18] [,19] [,20] [,21] [,22]
## time -0.11286198 -0.05555766 0.06132117 -0.09349498 0.06450543
## timing -0.06139282 -0.07624672 0.06129446 -0.12990449 0.04641122
## [,23] [,24] [,25] [,26] [,27] [,28]
## time -0.01703859 0.2523808 0.1551545 -0.1912673 0.06897227 -0.1221295
## timing -0.02958799 0.2944891 0.1853036 -0.2466569 0.04911768 -0.1635324
## [,29] [,30] [,31] [,32] [,33] [,34]
## time -0.03134793 -0.06415915 0.03088924 0.05158316 -0.4464382 0.2515295
## timing -0.01831852 -0.05166331 0.04040765 0.04673322 -0.5376940 0.3356575
## [,35] [,36] [,37] [,38] [,39]
## time -0.1111264 0.03204155 -0.3211517 0.0001543164 -0.2078307
## timing -0.1817773 0.06042979 -0.3446439 0.0603916459 -0.1599268
## [,40] [,41] [,42] [,43] [,44] [,45]
## time -0.03765364 -0.002702016 0.3641257 0.2519707 -0.08028603 -0.2670054
## timing -0.04089422 0.017883314 0.4110973 0.2057763 -0.07353166 -0.2764683
## [,46] [,47] [,48] [,49] [,50] [,51]
## time -0.1647325 0.02423937 -0.1501303 -0.1554566 0.1231574 -0.07282387
## timing -0.1568905 0.04621249 -0.2406959 -0.2102277 0.1406883 -0.09688005
## [,52] [,53] [,54] [,55] [,56] [,57]
## time -0.1775701 0.1618737 0.06732297 -0.04623672 -0.1711912 -0.1153811
## timing -0.1829287 0.2544785 0.02287204 -0.05464637 -0.1499546 -0.1125418
## [,58] [,59] [,60] [,61] [,62]
## time -0.2052858 0.09949223 -0.07293445 -0.08733374 -0.06538917
## timing -0.1554236 0.04544684 -0.08035001 -0.07825585 -0.08396165
## [,63] [,64] [,65] [,66] [,67] [,68]
## time -0.2541366 0.2872204 0.004539874 -0.2434946 -0.1632082 -0.3228838
## timing -0.2538847 0.2316661 -0.064338997 -0.2582107 -0.2079943 -0.3043610
## [,69] [,70] [,71] [,72] [,73] [,74]
## time -0.01972223 -0.160838 0.006598057 0.05379357 -0.1623145 0.02683534
## timing -0.05862435 -0.178684 0.030743094 0.04776795 -0.1461382 0.05982563
## [,75] [,76] [,77] [,78] [,79] [,80]
## time 0.05596030 0.1946809 -0.06664713 0.1740213 -0.1227354 0.2281437
## timing 0.04829667 0.1922634 -0.01243966 0.1359084 -0.2028181 0.1990591
## [,81] [,82] [,83] [,84] [,85]
## time 0.04972129 0.020158198 0.02750161 0.03340785 0.1036480
## timing 0.12683350 0.005008661 -0.02642920 0.02673675 0.1098221
## [,86] [,87] [,88] [,89] [,90] [,91]
## time -0.003992621 0.03330673 0.3244937 -0.1897279 -0.1270247 0.3045833
## timing -0.034443688 0.03253958 0.3478365 -0.1766882 -0.1198853 0.3332068
## [,92] [,93] [,94] [,95] [,96] [,97]
## time 0.06891263 0.05959697 -0.05578373 -0.1272105 0.1327141 -0.1079597
## timing 0.05631062 0.01493023 -0.01612334 -0.1422063 0.1822213 -0.1058144
## [,98] [,99] [,100]
## time -0.08175591 -0.08468916 0.09661242
## timing -0.08469449 -0.03905548 0.14269169
# test word distance: compare the vectors of "time" and "timing".
# NOTE(review): the metric is whatever get_word_distance() implements
# (presumably a cosine-based distance -- confirm in the fastrtext docs).
# The value printed below is small, i.e. the two vectors are close.
get_word_distance(model, "time", "timing")
## [,1]
## [1,] 0.0258146
# Clean up: delete the temporary files, then release the model's memory.

# Remove the training corpus written for the trainer.
unlink(tmp_file_txt)

# tempfile() only reserves a name, and the trainer appends extensions to the
# "-output" path (load_model() reports adding ".bin"; fastText also writes a
# ".vec" file). Unlinking the bare path alone therefore removes nothing, so
# the generated files are deleted explicitly. The bare path is kept in the
# list for backward compatibility; unlink() silently ignores missing files.
unlink(c(tmp_file_model, paste0(tmp_file_model, c(".bin", ".vec"))))

# Drop the R reference to the model, then trigger garbage collection so the
# memory held by it can be reclaimed.
rm(model)
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 556989 29.8 940480 50.3 940480 50.3
## Vcells 1159542 8.9 1946338 14.9 1554547 11.9