# Attach the packages used in this document through pacman.
pacman::p_load(dplyr, quanteda, stringr)
Word Embedding and Large Language Models
# Read the embedding object stored in "glove.rds"; rows are looked up by word.
glove <- readRDS("glove.rds")
# Print the embedding stored for the word "fish".
glove["fish", ]
[1] 0.5639300 0.2583200 0.0117840 0.0413610 0.1514700 0.7130600
[7] 0.0164700 0.3509500 0.0894680 -0.7242100 0.1238400 -0.4943800
[13] -0.5038600 0.4367500 0.0501670 -0.5058800 0.0073237 -0.0277530
[19] -0.6971000 0.7293200 -0.4678000 0.5557000 0.2325800 0.4913300
[25] -0.0193830 0.2928500 -0.0932510 -0.2155100 -0.4063900 0.0187160
[31] -0.3604400 0.3812400 -0.6636600 -0.4052400 -0.3204000 0.2872200
[37] 0.5389700 0.0159480 -0.2526000 0.1708900 -0.2293700 -0.1090000
[43] 0.4120500 0.5186800 -0.4188300 0.0459550 0.3628600 -0.2630600
[49] 0.3818300 0.6455500 -0.1191500 0.2302500 0.6056100 -0.4195000
[55] 0.0606410 0.6559200 -0.2417400 -0.0487490 -0.1855200 0.1593300
[61] 0.2186300 -0.3195100 0.9200800 0.0453260 -0.0528270 -0.6127300
[67] -0.8080600 0.0326350 -0.1097200 -0.0026901 0.4292600 -0.0438710
[73] -0.0947680 0.2366600 -0.4539600 0.2475000 0.5605900 0.7588600
[79] 0.0317250 -0.2826500 0.3488200 -0.0743580 -0.4317800 -0.3137600
[85] 0.2914400 -0.3039900 0.1303600 -0.0204770 -0.4568900 -0.4538400
[91] 0.1539100 -0.1434200 -0.0998070 -0.1660400 0.3350000 0.4276900
[97] -0.1936700 0.0634180 -0.1069900 -0.2017200 0.1252300 -0.2108300
[103] 0.5932200 -0.7355800 0.1971600 0.2821700 0.4482600 0.0926840
[109] 0.0211580 0.0666840 0.0900140 0.2199000 -0.8599200 -0.0457730
[115] 0.4328800 -0.2152800 -0.0612190 0.2717100 -0.1775200 0.2419300
[121] -0.6338800 -0.5251800 -0.1895000 0.8159100 -0.0745480 0.2997600
[127] 0.0573730 0.1185800 0.6703100 0.4748500 -0.1309800 0.2994500
[133] 0.6372200 0.0925630 -0.1655800 -0.2039900 0.7954400 -0.2074800
[139] 0.0195220 0.8102800 0.6085500 0.0575530 0.4657900 -0.8011500
[145] -0.3854500 0.3406500 -0.0291020 0.1693700 -0.1178600 -0.3242200
[151] -0.2650900 0.2306300 -0.1059500 -0.1520700 0.1745600 -0.2507200
[157] -0.2852200 -0.2451300 -0.1174000 -0.2350300 0.3866400 0.1975600
[163] 0.3297600 -0.1110300 -0.0027027 -0.2133800 0.3627200 -0.2039700
[169] -0.4137000 0.1604100 0.0028880 -0.4208900 0.0610520 0.2599200
[175] -0.0067039 0.1191500 0.0714930 0.0682870 0.4170700 -0.5688500
[181] 0.4694400 0.1801600 -0.2338100 -0.5474300 0.2188600 -0.5683400
[187] 0.7118800 0.6526800 0.6289400 -0.1496200 -0.0548750 0.8858200
[193] -0.4133700 -0.1831700 0.4951500 0.4578700 0.0133010 -0.6900300
[199] -0.3920300 -0.3443600 0.9336700 -0.0836890 -0.2674700 -0.3032400
[205] -0.3824900 0.7269000 -0.0600540 -0.3172900 -0.0627750 0.1709400
[211] -0.0044480 -0.4891200 0.3127200 -0.0265630 0.3335300 0.1052900
[217] 0.4193900 0.2240000 0.4050900 -0.3528000 0.3570500 -0.3838300
[223] -0.5392600 -0.4019500 0.2778200 0.0469420 -0.0599800 -0.1093400
[229] 0.2074800 0.4494600 0.2678000 0.4910600 0.0453970 -0.0315890
[235] 0.4444300 0.3296700 -0.2503400 0.3647000 -0.1703700 -0.0130700
[241] -0.7895500 0.4972000 -0.3121900 -0.1915500 -1.2220000 -0.0343950
[247] 0.7855400 -0.2005900 -0.5918800 -0.9835200 0.1794000 -0.4991600
[253] 0.0360820 0.2717100 0.8347900 0.5765700 0.2142200 0.8360200
[259] -0.4412500 0.1773400 -0.2473200 -0.0328380 -0.5930600 -0.3921200
[265] 0.5033800 -0.2307900 -0.1051300 0.3240000 -0.3024200 0.2460300
[271] -0.1465000 0.3592000 0.5705900 0.4909400 0.2571000 0.0756700
[277] -1.7668000 0.1332700 -0.7965300 -0.2462800 -0.1972300 0.7951200
[283] 0.0775740 0.1391900 -1.0390000 -0.3458400 -0.2325100 -0.1649600
[289] 0.7552400 0.1689600 -0.3830700 0.4089800 0.1325300 -0.2052400
[295] -0.4056700 -0.6198500 0.2983800 -0.4480600 0.2065600 -0.3885400
#' Cosine similarity between the embeddings of two words
#'
#' @param w1,w2 Row identifiers (words) used to index `embed`.
#' @param embed Numeric matrix holding one embedding per row.
#' @return A 1 x 1 matrix containing the cosine similarity of the two rows.
word_cos_sim <- function(w1, w2, embed) {
  # Look each row up once instead of re-indexing for the dot product and norms.
  v1 <- embed[w1, ]
  v2 <- embed[w2, ]
  # `%*%` on two plain vectors is their inner product, as a 1 x 1 matrix.
  (v1 %*% v2) / (norm(v1, type = "2") * norm(v2, type = "2"))
}
# Cosine similarity between the embeddings of "fish" and "salmon".
word_cos_sim("fish", "salmon", embed = glove)
[,1]
[1,] 0.6541996
# Cosine similarity between the embeddings of "fish" and "giraffe".
word_cos_sim("fish", "giraffe", embed = glove)
[,1]
[1,] 0.1038395
#' Find the rows of an embedding matrix closest to a query
#'
#' @param w Either a single word (a row name of `embed`) or a numeric vector
#'   with one entry per column of `embed`.
#' @param embed Numeric matrix of embeddings, one named row per word.
#' @param top_n Maximum number of neighbours to return.
#' @return Named numeric vector of cosine similarities in decreasing order,
#'   with at most `top_n` entries. When `w` is a word, that word itself is
#'   left out of the result.
most_similar_words <- function(w, embed, top_n = 10) {
  by_word <- is.character(w)
  if (by_word) {
    if (length(w) != 1L || !w %in% rownames(embed)) {
      stop("`w` must be a single word present in rownames(embed).",
           call. = FALSE)
    }
    query <- embed[w, ]
  } else {
    query <- w
  }

  # Score every row with one matrix product instead of a per-row apply();
  # the query norm is computed once rather than once per row.
  row_norms <- sqrt(rowSums(embed^2))
  sims <- drop(embed %*% query) / (row_norms * sqrt(sum(query^2)))

  if (by_word) {
    # Remove the query word by name rather than assuming it sorts first.
    sims <- sims[!names(sims) %in% w]
  }

  # head() returns fewer than top_n entries (never NA padding) when the
  # vocabulary is small, and an empty vector when top_n is 0.
  head(sort(sims, decreasing = TRUE), top_n)
}
# Analogy arithmetic on embeddings: berlin - germany + france.
glove["berlin", ] - glove["germany", ] + glove["france", ]
[1] 0.1593480 -0.2449800 -0.3969100 -0.3693600 0.0408160 0.4310420
[7] 0.1436520 -0.5391020 -0.2657620 -0.8131200 0.1920340 -0.1880900
[13] 0.0948170 -0.1002900 0.3464600 0.3787300 -0.4898290 1.0012580
[19] 0.1593500 0.4696400 -0.5971800 0.1259300 0.2129760 -0.2240480
[25] 0.2632680 -0.2695700 0.4503100 -0.2765800 0.2060300 0.4942400
[31] 0.3017600 -0.7015923 -0.7801700 0.5599700 -0.6832100 -0.9021100
[37] -0.0564500 -0.2414550 -0.5759900 -0.1119110 -0.3866900 -0.2037000
[43] -0.0385500 0.2165500 -0.2987250 0.6085000 -0.1089500 0.7300900
[49] -0.3076650 -0.2265200 0.4095100 -0.1826300 0.7471180 -0.7209600
[55] 0.6595400 -0.1934100 -0.5467250 0.0000670 0.3715457 -0.8449400
[61] -0.3379400 0.3629950 0.1017800 0.2802000 -0.3472210 -0.2909400
[67] 0.4565720 -0.0773470 -0.4129500 -0.7792600 0.2421800 -0.5980787
[73] -0.0005120 0.0224300 -0.5563580 0.5590880 1.0116290 0.5257300
[79] -0.7774170 -0.0310030 0.0719760 -0.1657820 0.2271640 0.3705300
[85] -0.6682800 -0.3637330 0.2098200 0.4538230 -0.3972200 -0.4923200
[91] 0.5374700 -0.9596000 0.3615500 -0.1437200 -0.1119720 0.0389800
[97] 0.2161620 0.1561500 -0.1696000 -0.3248940 -0.3008400 0.0302080
[103] 0.8809400 -0.1185000 -0.0028100 0.4822480 -0.1735600 -0.0507585
[109] 0.6094600 0.2515470 0.2497800 0.1417580 0.4890370 -0.4979300
[115] -0.6720040 -0.5260800 0.2266420 0.5455767 0.4504060 -0.3437000
[121] -0.0172700 0.0650510 0.2775930 -0.5412500 -0.0225180 -0.5611700
[127] 0.0845912 0.0801650 -0.1238550 -0.0744600 0.3232340 0.2338270
[133] 0.1454290 -0.6090261 -0.3415400 -0.5866650 -0.2485900 -0.5493977
[139] 0.0560890 -0.5522870 -0.0957200 0.0895900 -0.1877065 0.0453630
[145] -0.0161950 -0.6878000 -0.0942600 -0.3110020 0.0114740 0.6311700
[151] 0.5532300 0.1019200 0.3364900 -0.1124431 0.4141500 0.1309670
[157] -0.4831450 0.0785995 -0.5587800 -0.4582960 0.0396200 0.3322010
[163] -0.3931000 -0.7032110 -0.1764300 -0.0059400 0.0515990 -0.8850470
[169] -0.3013100 0.5605071 -0.1683200 -0.3154500 0.1022960 -1.0115200
[175] 0.1808600 0.1590650 -0.2164100 -0.4033400 -0.7641500 0.3613000
[181] 0.2470200 0.2841900 -0.0319280 0.4181500 0.0930700 0.4908300
[187] -1.0893700 0.6234000 -0.2588890 -0.3743200 -0.2376668 -0.3697363
[193] -0.0672270 -0.7011780 -0.1848480 -0.7412900 -0.8720100 0.3357800
[199] 0.0158620 0.4829960 1.2706000 -0.2638648 -0.3453100 -0.0937720
[205] 0.2234000 -0.0841347 -0.6527100 -0.0175520 -0.1448300 -0.2698200
[211] 0.3441600 0.2884630 -0.2600100 0.3293900 0.3470260 0.3716800
[217] 0.5941800 -0.9450900 0.5246500 -0.4480200 0.7079500 -0.0763050
[223] -0.1130080 -0.1840900 -0.3533200 -0.0800510 0.1915430 0.3086800
[229] -0.3964300 -0.0375500 -0.1339010 -0.2323500 -0.1784400 -0.3056480
[235] 0.4593870 -0.0642810 0.1678270 0.1044520 0.4598000 -0.2069900
[241] -0.1113600 -0.8775360 0.4248800 -0.3236300 0.3835500 -0.0472330
[247] -0.5865810 -0.6398230 -0.1529098 -0.4405304 0.4135900 0.4489900
[253] 0.4504600 0.3509800 0.4507100 -0.0114040 -0.0025650 -0.4112900
[259] 0.0918540 -0.3576490 0.3085630 0.1729450 -0.0956200 1.3021130
[265] 1.0966500 0.2874100 -0.0489800 0.1640040 0.4866860 0.1681840
[271] 0.4557470 0.7073100 -0.3641500 -0.3238340 0.3370500 0.2279800
[277] -1.2726000 0.0138000 0.2507000 0.0519335 0.0289500 -0.0869000
[283] 0.3197600 -0.6320900 -0.0374200 0.6283400 0.1358100 0.7882100
[289] 0.0629600 0.3067200 0.4900300 -0.3634210 0.0596200 -0.7462900
[295] 0.0770500 -0.0206700 0.2015100 0.1606800 -0.4063610 -0.4658500
# Nearest words to the analogy vector berlin - germany + france.
most_similar_words(
  w = glove["berlin", ] - glove["germany", ] + glove["france", ],
  embed = glove
)
paris france prohertrib french brussels berlin le
0.7907898 0.6829652 0.6757443 0.6265681 0.5365580 0.5024956 0.4737045
parisian marseille lyon
0.4493072 0.4470702 0.4469211
# Nearest words to the analogy vector harrisburg - pennsylvania + california.
most_similar_words(
  w = glove["harrisburg", ] - glove["pennsylvania", ] + glove["california", ],
  embed = glove
)
california calif. san bakersfield diego bernardino
0.5974964 0.5604969 0.5552560 0.5311038 0.5256530 0.5081510
angeles sacramento fresno francisco
0.4999010 0.4914256 0.4903882 0.4636588
# Nearest words to the analogy vector actor - man + woman.
most_similar_words(
  w = glove["actor", ] - glove["man", ] + glove["woman", ],
  embed = glove
)
actress actor actresses actors comedian singer
0.8438846 0.8072483 0.6134423 0.5958819 0.5540746 0.5529440
screenwriter starring starred woman
0.5515834 0.5486785 0.5445139 0.5426185
# Nearest words to the analogy vector doctor - man + woman.
most_similar_words(
  w = glove["doctor", ] - glove["man", ] + glove["woman", ],
  embed = glove
)
doctor physician nurse woman doctors pregnant dentist medical
0.8331191 0.6203881 0.6161286 0.6135359 0.6017279 0.5400155 0.5321170 0.5225777
mother surgeon
0.5143166 0.5109706
# Nearest words to the analogy vector doctor - woman + man.
most_similar_words(
  w = glove["doctor", ] - glove["woman", ] + glove["man", ],
  embed = glove
)
doctor man physician dr. he brother mr. medical
0.7824572 0.5426459 0.5128607 0.5042355 0.4661550 0.4635633 0.4612041 0.4424052
doctors surgeon
0.4420476 0.4393367
#' Cosine similarity between two strings via averaged word embeddings
#'
#' Both strings are lower-cased and tokenised with quanteda; punctuation,
#' numbers and English stopwords are removed. Each string is then represented
#' by the column-wise mean of its tokens' rows in `embed`.
#'
#' @param string1,string2 Strings to compare.
#' @param embed Numeric matrix of embeddings with one named row per word.
#' @return A 1 x 1 matrix holding the cosine similarity of the two averaged
#'   vectors.
cos_similarity_avg_vec <- function(string1, string2, embed) {
  # Named `toks` so the local value does not shadow quanteda's tokens().
  toks <- corpus(tolower(c(string1, string2))) %>%
    tokens(remove_punct = TRUE, remove_numbers = TRUE) %>%
    tokens_remove(pattern = stopwords("en"))

  # drop = FALSE keeps a one-token lookup as a 1-row matrix, which colMeans()
  # requires; without it a single remaining token raises an error.
  mean_embedding <- function(words) {
    colMeans(embed[words, , drop = FALSE])
  }
  vec1 <- mean_embedding(toks[[1]])
  vec2 <- mean_embedding(toks[[2]])

  (vec1 %*% vec2) / (norm(vec1, type = "2") * norm(vec2, type = "2"))
}
# Compare two sentences through their averaged word embeddings.
cos_similarity_avg_vec(
  string1 = "I like fruits",
  string2 = "I like apples and oranges",
  embed = glove
)
[,1]
[1,] 0.7135683
# Compare two sentences through their averaged word embeddings.
cos_similarity_avg_vec(
  string1 = "I like fruits",
  string2 = "I enjoy eating fruits",
  embed = glove
)
[,1]
[1,] 0.8200876
# Compare two sentences through their averaged word embeddings.
cos_similarity_avg_vec(
  string1 = "I like fruits",
  string2 = "The sky is blue",
  embed = glove
)
[,1]
[1,] 0.2889585