Word Embedding and Large Language Models

Author

Pei-Hsun Hsieh

# Load the packages used in this document through pacman.
pacman::p_load(dplyr,quanteda,stringr)
# Read the word-embedding object from disk; it is indexed by word (row name)
# below. Presumably pre-trained GloVe vectors, judging by the file name.
glove <- readRDS("glove.rds")
# Look up the embedding of a single word; the printed vector has 300 entries.
glove["fish",]
  [1]  0.5639300  0.2583200  0.0117840  0.0413610  0.1514700  0.7130600
  [7]  0.0164700  0.3509500  0.0894680 -0.7242100  0.1238400 -0.4943800
 [13] -0.5038600  0.4367500  0.0501670 -0.5058800  0.0073237 -0.0277530
 [19] -0.6971000  0.7293200 -0.4678000  0.5557000  0.2325800  0.4913300
 [25] -0.0193830  0.2928500 -0.0932510 -0.2155100 -0.4063900  0.0187160
 [31] -0.3604400  0.3812400 -0.6636600 -0.4052400 -0.3204000  0.2872200
 [37]  0.5389700  0.0159480 -0.2526000  0.1708900 -0.2293700 -0.1090000
 [43]  0.4120500  0.5186800 -0.4188300  0.0459550  0.3628600 -0.2630600
 [49]  0.3818300  0.6455500 -0.1191500  0.2302500  0.6056100 -0.4195000
 [55]  0.0606410  0.6559200 -0.2417400 -0.0487490 -0.1855200  0.1593300
 [61]  0.2186300 -0.3195100  0.9200800  0.0453260 -0.0528270 -0.6127300
 [67] -0.8080600  0.0326350 -0.1097200 -0.0026901  0.4292600 -0.0438710
 [73] -0.0947680  0.2366600 -0.4539600  0.2475000  0.5605900  0.7588600
 [79]  0.0317250 -0.2826500  0.3488200 -0.0743580 -0.4317800 -0.3137600
 [85]  0.2914400 -0.3039900  0.1303600 -0.0204770 -0.4568900 -0.4538400
 [91]  0.1539100 -0.1434200 -0.0998070 -0.1660400  0.3350000  0.4276900
 [97] -0.1936700  0.0634180 -0.1069900 -0.2017200  0.1252300 -0.2108300
[103]  0.5932200 -0.7355800  0.1971600  0.2821700  0.4482600  0.0926840
[109]  0.0211580  0.0666840  0.0900140  0.2199000 -0.8599200 -0.0457730
[115]  0.4328800 -0.2152800 -0.0612190  0.2717100 -0.1775200  0.2419300
[121] -0.6338800 -0.5251800 -0.1895000  0.8159100 -0.0745480  0.2997600
[127]  0.0573730  0.1185800  0.6703100  0.4748500 -0.1309800  0.2994500
[133]  0.6372200  0.0925630 -0.1655800 -0.2039900  0.7954400 -0.2074800
[139]  0.0195220  0.8102800  0.6085500  0.0575530  0.4657900 -0.8011500
[145] -0.3854500  0.3406500 -0.0291020  0.1693700 -0.1178600 -0.3242200
[151] -0.2650900  0.2306300 -0.1059500 -0.1520700  0.1745600 -0.2507200
[157] -0.2852200 -0.2451300 -0.1174000 -0.2350300  0.3866400  0.1975600
[163]  0.3297600 -0.1110300 -0.0027027 -0.2133800  0.3627200 -0.2039700
[169] -0.4137000  0.1604100  0.0028880 -0.4208900  0.0610520  0.2599200
[175] -0.0067039  0.1191500  0.0714930  0.0682870  0.4170700 -0.5688500
[181]  0.4694400  0.1801600 -0.2338100 -0.5474300  0.2188600 -0.5683400
[187]  0.7118800  0.6526800  0.6289400 -0.1496200 -0.0548750  0.8858200
[193] -0.4133700 -0.1831700  0.4951500  0.4578700  0.0133010 -0.6900300
[199] -0.3920300 -0.3443600  0.9336700 -0.0836890 -0.2674700 -0.3032400
[205] -0.3824900  0.7269000 -0.0600540 -0.3172900 -0.0627750  0.1709400
[211] -0.0044480 -0.4891200  0.3127200 -0.0265630  0.3335300  0.1052900
[217]  0.4193900  0.2240000  0.4050900 -0.3528000  0.3570500 -0.3838300
[223] -0.5392600 -0.4019500  0.2778200  0.0469420 -0.0599800 -0.1093400
[229]  0.2074800  0.4494600  0.2678000  0.4910600  0.0453970 -0.0315890
[235]  0.4444300  0.3296700 -0.2503400  0.3647000 -0.1703700 -0.0130700
[241] -0.7895500  0.4972000 -0.3121900 -0.1915500 -1.2220000 -0.0343950
[247]  0.7855400 -0.2005900 -0.5918800 -0.9835200  0.1794000 -0.4991600
[253]  0.0360820  0.2717100  0.8347900  0.5765700  0.2142200  0.8360200
[259] -0.4412500  0.1773400 -0.2473200 -0.0328380 -0.5930600 -0.3921200
[265]  0.5033800 -0.2307900 -0.1051300  0.3240000 -0.3024200  0.2460300
[271] -0.1465000  0.3592000  0.5705900  0.4909400  0.2571000  0.0756700
[277] -1.7668000  0.1332700 -0.7965300 -0.2462800 -0.1972300  0.7951200
[283]  0.0775740  0.1391900 -1.0390000 -0.3458400 -0.2325100 -0.1649600
[289]  0.7552400  0.1689600 -0.3830700  0.4089800  0.1325300 -0.2052400
[295] -0.4056700 -0.6198500  0.2983800 -0.4480600  0.2065600 -0.3885400
# Cosine similarity between the embeddings of two words.
#
# w1, w2: row identifiers (words) used to index `embed`.
# embed:  numeric matrix with one embedding per row.
#
# Returns a 1 x 1 matrix: the dot product of the two rows divided by the
# product of their Euclidean (2-) norms.
word_cos_sim <- function(w1,w2,embed){
  vec_a <- embed[w1,]
  vec_b <- embed[w2,]
  dot_ab <- vec_a %*% vec_b
  len_product <- norm(vec_a,type = "2")*norm(vec_b,type = "2")
  dot_ab/len_product
}
# Cosine similarity between the embeddings of "fish" and "salmon".
word_cos_sim("fish","salmon",glove)
          [,1]
[1,] 0.6541996
# Same measure for "fish" and "giraffe", for comparison with the pair above.
word_cos_sim("fish","giraffe",glove)
          [,1]
[1,] 0.1038395
# Rank the vocabulary by cosine similarity to a query.
#
# w:     either a single word (character; must be a row name of `embed`) or a
#        numeric vector with one entry per column of `embed`.
# embed: numeric matrix with one embedding per row, row names = words.
# top_n: maximum number of neighbours to return (default 10).
#
# Returns a named numeric vector of cosine similarities, sorted in decreasing
# order, with at most `top_n` entries. When `w` is a word, that word itself is
# left out of the result. Rows whose similarity is undefined (zero norm) are
# dropped by sort().
most_similar_words <- function(w,embed,top_n=10){
  exclude <- character(0)
  if(is.character(w)){
    if(length(w) != 1 || !(w %in% rownames(embed))){
      stop("`w` must be a single word present in rownames(embed)", call. = FALSE)
    }
    # Remember the query word so it can be removed by name, instead of
    # assuming it is always ranked first.
    exclude <- w
    query <- embed[w,]
  }else{
    query <- w
  }
  query <- as.numeric(query)

  # One matrix product and one pass for the row norms replace a per-row
  # apply(); the query norm is computed once.
  cos_sim <- drop(embed %*% query)/(sqrt(rowSums(embed^2))*sqrt(sum(query^2)))
  names(cos_sim) <- rownames(embed)

  if(length(exclude) > 0){
    cos_sim <- cos_sim[!(names(cos_sim) %in% exclude)]
  }
  # head() caps at the available length, so a large top_n yields no NA padding
  # and top_n = 0 yields an empty vector.
  head(sort(cos_sim, decreasing=TRUE), top_n)
}
# Vector arithmetic on embeddings: berlin - germany + france.
glove["berlin",]-glove["germany",]+glove["france",]
  [1]  0.1593480 -0.2449800 -0.3969100 -0.3693600  0.0408160  0.4310420
  [7]  0.1436520 -0.5391020 -0.2657620 -0.8131200  0.1920340 -0.1880900
 [13]  0.0948170 -0.1002900  0.3464600  0.3787300 -0.4898290  1.0012580
 [19]  0.1593500  0.4696400 -0.5971800  0.1259300  0.2129760 -0.2240480
 [25]  0.2632680 -0.2695700  0.4503100 -0.2765800  0.2060300  0.4942400
 [31]  0.3017600 -0.7015923 -0.7801700  0.5599700 -0.6832100 -0.9021100
 [37] -0.0564500 -0.2414550 -0.5759900 -0.1119110 -0.3866900 -0.2037000
 [43] -0.0385500  0.2165500 -0.2987250  0.6085000 -0.1089500  0.7300900
 [49] -0.3076650 -0.2265200  0.4095100 -0.1826300  0.7471180 -0.7209600
 [55]  0.6595400 -0.1934100 -0.5467250  0.0000670  0.3715457 -0.8449400
 [61] -0.3379400  0.3629950  0.1017800  0.2802000 -0.3472210 -0.2909400
 [67]  0.4565720 -0.0773470 -0.4129500 -0.7792600  0.2421800 -0.5980787
 [73] -0.0005120  0.0224300 -0.5563580  0.5590880  1.0116290  0.5257300
 [79] -0.7774170 -0.0310030  0.0719760 -0.1657820  0.2271640  0.3705300
 [85] -0.6682800 -0.3637330  0.2098200  0.4538230 -0.3972200 -0.4923200
 [91]  0.5374700 -0.9596000  0.3615500 -0.1437200 -0.1119720  0.0389800
 [97]  0.2161620  0.1561500 -0.1696000 -0.3248940 -0.3008400  0.0302080
[103]  0.8809400 -0.1185000 -0.0028100  0.4822480 -0.1735600 -0.0507585
[109]  0.6094600  0.2515470  0.2497800  0.1417580  0.4890370 -0.4979300
[115] -0.6720040 -0.5260800  0.2266420  0.5455767  0.4504060 -0.3437000
[121] -0.0172700  0.0650510  0.2775930 -0.5412500 -0.0225180 -0.5611700
[127]  0.0845912  0.0801650 -0.1238550 -0.0744600  0.3232340  0.2338270
[133]  0.1454290 -0.6090261 -0.3415400 -0.5866650 -0.2485900 -0.5493977
[139]  0.0560890 -0.5522870 -0.0957200  0.0895900 -0.1877065  0.0453630
[145] -0.0161950 -0.6878000 -0.0942600 -0.3110020  0.0114740  0.6311700
[151]  0.5532300  0.1019200  0.3364900 -0.1124431  0.4141500  0.1309670
[157] -0.4831450  0.0785995 -0.5587800 -0.4582960  0.0396200  0.3322010
[163] -0.3931000 -0.7032110 -0.1764300 -0.0059400  0.0515990 -0.8850470
[169] -0.3013100  0.5605071 -0.1683200 -0.3154500  0.1022960 -1.0115200
[175]  0.1808600  0.1590650 -0.2164100 -0.4033400 -0.7641500  0.3613000
[181]  0.2470200  0.2841900 -0.0319280  0.4181500  0.0930700  0.4908300
[187] -1.0893700  0.6234000 -0.2588890 -0.3743200 -0.2376668 -0.3697363
[193] -0.0672270 -0.7011780 -0.1848480 -0.7412900 -0.8720100  0.3357800
[199]  0.0158620  0.4829960  1.2706000 -0.2638648 -0.3453100 -0.0937720
[205]  0.2234000 -0.0841347 -0.6527100 -0.0175520 -0.1448300 -0.2698200
[211]  0.3441600  0.2884630 -0.2600100  0.3293900  0.3470260  0.3716800
[217]  0.5941800 -0.9450900  0.5246500 -0.4480200  0.7079500 -0.0763050
[223] -0.1130080 -0.1840900 -0.3533200 -0.0800510  0.1915430  0.3086800
[229] -0.3964300 -0.0375500 -0.1339010 -0.2323500 -0.1784400 -0.3056480
[235]  0.4593870 -0.0642810  0.1678270  0.1044520  0.4598000 -0.2069900
[241] -0.1113600 -0.8775360  0.4248800 -0.3236300  0.3835500 -0.0472330
[247] -0.5865810 -0.6398230 -0.1529098 -0.4405304  0.4135900  0.4489900
[253]  0.4504600  0.3509800  0.4507100 -0.0114040 -0.0025650 -0.4112900
[259]  0.0918540 -0.3576490  0.3085630  0.1729450 -0.0956200  1.3021130
[265]  1.0966500  0.2874100 -0.0489800  0.1640040  0.4866860  0.1681840
[271]  0.4557470  0.7073100 -0.3641500 -0.3238340  0.3370500  0.2279800
[277] -1.2726000  0.0138000  0.2507000  0.0519335  0.0289500 -0.0869000
[283]  0.3197600 -0.6320900 -0.0374200  0.6283400  0.1358100  0.7882100
[289]  0.0629600  0.3067200  0.4900300 -0.3634210  0.0596200 -0.7462900
[295]  0.0770500 -0.0206700  0.2015100  0.1606800 -0.4063610 -0.4658500
# Words whose embeddings are closest (by cosine similarity) to
# berlin - germany + france.
most_similar_words(glove["berlin",]-glove["germany",]+glove["france",],glove)
     paris     france prohertrib     french   brussels     berlin         le 
 0.7907898  0.6829652  0.6757443  0.6265681  0.5365580  0.5024956  0.4737045 
  parisian  marseille       lyon 
 0.4493072  0.4470702  0.4469211 
# Same analogy query for harrisburg - pennsylvania + california.
most_similar_words(glove["harrisburg",]-glove["pennsylvania",]+glove["california",],glove)
 california      calif.         san bakersfield       diego  bernardino 
  0.5974964   0.5604969   0.5552560   0.5311038   0.5256530   0.5081510 
    angeles  sacramento      fresno   francisco 
  0.4999010   0.4914256   0.4903882   0.4636588 
# Analogy query: actor - man + woman.
most_similar_words(glove["actor",]-glove["man",]+glove["woman",],glove)
     actress        actor    actresses       actors     comedian       singer 
   0.8438846    0.8072483    0.6134423    0.5958819    0.5540746    0.5529440 
screenwriter     starring      starred        woman 
   0.5515834    0.5486785    0.5445139    0.5426185 
# Analogy query: doctor - man + woman.
most_similar_words(glove["doctor",]-glove["man",]+glove["woman",],glove)
   doctor physician     nurse     woman   doctors  pregnant   dentist   medical 
0.8331191 0.6203881 0.6161286 0.6135359 0.6017279 0.5400155 0.5321170 0.5225777 
   mother   surgeon 
0.5143166 0.5109706 
# Reverse direction of the previous query: doctor - woman + man.
most_similar_words(glove["doctor",]-glove["woman",]+glove["man",],glove)
   doctor       man physician       dr.        he   brother       mr.   medical 
0.7824572 0.5426459 0.5128607 0.5042355 0.4661550 0.4635633 0.4612041 0.4424052 
  doctors   surgeon 
0.4420476 0.4393367 
# Cosine similarity between two strings, each represented by the average of
# the embeddings of its tokens.
#
# string1, string2: the texts to compare.
# embed:            numeric matrix with one embedding per row, row names =
#                   words.
#
# Both strings are lower-cased, tokenized with punctuation and numbers
# removed, and stripped of English stopwords. Tokens that are not row names of
# `embed` are skipped. An error is raised if a string has no token left that
# is present in `embed`.
#
# Returns a 1 x 1 matrix holding the cosine similarity of the two averaged
# vectors.
cos_similarity_avg_vec <- function(string1, string2, embed){
  # Named `toks` so the local does not shadow quanteda's tokens() function.
  toks <- corpus(tolower(c(string1,string2))) %>%
    tokens(remove_punct = TRUE, remove_numbers = TRUE) %>%
    tokens_remove(pattern = stopwords("en"))

  # Average the embedding rows of the in-vocabulary words.
  mean_embedding <- function(words){
    known <- words[words %in% rownames(embed)]
    if(length(known) == 0){
      stop("no in-vocabulary tokens left after preprocessing", call. = FALSE)
    }
    # drop = FALSE keeps a one-row matrix, so colMeans() also works when a
    # single token remains.
    colMeans(embed[known, , drop = FALSE])
  }

  vec1 <- mean_embedding(toks[[1]])
  vec2 <- mean_embedding(toks[[2]])
  (vec1%*%vec2)/(norm(vec1,type="2")*norm(vec2,type="2"))
}
# Compare two sentences through their averaged word vectors.
cos_similarity_avg_vec("I like fruits","I like apples and oranges",glove)
          [,1]
[1,] 0.7135683
# Second sentence shares the word "fruits" with the first.
cos_similarity_avg_vec("I like fruits","I enjoy eating fruits",glove)
          [,1]
[1,] 0.8200876
# Second sentence is on a different topic, for contrast with the calls above.
cos_similarity_avg_vec("I like fruits","The sky is blue",glove)
          [,1]
[1,] 0.2889585