# Courses > Python worksheets
#
# Python demo: using a pre-trained vector space

# what corpora and pre-trained models does gensim have?
import gensim.downloader as gensim_api

# gensim_api.info() returns the whole catalog as one big dict:
# not easy to read in one go.

# okay, let's break this down: the catalog has corpora, and models
obj = gensim_api.info()
print("\n\nGensim has:", obj.keys())

# what models are there?
print("\n\nGensim models:\n", obj["models"].keys())

# let's find out more about one of them
print("\n\nHere is the info on the word2vec google news 300 model\n", obj["models"]["word2vec-google-news-300"])

# okay, let's download that one
# warning: this takes a while, it's 1.6G
space = gensim_api.load("word2vec-google-news-300")
# if you call the same command again, it doesn't re-download the space,
# it just accesses what you have on disk. It still takes a while.

# a few tests to see what we have
print(space.most_similar("science", topn=3))
print(space.most_similar("football", topn=3))
print(space.most_similar("idea", topn=3))

# let's try an analogy: king - man + woman should land near "queen"
# the idea of doing analogies this way is from Mikolov et al. 2013
# ("Linguistic Regularities in Continuous Space Word Representations")
king_analog = space["king"] - space["man"] + space["woman"]
# what are the 10 nearest neighbors?
# (most_similar also accepts raw vectors in its 'positive' list)
print(space.most_similar(positive=[king_analog], topn=10))

# Now we look at gender bias in word embeddings.
# The idea of doing "debiasing" in this way is from Andrew Ng,
# and is similar to an approach in
# Bolukbasi et al (2016): "Man is to Computer Programmer as Woman is to Homemaker?
# Debiasing Word Embeddings" (though Ng's approach is simpler)

# getting a bias vector: vec(woman) - vec(man)
# This is a direction in embedding space pointing from "man"-like
# usage toward "woman"-like usage.
bias = space["woman"] - space["man"]
# what are the words closest to this bias vector?
# (exercise: try space.most_similar(positive=[bias], topn=10))

# let's check some particular occupations.
# these are the top-7 "he"-associated and "she"-associated occupations
# from Bolukbasi et al (2016)
occupations = ["homemaker", "nurse", "receptionist", "librarian", "socialite",
               "hairdresser", "nanny", "maestro", "skipper", "protege",
               "philosopher", "captain", "architect", "financier"]

def vector_similarities_to_words(vec, words):
    """Compute the cosine similarity of vec to each word in words.

    Returns a list of (word, similarity) pairs, in the same order
    as the input words.
    """
    # gensim's distances() reports 1 - cosine_similarity,
    # so we flip each distance back into a similarity.
    dists = space.distances(vec,  other_words = words)
    return [(word, 1 - dist) for word, dist in zip(words, dists)]

# apply the new function to our occupations to see how close
# each one lies to the "bias direction" vec(woman) - vec(man)
for occ, similarity in vector_similarities_to_words(bias, occupations):
    print(occ, similarity)

# Now we use vector rejections to remove bias from vectors.
# For this, we need a bit of math.
import math

# length of a vector: square root of sum of squared entries
def veclen(vec):
    """Euclidean (L2) length of vec."""
    squared = [component * component for component in vec]
    return math.sqrt(sum(squared))

# vector projection:
# proj_p(x) = [ (x . p) / |p|^2 ] * p
# This returns a vector.
def vecproject(vec, direction):
    """Project vec onto direction; returns the projection as a list.

    BUG FIX: the scalar factor is the dot product divided by the
    *squared* length of the direction vector, (x . p) / |p|^2.
    The previous version divided by |p| only, which inflates the
    result by a factor of |p| whenever direction is not unit-length
    (word2vec vectors are not unit-length).
    """
    # squared length of the direction vector, |p|^2
    norm_sq = sum(d * d for d in direction)
    factor = sum(v * d for v, d in zip(vec, direction)) / norm_sq
    return [ d * factor for d in direction ]

# vector rejection:
# rej_p(x) = x - proj_p(x)
# This returns a vector.
def vecreject(vec, direction):
    """Remove from vec its component along direction."""
    proj = vecproject(vec, direction)
    return [component - p for component, p in zip(vec, proj)]

# debiased vector:
# vector rejection removing the bias (vec(woman) - vec(man)) from the vector
# This returns a vector.
def debias(vec):
    """Return vec with its component along the global bias direction removed."""
    debiased_vec = vecreject(vec, bias)
    return debiased_vec

# try it out:
# for each occupation, compare similarities to "man" and "woman"
# before and after debiasing its vector
for occupation in occupations:
    probe_words = ["man", "woman"]

    original_pairs = vector_similarities_to_words(space[occupation], probe_words)
    for word, sim in original_pairs:
        print("original", occupation, word, sim)

    debiased_pairs = vector_similarities_to_words(debias(space[occupation]), probe_words)
    for word, sim in debiased_pairs:
        print("debiased", occupation, word, sim)
# looks like the debiasing overshoots a little.