###
# We load a pre-computed semantic space.
# Feel free to choose a different one if this model is too slow
# to load on your machine.
import gensim.downloader as gensim_api

space = gensim_api.load("glove-wiki-gigaword-300")

# Mikolov et al. first suggested that semantic relations in a
# semantic space could show up as consistent vector offsets --
# that is, "walk" could be to "walking" as "yell" is to "yelling".
#
# That was a syntactic analogy example. The best-known semantic
# example is:
#   "king" is to "man" as "queen" is to "woman"
#
# There are by now several datasets for evaluating analogy in
# semantic spaces; see
# https://aclweb.org/aclwiki/Analogy_(State_of_the_art)
#
# gensim has a method for evaluating analogy on files in the
# format that Mikolov et al. used.
# WARNING: running this on the whole Mikolov et al. data takes a
# long time, so it is commented away here:
# analogy_scores = space.evaluate_word_analogies('questions-words.txt')

# On Canvas you find a shortened version of the Mikolov dataset,
# mikolov_short.txt. Running on that instead:
analogy_scores = space.evaluate_word_analogies('mikolov_short.txt')

# The first component of the result is the overall accuracy.
print("For this small piece of the Mikolov analogy data,")
print("we get an accuracy of", analogy_scores[0])

# The second component has one dictionary per section, recording
# which tuples were answered correctly and which were not.
for sec in analogy_scores[1]:
    print("\n\nThe section is", sec["section"])
    print("\nCorrect tuples:", sec["correct"])
    print("\nIncorrect tuples", sec["incorrect"])

# Unfortunately this does not show us what the incorrect guesses
# actually were -- so we redo the computation by hand next.

#####
# By hand, first for a single example:
# the analog of "man" under the king -> queen offset.
king_analog = space["queen"] - space["king"] + space["man"]

# what are the 10 nearest neighbors?
print("10 nearest neighbors of")
print("queen - king + man:", space.similar_by_vector(king_analog))

# Now we work through mikolov_short.txt by hand.
# Lines starting with ":" are subcategory labels; we discard
# those for now and keep the 4-word analogy tuples.
with open("mikolov_short.txt") as f:
    mikolov_short = [line.split() for line in f if not line.startswith(":")]

# Let's see what patterns we get when things go wrong.
for w1, w2, w3, w4 in mikolov_short:
    w1, w2, w3, w4 = w1.lower(), w2.lower(), w3.lower(), w4.lower()
    # skip tuples containing out-of-vocabulary words
    if any(w not in space for w in (w1, w2, w3, w4)):
        print("discarding OOV tuple", w1, w2, w3, w4)
        continue
    # offset arithmetic: w1 is to w2 as w3 is to ?
    vec = space[w2] - space[w1] + space[w3]
    nn = space.similar_by_vector(vec)
    firstguess = nn[0][0]
    if firstguess == w4:
        print("Correct:", w1, w2, w3, w4)
    else:
        print("Incorrect:", w1, w2, w3, w4)
        print("I got:", [word for word, sim in nn])

# For thoughts on the analogy task, and variants of how to solve
# it, see:
# Linzen, Issues in evaluating semantic spaces using word analogies.
# https://www.aclweb.org/anthology/W16-2503/
# Levy et al, Linguistic Regularities in Sparse and Explicit Word
# Representations. https://www.aclweb.org/anthology/W14-1618.pdf

####
# Gender bias in word embeddings has been addressed through
# vector arithmetic similar to that used for analogy.
#
# The "debiasing" idea shown below is from Andrew Ng and is a
# simplified version of an approach in Bolukbasi et al (2016):
# "Man is to Computer Programmer as Woman is to Homemaker?
# Debiasing Word Embeddings",
# https://papers.nips.cc/paper/6228-man-is-to-computer-programmer-as-woman-is-to-homemaker-debiasing-word-embeddings.pdf

# a bias vector: vec(woman) - vec(man)
bias = space["woman"] - space["man"]

# which words are closest to this bias direction?
# (in a script the return value is discarded; this line is meant
# for interactive use)
space.similar_by_vector(bias)
# The top-7 "he"-associated and "she"-associated occupations
# from Bolukbasi et al (2016).
occupations = ["homemaker", "nurse", "receptionist", "librarian",
               "socialite", "hairdresser", "nanny",
               "maestro", "skipper", "protege", "philosopher",
               "captain", "architect", "financier"]

def vector_similarities_to_words(vec, words):
    # Cosine similarity of vec to each word in the given list.
    # space.distances returns 1 - cosine_similarity, so we flip
    # each distance back into a similarity.
    # Returns a list of (word, similarity) pairs.
    distances = space.distances(vec, other_words = words)
    return [(word, 1 - dist) for word, dist in zip(words, distances)]

# How close is each occupation to our "bias direction"?
for occupation, sim in vector_similarities_to_words(bias, occupations):
    print(occupation, sim)

# And how close are the occupations to the words "man", "woman"?
sim_man = [(o, space.similarity("man", o)) for o in occupations]
sim_woman = [(o, space.similarity("woman", o)) for o in occupations]

print('similarities of occupations to "man":')
for occupation, sim in sorted(sim_man, key=lambda p: p[1], reverse=True):
    print(occupation, sim)

print('similarities of occupations to "woman":')
for occupation, sim in sorted(sim_woman, key=lambda p: p[1], reverse=True):
    print(occupation, sim)

###
# Removing bias from vectors, first attempt:
# we simply subtract the bias vector.
# Debiased vector, first attempt: subtract the bias vector.
def debias(vec):
    return vec - bias

# Try it out: for each occupation, compare the similarities to
# "man" and "woman" for the original and the debiased vector.
for occupation in occupations:
    print("---")
    for word, sim in vector_similarities_to_words(space[occupation], ["man", "woman"]):
        print("original", occupation, word, sim)
    for word, sim in vector_similarities_to_words(debias(space[occupation]), ["man", "woman"]):
        print("debiased", occupation, word, sim)

# This overshoots by a lot.

###
# Removing bias from vectors, second attempt:
# we use vector rejection to remove the bias component.
# For this, we need a bit of math.
import math

# Length (Euclidean norm) of a vector:
# square root of the sum of squared entries.
def veclen(vec):
    return math.sqrt(sum(dim * dim for dim in vec))

# Vector projection of vec onto direction:
#   proj_p(x) = [ (x . p) / (p . p) ] * p
# The denominator is the *squared* length of p (equivalently,
# project onto the unit vector p / |p|).
# Returns a vector (a list of floats).
def vecproject(vec, direction):
    # BUGFIX: the original divided the dot product by
    # veclen(direction) only once, which scales the projection by
    # an extra factor of |direction| and makes the rejection
    # below remove too much -- the "overshoot" observed in this
    # worksheet.
    factor = sum(v * d for v, d in zip(vec, direction)) / veclen(direction) ** 2
    return [d * factor for d in direction]

# Vector rejection: rej_p(x) = x - proj_p(x).
# This is the component of x orthogonal to p.
# Returns a vector.
def vecreject(vec, direction):
    projection = vecproject(vec, direction)
    return [v - p for v, p in zip(vec, projection)]

# Debiased vector, second attempt:
# vector rejection removing the bias (vec(woman) - vec(man))
# from the vector. Returns a vector.
def debias(vec):
    return vecreject(vec, bias)

# Try it out again: for each occupation, compare the similarities
# to "man" and "woman" for the original and the debiased vector.
for occupation in occupations:
    print("---")
    for word, sim in vector_similarities_to_words(space[occupation], ["man", "woman"]):
        print("original", occupation, word, sim)
    for word, sim in vector_similarities_to_words(debias(space[occupation]), ["man", "woman"]):
        print("debiased", occupation, word, sim)

# NOTE(review): with the projection normalized correctly, the
# rejection removes exactly the component along the bias
# direction; the overshoot seen with the original code came from
# the unnormalized projection above.
# (Source: Courses > Python worksheets)