
Using embeddings: vector arithmetic

###
# We load a pre-computed space
# Feel free to choose a different one
# if this one is too slow to load on your machine
import gensim.downloader as gensim_api

space = gensim_api.load("glove-wiki-gigaword-300")
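
# If this space is too slow to load on your machine, a smaller pre-trained
# space such as "glove-wiki-gigaword-50" (also available via the gensim
# downloader) should work for everything below, though the exact numbers
# and nearest neighbors will of course differ:
# space = gensim_api.load("glove-wiki-gigaword-50")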

# Mikolov et al. (2013) first suggested that
# semantic relations in a semantic space
# could correspond to consistent vector offsets,
# that is,
# "walk" could be to "walking" as
# "yell" is to "yelling".
#
# That is a syntactic analogy. Here is
# the best-known semantic one:
# "king" is to "man" as
# "queen" is to "woman"
#
# By now there are several datasets for evaluating
# analogies in semantic spaces.
# See https://aclweb.org/aclwiki/Analogy_(State_of_the_art)
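
# As a quick sanity check of the "consistent offset" idea, gensim's
# most_similar can answer an analogy question directly:
# king - man + woman should come out close to "queen".
# (In this space "queen" is typically the top hit, but that is not
# guaranteed for every analogy.)
print(space.most_similar(positive=["king", "woman"], negative=["man"], topn=3))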

# gensim has a method for evaluating analogy on files
# in the format that Mikolov et al used.
# WARNING: If you run this on the whole Mikolov et al data,
# it will take a long time.
# Here it is, commented out:
# analogy_scores = space.evaluate_word_analogies('questions-words.txt')
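
# For reference, the analogy files are plain text: lines starting with ":"
# name a section, and every other line holds four words forming an analogy
# a : b :: c : d. Roughly like this:
#
#   : capital-common-countries
#   Athens Greece Baghdad Iraq
#   : gram3-comparative
#   bad worse big bigger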

# On Canvas, you will find a shortened version of the Mikolov dataset,
# mikolov_short.txt
# Running the evaluation on it yields:
analogy_scores = space.evaluate_word_analogies('mikolov_short.txt')

# the first part of the output is the overall accuracy
print("For this small piece of the Mikolov analogy data,")
print("we get an accuracy of", analogy_scores[0])

# the second part is a list with one dictionary per section, showing us
# what was correct and what was incorrect
for section in analogy_scores[1]:
    print("\n\nThe section is", section["section"])
    print("\nCorrect tuples:", section["correct"])
    print("\nIncorrect tuples", section["incorrect"])

# Unfortunately we don't see what the incorrect guesses were.
# So let's do this by hand next.

#####
# Now let's do this by hand. First, for a single example:
# if the offsets are consistent, queen - king + man should land
# close to "woman".
king_analog = space["queen"] - space["king"] + space["man"]
# what are the 10 nearest neighbors?
print("10 nearest neighbors of")
print("queen - king + man:", space.similar_by_vector(king_analog))

# Now we work through mikolov_short.txt by hand.
mikolov_short = [ ]
with open("mikolov_short.txt") as f:
    for line in f:
        if line.startswith(":"):
            # we discard the labels of subcategories for now
            continue
        else:
            mikolov_short.append(line.split())

# Let's see what patterns we get when things go wrong.
for a, b, c, d in mikolov_short:
    a = a.lower()
    b = b.lower()
    c = c.lower()
    d = d.lower()
    if a not in space or b not in space or c not in space or d not in space:
        print("discarding OOV tuple", a, b, c, d)
        continue

    vec = space[b] - space[a] + space[c]
    nn = space.similar_by_vector(vec)
    firstguess = nn[0][0]
    if firstguess == d:
        print("Correct:", a, b, c, d)
    else:
        print("Incorrect:", a, b, c, d)
        print("I got:", [word for word, sim in nn])
   
# For thoughts on the analogy task, and variants of
# how to solve it, see:
# Linzen, Issues in evaluating semantic spaces using word analogies.
#        https://www.aclweb.org/anthology/W16-2503/
# Levy et al, Linguistic Regularities in Sparse and Explicit Word Representations.
# https://www.aclweb.org/anthology/W14-1618.pdf

##################

# Gender bias in word embeddings has been addressed
# through vector arithmetic similar to that used for analogy.
#
# The idea of doing "debiasing" in the way shown below is from Andrew Ng,
# and is similar to an approach in
# Bolukbasi et al (2016): "Man is to Computer Programmer as Woman is to Homemaker?
# Debiasing Word Embeddings",
# https://papers.nips.cc/paper/6228-man-is-to-computer-programmer-as-woman-is-to-homemaker-debiasing-word-embeddings.pdf
# (though Ng's approach is simpler)
#

# getting a bias vector: vec(woman) - vec(man)
bias = space["woman"] - space["man"]
# what are the words closest to this bias vector?
print(space.similar_by_vector(bias))
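# and, for comparison, the words closest to the opposite direction,
# vec(man) - vec(woman):
print(space.similar_by_vector(space["man"] - space["woman"]))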

# let's check some particular occupations.
# these are the top-7 "she"-associated and "he"-associated occupations
# from Bolukbasi et al (2016)
occupations = ["homemaker", "nurse", "receptionist", "librarian", "socialite",
               "hairdresser", "nanny", "maestro", "skipper", "protege",
               "philosopher", "captain", "architect", "financier"]

# this function computes the similarity of the vector
# to each of the members of the list of words.
# it returns a list of pairs of (word, similarity)
def vector_similarities_to_words(vec, words):
    # 'distances' gives you 1 - cosine_similarity
    distances = space.distances(vec,  other_words = words)
    similarities = [1 - d for d in distances]
    return list(zip(words, similarities))

# we apply this new function to our occupations
# to see how close each of them is to our "bias direction"
for occupation, sim in vector_similarities_to_words(bias, occupations):
    print(occupation, sim)

# and how close are they to the words "man", "woman"?
sim_man = [ (o, space.similarity("man", o)) for o in occupations]
sim_woman = [ (o, space.similarity("woman", o)) for o in occupations]

print('similarities of occupations to "man":')
for occupation, sim in sorted(sim_man, key = lambda pair:pair[1], reverse = True):
    print(occupation, sim)

print('similarities of occupations to "woman":')
for occupation, sim in sorted(sim_woman, key = lambda pair:pair[1], reverse = True):
    print(occupation, sim)


###
# removing bias from vectors, first attempt:
# we simply subtract the bias vector.


# debiased vector: subtract the bias vector
def debias(vec):
    return vec - bias

# try it out:
# for each of our occupations above,
# let's look at the similarities to "man" and "woman"
# for both the original and the debiased vector
for occupation in occupations:
    print("---")
    for word, sim in vector_similarities_to_words(space[occupation], ["man", "woman"]):
        print("original", occupation, word, sim)

    for word, sim in vector_similarities_to_words(debias(space[occupation]), ["man", "woman"]):
        print("debiased", occupation, word, sim)

# This overshoots by a lot.
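# One reason this overshoots: every occupation has the full
# vec(woman) - vec(man) offset subtracted, no matter how strongly (or
# weakly) it was associated with either word, so every vector is shifted
# towards "man" by the same full offset.
# To get a feeling for how large that shift is, compare the norm of the
# bias vector to the norm of an occupation vector:
import numpy as np
print("norm of the bias vector:  ", np.linalg.norm(bias))
print("norm of the nurse vector: ", np.linalg.norm(space["nurse"]))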

###
# Removing bias from vectors, second attempt:
# we use vector rejections to remove bias from vectors.
# For this, we need a bit of math.
import math

# length of a vector: square root of sum of squared entries
def veclen(vec):
    return math.sqrt(sum(dim*dim for dim in vec))

# vector projection:
# proj_p(x) = [ (x * p) / (p * p) ] * p
#           = [ (x * p) / veclen(p)**2 ] * p
# where x * p is the dot product.
# This returns a vector.
def vecproject(vec, direction):
    factor = sum(v * d for v, d in zip(vec, direction)) / (veclen(direction) ** 2)
    return [ d * factor for d in direction ]

# vector rejection:
# rej_p(x) = x - proj_p(x)
# This returns a vector.
def vecreject(vec, direction):
    vecprojection = vecproject(vec, direction)

    return [ v - p for v, p in zip(vec, vecprojection)]
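
# A quick sanity check on made-up toy vectors, just to verify the math:
# projecting [3, 4] onto the x-axis direction [2, 0] should give [3, 0],
# and the rejection should be what is left over, [0, 4].
print(vecproject([3, 4], [2, 0]))   # expect [3.0, 0.0]
print(vecreject([3, 4], [2, 0]))    # expect [0.0, 4.0]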

# debiased vector:
# vector rejection removing the bias (vec(woman) - vec(man)) from the vector
# This returns a vector.
def debias(vec):
    return vecreject(vec, bias)

# try it out:
# for each of our occupations above,
# let's look at the similarities to "man" and "woman"
# for both the original and the debiased vector
for occupation in occupations:
    print("---")
    for word, sim in vector_similarities_to_words(space[occupation], ["man", "woman"]):
        print("original", occupation, word, sim)

    for word, sim in vector_similarities_to_words(debias(space[occupation]), ["man", "woman"]):
        print("debiased", occupation, word, sim)
   
# This time the bias component is projected out exactly, rather than
# subtracted wholesale, so the debiased vectors should end up roughly
# equally similar to "man" and "woman" instead of overshooting.


