Using embeddings: vector arithmetic

###

# We load a pre-computed space

# Feel free to choose a different one

# if this one is too slow to load on your machine

import gensim.downloader as gensim_api

space = gensim_api.load("glove-wiki-gigaword-300")
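# For example, the 50-dimensional GloVe space is much smaller and

# faster to load, at some cost in quality:

# space = gensim_api.load("glove-wiki-gigaword-50")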

# Mikolov et al. (2013) first suggested that

# semantic relations in semantic space

# could be consistent vector offsets

# that is,

# "walk" could be to "walking" as

# "yell" is to "yelling"

#

# That was a syntactic analogy. Here is

# the best-known semantic example:

# "king" is to "man" as

# "queen" is to "woman"

#
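# As a quick check, gensim's most_similar can compute the

# king - man + woman analogy directly; "queen" should rank highly:

print(space.most_similar(positive=["king", "woman"], negative=["man"], topn=5))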

# By now there are several datasets for evaluating analogies

# in semantic spaces.

# See https://aclweb.org/aclwiki/Analogy_(State_of_the_art)

# gensim has a method for evaluating analogy on files

# in the format that Mikolov et al used.

# WARNING: If you run this on the whole Mikolov et al data,

# it will take a long time.

# Here it is, commented out:

# analogy_scores = space.evaluate_word_analogies('questions-words.txt')

# On Canvas, you will find a shortened version of the Mikolov dataset, under

# mikolov_short.txt
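# It uses the same format as questions-words.txt: lines starting

# with ":" name a section, and every other line holds one

# four-word analogy, for example:

# : capital-common-countries

# Athens Greece Baghdad Iraq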

# running this yields:

analogy_scores = space.evaluate_word_analogies('mikolov_short.txt')

# the first part of the output is the overall accuracy

print("For this small piece of the Mikolov analogy data,")

print("we get an accuracy of", analogy_scores[0])

# the second part has a dictionary for each section, showing us what was correct

# and what was incorrect

for section in analogy_scores[1]:
    print("\n\nThe section is", section["section"])
    print("\nCorrect tuples:", section["correct"])
    print("\nIncorrect tuples:", section["incorrect"])

# Unfortunately we don't see what the incorrect guesses were.

# So let's do this by hand next.

#####

# Now let's do this by hand. First, for a single example.

king_analog = space["queen"] - space["king"] + space["man"]

# what are the 10 nearest neighbors?

print("10 nearest neighbors of")

print("queen - king + man:", space.similar_by_vector(king_analog))

# we process mikolov_short.txt by hand

mikolov_short = [ ]

with open("mikolov_short.txt") as f:

for line in f:

if line.startswith(":"):

# we discard the labels of subcategories for now

continue

else:

mikolov_short.append(line.split())

# Let's see what patterns we get when things go wrong.

for a, b, c, d in mikolov_short:
    a = a.lower()
    b = b.lower()
    c = c.lower()
    d = d.lower()
    if a not in space or b not in space or c not in space or d not in space:
        print("discarding OOV tuple", a, b, c, d)
        continue
    vec = space[b] - space[a] + space[c]
    nn = space.similar_by_vector(vec)
    firstguess = nn[0][0]
    if firstguess == d:
        print("Correct:", a, b, c, d)
    else:
        print("Incorrect:", a, b, c, d)
        print("I got:", [word for word, sim in nn])

# For thoughts on the analogy task, and on variants

# of how to solve it, see:

# Linzen, Issues in evaluating semantic spaces using word analogies.

# https://www.aclweb.org/anthology/W16-2503/

# Levy et al, Linguistic Regularities in Sparse and Explicit Word Representations.

# https://www.aclweb.org/anthology/W14-1618.pdf

##################

# Gender bias in word embeddings has been addressed

# through vector arithmetic similar to that used for analogy.

#

# The idea of doing "debiasing" in the way shown below is from Andrew Ng,

# and is similar to an approach in

# Bolukbasi et al (2016): "Man is to Computer Programmer as Woman is to Homemaker?

# Debiasing Word Embeddings",

# https://papers.nips.cc/paper/6228-man-is-to-computer-programmer-as-woman-is-to-homemaker-debiasing-word-embeddings.pdf

# (though Ng's approach is simpler)

#

# getting a bias vector: vec(woman) - vec(man)

bias = space["woman"] - space["man"]

# what are the words closest to this bias vector?

print("words closest to the bias direction:", space.similar_by_vector(bias))

# let's check some particular occupations.

# these are the top-7 "he"-associated and "she"-associated occupations

# from Bolukbasi et al (2016)

occupations = ["homemaker", "nurse", "receptionist", "librarian", "socialite",

"hairdresser", "nanny", "maestro", "skipper", "protege",

"philosopher", "captain", "architect", "financier"]

# this function computes the similarity of the vector

# to each of the members of the list of words.

# it returns a list of pairs of (word, similarity)

def vector_similarities_to_words(vec, words):
    # 'distances' gives you 1 - cosine_similarity
    distances = space.distances(vec, other_words=words)
    similarities = [1 - d for d in distances]
    return list(zip(words, similarities))

# we apply this new function to our occupations

# to see how close each of them is to our "bias direction"

for occupation, sim in vector_similarities_to_words(bias, occupations):
    print(occupation, sim)
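# The same similarities can be had from gensim directly:

# cosine_similarities takes one vector and a matrix of vectors

# (a sketch; we look the occupation vectors up ourselves):

sims = space.cosine_similarities(bias, [space[w] for w in occupations])
for occupation, sim in zip(occupations, sims):
    print(occupation, sim)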

# and how close are they to the words "man", "woman"?

sim_man = [(o, space.similarity("man", o)) for o in occupations]

sim_woman = [(o, space.similarity("woman", o)) for o in occupations]

print('similarities of occupations to "man":')
for occupation, sim in sorted(sim_man, key=lambda pair: pair[1], reverse=True):
    print(occupation, sim)

print('similarities of occupations to "woman":')
for occupation, sim in sorted(sim_woman, key=lambda pair: pair[1], reverse=True):
    print(occupation, sim)

###

# removing bias from vectors, first attempt:

# we simply subtract the bias vector.

# debiased vector: subtract the bias vector

def debias(vec):
    return vec - bias

# try it out:

# for each of our occupations above,

# let's look at the similarities to "man" and "woman"

# for both the original and the debiased vector

for occupation in occupations:
    print("---")
    for word, sim in vector_similarities_to_words(space[occupation], ["man", "woman"]):
        print("original", occupation, word, sim)
    for word, sim in vector_similarities_to_words(debias(space[occupation]), ["man", "woman"]):
        print("debiased", occupation, word, sim)

# This overshoots by a lot: vec - bias = vec - woman + man,

# so every debiased vector is pushed toward "man" rather than made neutral.

###

# Removing bias from vectors, second attempt:

# we use vector rejections to remove bias from vectors.

# For this, we need a bit of math.

import math

# length of a vector: square root of sum of squared entries

def veclen(vec):
    return math.sqrt(sum(dim * dim for dim in vec))
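# sanity check, assuming numpy is available (gensim vectors are numpy

# arrays): our veclen should agree with numpy's norm, up to rounding

import numpy as np
print(veclen(space["king"]), np.linalg.norm(space["king"]))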

# vector projection:

# proj_p(x) = [ (x * p) / (p * p) ] * p

# that is, we scale p by the dot product of x and p,

# divided by the squared length of p.

# This returns a vector.

def vecproject(vec, direction):
    factor = sum(v * d for v, d in zip(vec, direction)) / (veclen(direction) ** 2)
    return [d * factor for d in direction]

# vector rejection:

# rej_p(x) = x - proj_p(x)

# This returns a vector.

def vecreject(vec, direction):
    vecprojection = vecproject(vec, direction)
    return [v - p for v, p in zip(vec, vecprojection)]
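# quick sanity check: the rejection should be (near-)orthogonal to the

# direction, so this dot product should be close to zero:

_rej = vecreject(space["nurse"], bias)
print(sum(r * b for r, b in zip(_rej, bias)))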

# debiased vector:

# vector rejection removing the bias (vec(woman) - vec(man)) from the vector

# This returns a vector.

def debias(vec):
    return vecreject(vec, bias)

# try it out:

# for each of our occupations above,

# let's look at the similarities to "man" and "woman"

# for both the original and the debiased vector

for occupation in occupations:
    print("---")
    for word, sim in vector_similarities_to_words(space[occupation], ["man", "woman"]):
        print("original", occupation, word, sim)
    for word, sim in vector_similarities_to_words(debias(space[occupation]), ["man", "woman"]):
        print("debiased", occupation, word, sim)

# After the rejection, the debiased vectors are orthogonal to the bias

# direction, so their dot products with "man" and "woman" are equal;

# any remaining difference in cosine similarity comes only from the

# different lengths of vec(man) and vec(woman).