Python demo: using a pre-trained space
# what corpora and pre-trained models does gensim have?
import gensim.downloader as gensim_api
# the whole info: not easy to read
print(gensim_api.info())
# okay, let's break this down: we have corpora, and models
obj = gensim_api.info()
print("\n\nGensim has:", obj.keys())
# what models are there?
print("\n\nGensim models:\n", obj["models"].keys())
# let's find out more about one of them
print("\n\nHere is the info on the word2vec google news 300 model\n", obj["models"]["word2vec-google-news-300"])
# okay, let's download that one
# warning: this takes a while, it's 1.6G
space = gensim_api.load("word2vec-google-news-300")
# if you call the same command again, it doesn't re-download the space,
# it just accesses what you have on disk. It still takes a while.
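# if you want to know where the file ended up on disk, load() can
# return the local path instead of the model (usually somewhere
# under ~/gensim-data/)
print(gensim_api.load("word2vec-google-news-300", return_path=True))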
# a few tests to see what we have
print(space.most_similar("science", topn=3))
print(space.most_similar("football", topn=3))
print(space.most_similar("idea", topn=3))
# let's try an analogy
# the idea of doing analogies this way is from Mikolov et al. (2013)
king_analog = space["king"] - space["man"] + space["woman"]
# what are the 10 nearest neighbors?
print(space.similar_by_vector(king_analog))
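# gensim has this arithmetic built in: most_similar() accepts
# positive and negative word lists, combines the vectors for us,
# and excludes the input words from the results
print(space.most_similar(positive=["king", "woman"], negative=["man"], topn=3))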
# Now we look at gender bias in word embeddings.
# The idea of doing "debiasing" in this way is from Andrew Ng,
# and is similar to an approach in
# Bolukbasi et al (2016): "Man is to Computer Programmer as Woman is to Homemaker?
# Debiasing Word Embeddings" (though Ng's approach is simpler)
# getting a bias vector: vec(woman) - vec(man)
bias = space["woman"] - space["man"]
# what are the words closest to this bias vector?
print(space.similar_by_vector(bias))
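# and the opposite direction, vec(man) - vec(woman):
print(space.similar_by_vector(-bias))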
# let's check some particular occupations.
# these are the top-7 "he"-associated and "she"-associated occupations
# from Bolukbasi et al (2016)
occupations = ["homemaker", "nurse", "receptionist", "librarian", "socialite",
"hairdresser", "nanny", "maestro", "skipper", "protege",
"philosopher", "captain", "architect", "financier"]
# this function computes the similarity of the vector
# to each of the members of the list of words.
# it returns a list of pairs of (word, similarity)
def vector_similarities_to_words(vec, words):
# 'distances' gives you 1 - cosine_similarity
    distances = space.distances(vec, other_words=words)
similarities = [1 - d for d in distances]
return list(zip(words, similarities))
# we apply this new function to our occupations
# to see how close each of them is to our "bias direction"
for occupation, sim in vector_similarities_to_words(bias, occupations):
print(occupation, sim)
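# aside: gensim can do this whole batch of cosine similarities in
# one call via the static method KeyedVectors.cosine_similarities.
# a sketch, assuming its (one vector, matrix of vectors) signature:
import numpy as np
occupation_matrix = np.array([space[w] for w in occupations])
print(space.cosine_similarities(bias, occupation_matrix))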
###
# Now we use vector rejections to remove bias from vectors.
# For this, we need a bit of math.
import math
# length of a vector: square root of sum of squared entries
def veclen(vec):
return math.sqrt(sum(dim*dim for dim in vec))
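# quick sanity check with a 3-4-5 triangle:
print(veclen([3, 4]))   # should be 5.0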
# vector projection:
# proj_p(x) = [ (x * p) / veclen(p)^2 ] * p
# This returns a vector.
def vecproject(vec, direction):
    # note the squared length in the denominator: dividing by
    # veclen(p) alone would overshoot by a factor of veclen(p)
    factor = sum(v * d for v, d in zip(vec, direction)) / veclen(direction) ** 2
    return [ d * factor for d in direction ]
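# quick sanity check: projecting (1, 1) onto the x-axis
print(vecproject([1, 1], [1, 0]))   # should be [1.0, 0.0]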
# vector rejection:
# rej_p(x) = x - proj_p(x)
# This returns a vector.
def vecreject(vec, direction):
vecprojection = vecproject(vec, direction)
return [ v - p for v, p in zip(vec, vecprojection)]
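# quick sanity check: rejecting the x-axis from (1, 1)
# keeps only the y-component
print(vecreject([1, 1], [1, 0]))   # should be [0.0, 1.0]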
# debiased vector:
# vector rejection removing the bias (vec(woman) - vec(man)) from the vector
# This returns a vector.
def debias(vec):
return vecreject(vec, bias)
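# the same rejection in numpy, as a vectorized sketch of the
# pure-Python functions above (not needed below, just for reference)
import numpy as np
def debias_np(vec):
    vec = np.asarray(vec)
    b = np.asarray(bias)
    return vec - (vec @ b) / (b @ b) * b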
# try it out:
# for each of our occupations above,
# let's look at the similarities to "man" and "woman"
# for both the original and the debiased vector
for occupation in occupations:
print("---")
for word, sim in vector_similarities_to_words(space[occupation], ["man", "woman"]):
print("original", occupation, word, sim)
for word, sim in vector_similarities_to_words(debias(space[occupation]), ["man", "woman"]):
print("debiased", occupation, word, sim)
# with the bias component removed, each occupation's similarities
# to "man" and "woman" should now come out nearly identical.
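# sanity check on the math: after rejection, every debiased vector
# should be orthogonal to the bias direction (dot product ~ 0)
import numpy as np
for occupation in occupations:
    deb = np.asarray(debias(space[occupation]))
    print(occupation, float(np.dot(deb, bias)))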