# Courses > Python worksheets
#
# Python demo: using a pre-trained vector space

# what corpora and pre-trained models does gensim have?
import gensim.downloader as gensim_api

# gensim_api.info() returns the whole catalog as one big dict:
# not easy to read in one go.

# okay, let's break this down: the catalog has corpora, and models
obj = gensim_api.info()
print("\n\nGensim has:", obj.keys())

# what models are there?
print("\n\nGensim models:\n", obj["models"].keys())

# let's find out more about one of them
print("\n\nHere is the info on the word2vec google news 300 model\n", obj["models"]["word2vec-google-news-300"])

# okay, let's download that one
# warning: this takes a while, it's 1.6G
space = gensim_api.load("word2vec-google-news-300")
# if you call the same command again, it doesn't re-download the space,
# it just accesses what you have on disk. It still takes a while.

# a few tests to see what we have
print(space.most_similar("science", topn=3))
print(space.most_similar("football", topn=3))
print(space.most_similar("idea", topn=3))

# let's try an analogy: king - man + woman should land near "queen"
# the idea of doing analogies this way is from Mikolov et al. 2013
# ("Linguistic Regularities in Continuous Space Word Representations")
king_analog = space["king"] - space["man"] + space["woman"]
# what are the 10 nearest neighbors?
# (most_similar also accepts raw vectors in its 'positive' list)
print(space.most_similar(positive=[king_analog], topn=10))

# Now we look at gender bias in word embeddings.
# The idea of doing "debiasing" in this way is from Andrew Ng,
# and is similar to an approach in
# Bolukbasi et al (2016): "Man is to Computer Programmer as Woman is to Homemaker?
# Debiasing Word Embeddings" (though Ng's approach is simpler)

# getting a bias vector: vec(woman) - vec(man)
# This is a direction in embedding space pointing from "man"-like
# usage toward "woman"-like usage.
bias = space["woman"] - space["man"]
# what are the words closest to this bias vector?
# (exercise: try space.most_similar(positive=[bias], topn=10))

# let's check some particular occupations.
# these are the top-7 "he"-associated and "she"-associated occupations
# from Bolukbasi et al (2016)
occupations = ["homemaker", "nurse", "receptionist", "librarian", "socialite",
               "hairdresser", "nanny", "maestro", "skipper", "protege",
               "philosopher", "captain", "architect", "financier"]

def vector_similarities_to_words(vec, words):
    """Compute the cosine similarity of vec to each word in words.

    Returns a list of (word, similarity) pairs, in the same order
    as the input words.
    """
    # gensim's distances() reports 1 - cosine_similarity,
    # so we flip each distance back into a similarity.
    dists = space.distances(vec,  other_words = words)
    return [(word, 1 - dist) for word, dist in zip(words, dists)]

# apply the new function to our occupations to see how close
# each one lies to the "bias direction" vec(woman) - vec(man)
for occ, similarity in vector_similarities_to_words(bias, occupations):
    print(occ, similarity)

# Now we use vector rejections to remove bias from vectors.
# For this, we need a bit of math.
import math

# length of a vector: square root of sum of squared entries
def veclen(vec):
    """Euclidean (L2) length of vec."""
    squared = [component * component for component in vec]
    return math.sqrt(sum(squared))

# vector projection:
# proj_p(x) = [ (x . p) / |p|^2 ] * p
# This returns a vector.
def vecproject(vec, direction):
    """Project vec onto direction; returns the projection as a list.

    BUG FIX: the scalar factor is the dot product divided by the
    *squared* length of the direction vector, (x . p) / |p|^2.
    The previous version divided by |p| only, which inflates the
    result by a factor of |p| whenever direction is not unit-length
    (word2vec vectors are not unit-length).
    """
    # squared length of the direction vector, |p|^2
    norm_sq = sum(d * d for d in direction)
    factor = sum(v * d for v, d in zip(vec, direction)) / norm_sq
    return [ d * factor for d in direction ]

# vector rejection:
# rej_p(x) = x - proj_p(x)
# This returns a vector.
def vecreject(vec, direction):
    """Remove from vec its component along direction."""
    proj = vecproject(vec, direction)
    return [component - p for component, p in zip(vec, proj)]

# debiased vector:
# vector rejection removing the bias (vec(woman) - vec(man)) from the vector
# This returns a vector.
def debias(vec):
    """Return vec with its component along the global bias direction removed."""
    debiased_vec = vecreject(vec, bias)
    return debiased_vec

# try it out:
# for each occupation, compare similarities to "man" and "woman"
# before and after debiasing its vector
for occupation in occupations:
    probe_words = ["man", "woman"]

    original_pairs = vector_similarities_to_words(space[occupation], probe_words)
    for word, sim in original_pairs:
        print("original", occupation, word, sim)

    debiased_pairs = vector_similarities_to_words(debias(space[occupation]), probe_words)
    for word, sim in debiased_pairs:
        print("debiased", occupation, word, sim)
# looks like the debiasing overshoots a little.