Python demo: using a pre-trained space

# what corpora and pre-trained models does gensim have?

import gensim.downloader as gensim_api

# the whole info: not easy to read

print(gensim_api.info())

# okay, let's break this down: we have corpora, and models

obj = gensim_api.info()

print("\n\nGensim has:", obj.keys())

# what models are there?

print("\n\nGensim models:\n", obj["models"].keys())

# let's find out more about one of them

print("\n\nHere is the info on the word2vec google news 300 model\n", obj["models"]["word2vec-google-news-300"])

# okay, let's download that one

# warning: this takes a while, the download is about 1.6 GB

space = gensim_api.load("word2vec-google-news-300")

# if you call the same command again, it doesn't re-download the space,

# it just loads the copy already on disk. Loading the model into memory still takes a while.
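
# if you just want to know where that copy lives on disk, the downloader can

# return the local path instead of loading the model. (This is a small extra

# check, and assumes your gensim version's downloader supports return_path;

# by default the files end up under ~/gensim-data.)

print(gensim_api.load("word2vec-google-news-300", return_path=True))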

# a few tests to see what we have

print(space.most_similar("science", topn=3))

print(space.most_similar("football", topn=3))

print(space.most_similar("idea", topn=3))

# let's try an analogy

# the idea of doing analogies this way is from Mikolov et al. (2013)

king_analog = space["king"] - space["man"] + space["woman"]

# what are the 10 nearest neighbors?

print(space.similar_by_vector(king_analog, topn=10))
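
# gensim also has a built-in analogy query that does roughly the same thing.

# it works on unit-normalized vectors and drops the query words from the

# results, so its list can differ slightly from the one above.

print(space.most_similar(positive=["king", "woman"], negative=["man"], topn=10))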

# Now we look at gender bias in word embeddings.

# The idea of doing "debiasing" in this way is from Andrew Ng,

# and is similar to an approach in

# Bolukbasi et al (2016): "Man is to Computer Programmer as Woman is to Homemaker?

# Debiasing Word Embeddings" (though Ng's approach is simpler)

# getting a bias vector: vec(woman) - vec(man)

bias = space["woman"] - space["man"]

# what are the words closest to this bias vector?

print(space.similar_by_vector(bias))

# let's check some particular occupations.

# these are the top-7 "he"-associated and "she"-associated occupations

# from Bolukbasi et al (2016)

occupations = ["homemaker", "nurse", "receptionist", "librarian", "socialite",

               "hairdresser", "nanny", "maestro", "skipper", "protege",

               "philosopher", "captain", "architect", "financier"]

# this function computes the similarity of the vector

# to each of the members of the list of words.

# it returns a list of pairs of (word, similarity)

def vector_similarities_to_words(vec, words):

    # 'distances' gives you 1 - cosine_similarity

    distances = space.distances(vec, other_words=words)

    similarities = [1 - d for d in distances]

    return list(zip(words, similarities))

# we apply this new function to our occupations

# to see how close each of them is to our "bias direction"

for occupation, sim in vector_similarities_to_words(bias, occupations):

    print(occupation, sim)
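
# optional: the same pairs again, sorted by similarity, so the most

# "woman"-leaning and the most "man"-leaning occupations are easy to spot

for occupation, sim in sorted(vector_similarities_to_words(bias, occupations),

                              key=lambda pair: pair[1], reverse=True):

    print(occupation, sim)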

###

# Now we use vector rejections to remove bias from vectors.

# For this, we need a bit of math.

import math

# length of a vector: square root of sum of squared entries

def veclen(vec):

    return math.sqrt(sum(dim*dim for dim in vec))
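
# quick sanity check on veclen with a 3-4-5 triangle: this should print 5.0

print(veclen([3, 4]))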

# vector projection:

# proj_p(x) = [ (x * p) / veclen(p)^2 ] * p

# This returns a vector.

def vecproject(vec, direction):

    factor = sum(v * d for v, d in zip(vec, direction)) / veclen(direction)**2

    return [ d * factor for d in direction ]
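
# quick sanity check: projecting [3, 4] onto the x-axis [1, 0] should keep

# only the x component, i.e. print [3.0, 0.0]

print(vecproject([3, 4], [1, 0]))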

# vector rejection:

# rej_p(x) = x - proj_p(x)

# This returns a vector.

def vecreject(vec, direction):

    vecprojection = vecproject(vec, direction)

    return [ v - p for v, p in zip(vec, vecprojection)]
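
# quick sanity check: the rejection is what is left after removing the

# projection, so rejecting [3, 4] from the x-axis [1, 0] should print [0.0, 4.0]

print(vecreject([3, 4], [1, 0]))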

# debiased vector:

# vector rejection removing the bias (vec(woman) - vec(man)) from the vector

# This returns a vector.

def debias(vec):

    return vecreject(vec, bias)
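
# quick sanity check: a debiased vector should have a (near-)zero dot product

# with the bias vector, up to floating-point error.

# ("nurse" is just one of the occupations from the list above.)

print(sum(v * b for v, b in zip(debias(space["nurse"]), bias)))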

# try it out:

# for each of our occupations above,

# let's look at the similarities to "man" and "woman"

# for both the original and the debiased vector

for occupation in occupations:

    print("---")

    for word, sim in vector_similarities_to_words(space[occupation], ["man", "woman"]):

        print("original", occupation, word, sim)

    for word, sim in vector_similarities_to_words(debias(space[occupation]), ["man", "woman"]):

        print("debiased", occupation, word, sim)

   

# looks like the debiasing overshoots a little.