# Python demo: gensim
#
# This code uses the Gensim and NLTK packages to build tiny distributional models from two datasets, the Brown corpus and the NLTK movie review data. It then showcases Gensim functions for computing nearest neighbors and similarity.

# building on code from the NLTK cookbook

from gensim.models import Word2Vec

from nltk.corpus import brown

# WARNING: the following datasets are too small to make reasonable embeddings.

# I use them for demo purposes because they compute fast.

# On my machine, each of these spaces was computed in about 4 seconds.

#

# Build two small word2vec spaces from subcorpora of the Brown corpus.
#
# Both models are trained with the same hyperparameters. The keyword names
# below are the gensim >= 4.0 ones: gensim 4.0 renamed `iter` to `epochs`
# and `size` to `vector_size`, and the old names are rejected there.
#
#   epochs:      number of iterations (epochs) over the corpus
#   min_count:   only include words with a minimum frequency
#                as given here.
#   vector_size: number of dimensions of the embeddings
#   sg = 1:      use skipgram rather than CBOW

# fiction subcorpus, 68488 words
word2vec_fiction = Word2Vec(
    brown.sents(categories="fiction"),
    epochs=10,
    min_count=10,
    vector_size=300,
    sg=1,
)

# and another one: romance subcorpus, 70,022 words
word2vec_romance = Word2Vec(
    brown.sents(categories="romance"),
    epochs=10,
    min_count=10,
    vector_size=300,
    sg=1,
)

# Inspect nearest neighbors of a few probe words in both spaces.
# The trained vectors live on the `.wv` attribute of each model
# (word2vec_fiction.wv / word2vec_romance.wv); membership tests and
# most_similar() are both done on that object.

# money, love: report whether each space has the word, then list its
# neighbors in the fiction space followed by the romance space.
for word in ("money", "love"):
    print(f"We have entries for '{word}' in fiction", word in word2vec_fiction.wv, "and romance", word in word2vec_romance.wv)

    for genre, model in (("fiction", word2vec_fiction), ("romance", word2vec_romance)):
        print(f"nearest neighbors of '{word}' in {genre}:")
        for neighbor in model.wv.most_similar(word):
            print(neighbor)

# friend: only the membership check is shown; no neighbors are queried here.
print("We don't have entries for 'friend' in fiction", "friend" in word2vec_fiction.wv, "and romance", "friend" in word2vec_romance.wv)