# Word sense induction demo
# Katrin Erk February 2020
# Warning: this method does not work particularly well as is.
# There is a lot of literature on how to make this better.
# We download a pre-computed space
import string   # used below to strip punctuation
import nltk     # run nltk.download("brown") and nltk.download("stopwords") once if needed
import gensim.downloader as gensim_api
space = gensim_api.load("glove-wiki-gigaword-300")
# This gives us a single embedding for each word, averaged over all of its contexts.
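# A quick illustration (an aside, not part of the method): the nearest
# neighbors of "bar" typically mix several of its senses, because the
# one GloVe vector averages over every context of "bar" in the corpus.
print(space.most_similar("bar", topn=10))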
# How can I get an embedding for a single context?
# The simplest option:
# Get a context embedding by averaging
# over the word embeddings in the context.
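# A minimal sketch of that idea before we do it properly (the word pair
# "drink"/"counter" is just an illustrative choice): average two vectors
# and look at what lies near the result.
example_average = (space["drink"] + space["counter"]) / 2
print("Near the average of 'drink' and 'counter':",
      space.similar_by_vector(example_average, topn=5))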
# let's get a list of medium-frequency content words
fd = nltk.FreqDist(nltk.corpus.brown.words())
print("some medium-frequency words in the Brown corpus")
for word, freq in fd.items():
    if 50 <= freq <= 1000:
        print(word, end=" ")
print()
# for example:
# fall, sweet, empty, side, show, character, box, window, feet, plants, fire
word_of_interest = "bar"
sentences_of_interest = [s for s in nltk.corpus.brown.sents() if word_of_interest in s]
sent0 = sentences_of_interest[0]
print("The first sentence with the word", word_of_interest, "is:", sent0)
stopwords = nltk.corpus.stopwords.words("english")
sent0_cleaned = []
for w in sent0:
    if w.lower() == word_of_interest:
        # don't keep the target word itself
        continue
    elif w.lower() in stopwords or w.strip(string.punctuation) == "":
        # drop stopwords and punctuation-only tokens
        continue
    elif w not in fd or fd[w] <= 20:
        # drop rare words, whose embeddings tend to be unreliable
        continue
    else:
        sent0_cleaned.append(w.lower())
print("Our cleaned-up first context of the word", word_of_interest, "is:", sent0_cleaned)
# Here is how we make an embedding for this context of the word of interest
words_with_embeddings = [w for w in sent0_cleaned if w in space]
embedding0 = sum(space[w] for w in words_with_embeddings) / len(words_with_embeddings)
# what does this tell us?
# since this has the same dimensionality as word vectors, we can check
# which word vectors are close to this
print("The embedding of this sentence is somewhat similar to:\n", space.similar_by_vector(embedding0))
# We make embeddings for all the sentences
# with the word of interest
def make_contextvector(sent, word_of_interest, space):
    sent_lower = [w.lower() for w in sent]
    sent_cleaned = [w for w in sent_lower
                    if w != word_of_interest
                    and w not in stopwords
                    and w.strip(string.punctuation) != ""
                    and w in fd and fd[w] >= 20
                    and w in space]
    if len(sent_cleaned) > 0:
        return sum(space[w] for w in sent_cleaned) / len(sent_cleaned)
    # falls through to return None when no usable context words remain
contextvectors = []
sentences_with_vectors = []
for sent in sentences_of_interest:
    embedding = make_contextvector(sent, word_of_interest, space)
    if embedding is not None:
        contextvectors.append(embedding)
        sentences_with_vectors.append(sent)
print("We got embeddings for", len(contextvectors), "out of", len(sentences_of_interest), "contexts.")
# Now we cluster.
# We have to decide how many clusters to make.
# Let's try 4 for now (a sketch of a more principled way to choose
# this number follows at the end).
from sklearn.cluster import KMeans
numclusters = 4
kmeans_obj = KMeans(n_clusters=numclusters)
kmeans_obj.fit(contextvectors)
label_list = kmeans_obj.labels_
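# A quick look at how the contexts were distributed over the clusters
# (collections.Counter is just a convenient way to tally the labels):
from collections import Counter
print("Cluster sizes:", Counter(label_list))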
# Let's print the sentences that got clustered together.
for clusternumber in range(numclusters):
    print("Sentences in cluster", clusternumber)
    for index, sent in enumerate(sentences_with_vectors):
        if label_list[index] == clusternumber:
            print(" ".join(sent))
    print()
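# The choice of 4 clusters was arbitrary. One standard heuristic, sketched
# here under the assumption that a few candidate values suffice, is to
# compare silhouette scores across cluster counts and prefer higher scores:
import numpy as np
from sklearn.metrics import silhouette_score
X = np.array(contextvectors)
for k in range(2, 8):
    candidate_labels = KMeans(n_clusters=k).fit_predict(X)
    print("k =", k, "silhouette score =", round(float(silhouette_score(X, candidate_labels)), 3))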