
# Word sense induction demo

# Katrin Erk February 2020

# Warning: this method does not work particularly well as is.

# There is a lot of literature on how to make this better.

# We download a pre-computed space

import gensim.downloader as gensim_api

space = gensim_api.load("glove-wiki-gigaword-300")
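# Quick sanity check (an addition, not part of the original demo): the loaded
# object behaves like a gensim KeyedVectors store, so we can ask directly
# for the nearest neighbors of a word.
print(space.most_similar("bank", topn=5))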

# This gives us a single embedding for each word, aggregated
# across all of its contexts.

# How can I get an embedding for a single context?

# The simplest option:

# Get a context embedding by averaging

# over the word embeddings in the context.
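# For instance (a toy illustration only, not used in the pipeline below),
# averaging the vectors for "coffee" and "cup" yields a single vector
# with the same dimensionality as the word vectors.
toy_average = (space["coffee"] + space["cup"]) / 2
print("Shape of the averaged toy vector:", toy_average.shape)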

import nltk
# The Brown corpus and the stopword list need to be downloaded once:
# nltk.download("brown"); nltk.download("stopwords")

# Let's get a list of medium-frequency words from the Brown corpus

fd = nltk.FreqDist(nltk.corpus.brown.words())

print("some medium-frequency words in the Brown corpus")

for word, freq in fd.items():
    if 50 <= freq <= 1000:
        print(word, freq)

# for example:

# fall, sweet, empty, side, show, character, box, window, feet, plants, fire

word_of_interest = "bar"

sentences_of_interest = [s for s in nltk.corpus.brown.sents() if word_of_interest in s]
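# It helps to know how many contexts we are working with (an addition to the demo).
print("Number of Brown sentences containing", word_of_interest, ":", len(sentences_of_interest))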

sent0 = sentences_of_interest[0]

print("The first sentence with the word", word_of_interest, "is:", sent0)

stopwords = nltk.corpus.stopwords.words("english")

import string

sent0_cleaned = []

for w in sent0:
    if w.lower() == word_of_interest:
        # don't keep the target word itself
        pass
    elif w.lower() in stopwords or w.strip(string.punctuation) == "":
        # drop stopwords and punctuation-only tokens
        pass
    elif w not in fd or fd[w] <= 20:
        # drop words that are rare in the Brown corpus
        pass
    else:
        sent0_cleaned.append(w.lower())

print("Our cleaned-up first context of the word", word_of_interest, "is:", sent0_cleaned)

# Here is how we make an embedding for this context of the word of interest

words_with_embeddings = [w for w in sent0_cleaned if w in space]

embedding0 = sum(space[w] for w in words_with_embeddings) / len(words_with_embeddings)

# What does this tell us?
# Since the context embedding has the same dimensionality as the word vectors,
# we can check which word vectors are close to it.

print("The embedding of this sentence is somewhat similar to:\n", space.similar_by_vector(embedding0))

# Now we make embeddings for all the sentences
# that contain the word of interest.

def make_contextvector(sent, word_of_interest, space):
    # Average the embeddings of the context words: lowercase the sentence,
    # drop the target word, stopwords, punctuation-only tokens, rare words,
    # and words without an embedding, then average what is left.
    # (We reuse the stopword list computed above instead of rebuilding it per word.)
    sent_lower = [w.lower() for w in sent]
    sent_cleaned = [w for w in sent_lower
                    if w != word_of_interest
                    and w not in stopwords
                    and w.strip(string.punctuation) != ""
                    and w in fd and fd[w] >= 20
                    and w in space]
    if len(sent_cleaned) > 0:
        return sum(space[w] for w in sent_cleaned) / len(sent_cleaned)
    else:
        return None

contextvectors = [ ]

sentences_with_vectors = [ ]

for sent in sentences_of_interest:
    embedding = make_contextvector(sent, word_of_interest, space)
    if embedding is not None:
        sentences_with_vectors.append(sent)
        contextvectors.append(embedding)

print("We got embeddings for", len(contextvectors), "out of", len(sentences_of_interest), "contexts.")

##########

# Now we cluster the context vectors.
# We have to decide how many clusters to make.
# Let's try 4.

from sklearn.cluster import KMeans

numclusters = 4

kmeans_obj = KMeans(n_clusters=numclusters)

kmeans_obj.fit(contextvectors)

label_list = kmeans_obj.labels_
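# The choice of 4 clusters above was arbitrary. One way to choose the number
# of clusters (a sketch, not part of the original demo) is to compare
# silhouette scores for several candidate values of k; higher scores indicate
# tighter, better-separated clusters.
from sklearn.metrics import silhouette_score
for k in range(2, 7):
    candidate_labels = KMeans(n_clusters=k, n_init=10, random_state=0).fit_predict(contextvectors)
    print("k =", k, "silhouette score:", silhouette_score(contextvectors, candidate_labels))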

# Let's print the sentences that got clustered together.

for clusternumber in range(numclusters):
    print("\n\n")
    print("Sentences in cluster", clusternumber)
    for index, sent in enumerate(sentences_with_vectors):
        if label_list[index] == clusternumber:
            print(" ".join(sent))