
Demo on word sense induction

# Word sense induction demo
# Katrin Erk February 2020
# Warning: this method does not work particularly well as is.
# There is a lot of literature on how to make this better.

# We download a pre-computed space
import gensim.downloader as gensim_api
space = gensim_api.load("glove-wiki-gigaword-300")
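
# A quick optional look at what the loaded space provides (these are
# standard gensim KeyedVectors calls; the exact neighbors will depend
# on the downloaded space):
print("Dimensionality of each word embedding:", space.vector_size)
print("Words similar to 'bar', across all its uses:", space.most_similar("bar", topn=5))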

# This gives us a single embedding for each word, computed across all of its
# contexts.
# How can I get an embedding for a single context?
# The simplest option:
# Get a context embedding by averaging
# over the word embeddings in the context.
import nltk
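# Note: the Brown corpus and the stopword list used below may need to be
# downloaded once, for example with:
#    nltk.download("brown")
#    nltk.download("stopwords")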

# let's get a list of medium-frequency content words
fd = nltk.FreqDist(nltk.corpus.brown.words())

print("some medium-frequency words in the Brown corpus")
for word, freq in fd.items():
    if 50 <= freq <= 1000:
        print(word, freq)

# for example:
# fall, sweet, empty, side, show, character, box, window, feet, plants, fire

word_of_interest = "bar"

sentences_of_interest = [s for s in nltk.corpus.brown.sents() if word_of_interest in s]

sent0 = sentences_of_interest[0]
print("The first sentence with the word", word_of_interest, "is:", sent0)

stopwords = nltk.corpus.stopwords.words("english")
import string

sent0_cleaned = []
for w in sent0:
    if w.lower() == word_of_interest:
        # don't keep the target word itself
        pass
    elif w.lower() in stopwords or w.strip(string.punctuation) == "":
        # drop stopwords and punctuation tokens
        pass
    elif w not in fd or fd[w] <= 20:
        # drop low-frequency words
        pass
    else:
        sent0_cleaned.append(w.lower())
       
print("Our cleaned-up first context of the word", word_of_interest, "is:", sent0_cleaned)

# Here is how we make an embedding for this context of the word of interest:
# keep only the context words that the space knows, and average their vectors.
words_with_embeddings = [w for w in sent0_cleaned if w in space]
embedding0 = sum(space[w] for w in words_with_embeddings) / len(words_with_embeddings)

# what does this tell us?
# since this has the same dimensionality as word vectors, we can check
# which word vectors are close to this
print("The embedding of this sentence is somewhat similar to:\n", space.similar_by_vector(embedding0))

# We make embeddings for all the sentences
# with the word of interest

def make_contextvector(sent, word_of_interest, space):
    # Clean up the sentence the same way as above: lowercase everything, and
    # drop the target word, stopwords, punctuation, low-frequency words, and
    # words that the space does not know.
    sent_lower = [w.lower() for w in sent]
    sent_cleaned = [w for w in sent_lower
                    if w != word_of_interest
                    and w not in stopwords
                    and w.strip(string.punctuation) != ""
                    and w in fd and fd[w] >= 20
                    and w in space]

    # Average the remaining word embeddings; if nothing is left, return None.
    if len(sent_cleaned) > 0:
        return sum(space[w] for w in sent_cleaned) / len(sent_cleaned)
    else:
        return None
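
# Quick check: apply the function to the first sentence from above. The result
# should be close to embedding0 (the cleanup steps differ in minor details, so
# the two vectors need not be exactly identical).
vec0 = make_contextvector(sent0, word_of_interest, space)
if vec0 is not None:
    print("Nearest words to the first context, using the function:\n", space.similar_by_vector(vec0))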

contextvectors = []
sentences_with_vectors = []
for sent in sentences_of_interest:
    embedding = make_contextvector(sent, word_of_interest, space)
    if embedding is not None:
        sentences_with_vectors.append(sent)
        contextvectors.append(embedding)

print("We got embeddings for", len(contextvectors), "out of", len(sentences_of_interest), "contexts.")

##########
# now we cluster.
# We have to decide how many clusters to make.
# Let's try 4.

from sklearn.cluster import KMeans

numclusters = 4
kmeans_obj = KMeans(n_clusters=numclusters)
kmeans_obj.fit(contextvectors)
label_list = kmeans_obj.labels_
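
# Note that KMeans is randomly initialized, so the clusters (and their
# numbering) can change from run to run; passing random_state to KMeans
# makes the result reproducible.
# A quick look at how many contexts ended up in each cluster:
from collections import Counter
print("Cluster sizes:", Counter(label_list))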

# Let's print the sentences that got clustered together.
for clusternumber in range(numclusters):
    print("\n\n")
    print("Sentences in cluster", clusternumber)
    for index, sent in enumerate(sentences_with_vectors):
        if label_list[index] == clusternumber:
            print(" ".join(sent))
