# Word sense induction demo
# Katrin Erk, February 2020
# Warning: this method does not work particularly well as is.
# There is a lot of literature on how to make it better.

# We download a pre-computed space
import gensim.downloader as gensim_api
space = gensim_api.load("glove-wiki-gigaword-300")

# This gives us an embedding for a word, across all of its
# contexts.
# How can I get an embedding for a single context?
# The simplest option:
# get a context embedding by averaging
# over the word embeddings in the context.
import nltk

# let's get a list of medium-frequency content words
fd = nltk.FreqDist(nltk.corpus.brown.words())
print("some medium-frequency words in the Brown corpus")
for word, freq in fd.items():
    if freq >= 50 and freq <= 1000:
        print(word, freq)

# for example:
# fall, sweet, empty, side, show, character, box, window, feet, plants, fire
word_of_interest = "bar"

sentences_of_interest = [s for s in nltk.corpus.brown.sents() if word_of_interest in s]
sent0 = sentences_of_interest[0]
print("The first sentence with the word", word_of_interest, "is:", sent0)

import string
# a set gives us fast membership tests
stopwords = set(nltk.corpus.stopwords.words("english"))

sent0_cleaned = []
for w in sent0:
    if w.lower() == word_of_interest:
        # don't keep the target word itself
        pass
    elif w.lower() in stopwords or w.strip(string.punctuation) == "":
        # drop stopwords and punctuation
        pass
    elif w not in fd or fd[w] <= 20:
        # drop rare words
        pass
    else:
        sent0_cleaned.append(w.lower())

print("Our cleaned-up first context of the word", word_of_interest, "is:", sent0_cleaned)

# Here is how we make an embedding for this context of the word of interest
words_with_embeddings = [w for w in sent0_cleaned if w in space]
embedding0 = sum(space[w] for w in words_with_embeddings) / len(words_with_embeddings)

# What does this tell us?
# Since this has the same dimensionality as word vectors, we can check
# which word vectors are close to it.
print("The embedding of this sentence is somewhat similar to:\n",
      space.similar_by_vector(embedding0))

# We make embeddings for all the sentences
# with the word of interest
def make_contextvector(sent, word_of_interest, space):
    sent_lower = [w.lower() for w in sent]
    # same cleaning steps as above: drop the target word, stopwords,
    # punctuation, rare words, and words without an embedding
    sent_cleaned = [w for w in sent_lower
                    if w != word_of_interest
                    and w not in stopwords
                    and w.strip(string.punctuation) != ""
                    and w in fd and fd[w] > 20
                    and w in space]
    if len(sent_cleaned) > 0:
        return sum(space[w] for w in sent_cleaned) / len(sent_cleaned)
    else:
        return None

contextvectors = []
sentences_with_vectors = []
for sent in sentences_of_interest:
    embedding = make_contextvector(sent, word_of_interest, space)
    if embedding is not None:
        sentences_with_vectors.append(sent)
        contextvectors.append(embedding)

print("We got embeddings for", len(contextvectors), "out of",
      len(sentences_of_interest), "contexts.")

##########
# Now we cluster.
# We have to decide how many clusters to make.
# Let's try 4.
from sklearn.cluster import KMeans

numclusters = 4
kmeans_obj = KMeans(n_clusters=numclusters)
kmeans_obj.fit(contextvectors)
label_list = kmeans_obj.labels_

# Let's print the sentences that got clustered together.
for clusternumber in range(numclusters):
    print("\n\n")
    print("Sentences in cluster", clusternumber)
    for index, sent in enumerate(sentences_with_vectors):
        if label_list[index] == clusternumber:
            print(" ".join(sent))
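# A quick way to interpret the clusters (a sketch, not part of the
# original demo): each cluster centroid lives in the same space as the
# word vectors, so we can ask, just as we did for embedding0 above,
# which words are closest to each centroid.
for clusternumber in range(numclusters):
    centroid = kmeans_obj.cluster_centers_[clusternumber]
    print("Cluster", clusternumber, "centroid is most similar to:")
    print(space.similar_by_vector(centroid, topn=5))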
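# How do we know that 4 is a reasonable number of clusters? One common
# heuristic (a sketch, again not part of the original demo) is the
# silhouette score: higher values mean tighter, better-separated
# clusters. We fix random_state so the run is reproducible.
from sklearn.metrics import silhouette_score

for k in range(2, 9):
    km = KMeans(n_clusters=k, random_state=0).fit(contextvectors)
    print("k =", k, "silhouette score:",
          round(silhouette_score(contextvectors, km.labels_), 3))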