Demo: count-based distributional models

####################

# Demo of how to compute a count-based distributional model

# Katrin Erk February 2020

# This code is meant to demonstrate a simple distributional model.

# It is *not* optimized to deal with large amounts of data!

###########

# Gathering counts for a count-based model from scratch

#

# Step 1:

# Word counting with the Natural Language Toolkit

# counting individual words

corpus = """I am Sam

Sam I am

I do not like green eggs and ham"""

words = corpus.split()

import nltk

unigram_counts = nltk.FreqDist(words)

print("Word counts in the Green Eggs corpus:", unigram_counts)

# Step 2: we need to get the context around a target word.

# The following is not intended for use at scale,

# but will demonstrate what a context looks like:

# A trigram is a sequence of 3 words. If the middle one is the target,

# the ones on either side are the context.

# (The same would work with 5-grams)

print("Trigrams of words in the Green Eggs corpus:\n", nltk.trigrams(words))

# Step 3: For each target word, count context words

context_counts = nltk.ConditionalFreqDist()

for precontext, target, postcontext in nltk.trigrams(words):
    context_counts[target][precontext] += 1
    context_counts[target][postcontext] += 1

# since we counted each middle word of a trigram as the target, and

# counted its left and right neighbors as a context, we undercount:

# We are missing the very first word as a target, and the very last.

# Fix that.

# leftmost word as target, right neighbor as context

context_counts[words[0]][words[1]] += 1

# rightmost word as target, left neighbor as context

context_counts[words[-1]][words[-2]] += 1

print("Contexts counts from the Green Eggs corpus:\n",list(context_counts.items()))

print("The same thing as a counts matrix")

colnames = sorted(unigram_counts.keys())

print("TARGET", end = "\t")

for c in colnames: print(c, end=" ")

print()

for target in context_counts.conditions():
    print(target, end = "\t")
    for c in colnames:
        print(context_counts[target][c], end = " ")
    print()
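# An aside, not part of the original demo: the same counting can be done with a
# symmetric window of configurable size, which also handles the corpus edges
# without a separate fix. With window=1 this reproduces the trigram-based counts
# above; window=2 corresponds to the 5-gram case mentioned earlier.
# (window_counts and wordlist are illustrative names, not from the original.)
def window_counts(wordlist, window=1):
    counts = nltk.ConditionalFreqDist()
    for i, target in enumerate(wordlist):
        # look at the neighbors up to `window` positions to the left and right
        for j in range(max(0, i - window), min(len(wordlist), i + window + 1)):
            if j != i:
                counts[target][wordlist[j]] += 1
    return counts

print("Window-based context counts (window=1):\n", list(window_counts(words).items()))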

####################

# context counts in a somewhat larger corpus

import nltk

print("The first few trigrams from the Brown corpus:\n", list(nltk.trigrams(list(nltk.corpus.brown.words())))[:10])

brown_counts = nltk.ConditionalFreqDist()

words = list(nltk.corpus.brown.words())

for precontext, target, postcontext in nltk.trigrams(words):
    brown_counts[target][precontext] += 1
    brown_counts[target][postcontext] += 1

# again, fixing undercounts of very first and very last word as target

# leftmost word as target, right neighbor as context

brown_counts[words[0]][words[1]] += 1

# rightmost word as target, left neighbor as context

brown_counts[words[-1]][words[-2]] += 1

# 10 most frequent context words: similar across many target words
# (what can we do about that? The PMI re-weighting below is one answer.)

print("10 most frequent contexts for some targets:")

print("election:\n", brown_counts["election"].most_common(10))

print("love:\n", brown_counts["love"].most_common(10))

print("car:", brown_counts["car"].most_common(10))

# 100 most frequent context words: now we are starting to see differences.

# We also see that many of the 100 most frequent context words only have counts of one.

print("100 most frequent contexts for some targets:")

print("election:\n", brown_counts["election"].most_common(100))

print("love:\n", brown_counts["love"].most_common(100))

print("car:\n", brown_counts["car"].most_common(100))

# some ambiguous words

print("Some ambiguous words:")

print("bat:\n", brown_counts["bat"].most_common(100))

print("bank:\n", brown_counts["bank"].most_common(100))

print("bar:\n", brown_counts["bar"].most_common(100))

print("leave:\n", brown_counts["leave"].most_common(100))

##############

# Changing the weights to reflect degree of association

# rather than raw counts:

# down-weight words like "of", "the", up-weight words like

# "romantic" as context for "love"

# pointwise mutual information (PMI):
#
#                        P(t, c)
#   PMI(t, c) = log  --------------
#                      P(t) P(c)
#
# #(t, c): the co-occurrence count of t with c
# #(_, _): the sum of counts in the whole table, across all targets
# #(t, _): the sum of counts in the row of target t
# #(_, c): the sum of counts in the column of context item c
#
# then
#   P(t, c) = #(t, c) / #(_, _)
#   P(t)    = #(t, _) / #(_, _)
#   P(c)    = #(_, c) / #(_, _)
#
# positive PMI (PPMI) clips negative values to zero:
#
#   PPMI(t, c) = PMI(t, c)   if PMI(t, c) >= 0
#                0           otherwise
#
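# A tiny worked example first (all numbers here are invented for illustration):
# suppose these co-occurrence counts
#
#              romantic   the
#   love          2        6
#   war           1        7
#
# Then #(_, _) = 16, #(love, _) = #(war, _) = 8, #(_, romantic) = 3, #(_, the) = 13.
import math

toy_counts = {("love", "romantic"): 2, ("love", "the"): 6,
              ("war", "romantic"): 1, ("war", "the"): 7}
toy_all = 16
toy_t = {"love": 8, "war": 8}
toy_c = {"romantic": 3, "the": 13}

for (t, c), count in toy_counts.items():
    pmi = math.log((count / toy_all) / ((toy_t[t] / toy_all) * (toy_c[c] / toy_all)))
    print(t, c, "PMI:", round(pmi, 3), "PPMI:", round(max(pmi, 0.0), 3))

# As a context of "love", "romantic" gets a clearly positive weight, while the
# frequent "the" comes out negative and is clipped to zero by PPMI.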

import math

# #(t, _)

count_t = { }

for target in brown_counts.conditions():
    count_t[target] = brown_counts[target].N()

# #(_, _)

count_all = sum(count_t.values())

# #(_, c)

count_c = { }

for target in brown_counts.conditions():
    for context, count in brown_counts[target].items():
        count_c[context] = count_c.get(context, 0) + count

brown_ppmi = { }

for target in brown_counts.conditions():
    for context in brown_counts[target].keys():
        p_t_c = brown_counts[target][context] / count_all
        p_t = count_t.get(target, 0) / count_all
        p_c = count_c.get(context, 0) / count_all
        pmi = math.log(p_t_c / (p_t * p_c))
        if pmi >= 0:
            brown_ppmi[(target, context)] = pmi
        else:
            brown_ppmi[(target, context)] = 0.0

# checking on 'love'

love_weights = [ ]

for target, context in brown_ppmi.keys():
    if target == "love":
        love_weights.append((brown_ppmi[(target, context)], context))

# sort contexts of "love" by ppmi

print("Contexts of 'love', by PPMI weight")

for weight, context in sorted(love_weights, reverse = True):
    print(context, weight)

##############

# Instead of having words as context, have documents as context

documents = [ "I am Sam", "Sam I am", "I do not like green eggs and ham"]

document_counts = nltk.ConditionalFreqDist()

for documentindex, document in enumerate(documents):
    for word in document.split():
        document_counts[word][documentindex] += 1

print("Document counts from the Green Eggs multi-document corpus:\n", list(document_counts.items()))

print("The same as a term/document matrix")

colnames = ["doc0", "doc1", "doc2"]

print("TARGET", end = "\t")

for c in colnames: print(c, end=" ")

print()

for target in document_counts.conditions():
    print(target, end = "\t")
    for docindex, doclabel in enumerate(colnames):
        print(document_counts[target][docindex], end = " ")
    print()

##################

# gensim can do document/term matrices for you

import gensim

# running gensim's preprocessing

tokenized_documents = [gensim.utils.simple_preprocess(doc) for doc in documents]

print("This is what gensim makes of our documents:\n", tokenized_documents)

# make an object that can index words in the documents

gensim_dict = gensim.corpora.Dictionary()

# and do the counting

gensim_counts = [gensim_dict.doc2bow(doc, allow_update=True) for doc in tokenized_documents]

# what did we get?

print("Here are the counts that gensim got for the Green Eggs multi-document corpus:\n", gensim_counts)

# er... what does that mean?

print("The whole thing more readably:")

for docindex, doc_counts in enumerate(gensim_counts):
    print("document", docindex)
    for wordid, count in doc_counts:
        print("\t", gensim_dict[wordid], count)

###################

# TF/IDF in gensim

# making a TF/IDF model. This does not yet create the space!

tfidf_model = gensim.models.TfidfModel(gensim_counts)

tfidf_space = tfidf_model[gensim_counts]

for docindex, doc_weights in enumerate(tfidf_space):
    print("document", docindex)
    for wordid, weight in doc_weights:
        print("\t", gensim_dict[wordid], weight)

# This does not work well with our Green Eggs corpus: every word there (after
# preprocessing) occurs just once per document, and the words within a document
# also share the same document frequency, so all weights within a document come
# out equal.
# Let's use a different corpus
# where words differ in how many documents they occur in,
# and where they differ in how often they appear in a document.

animalcorpus = ["hippo armadillo badger", "badger elephant", "armadillo mouse mouse"]

tokenized_animals = [gensim.utils.simple_preprocess(doc) for doc in animalcorpus]

gensim_dict_animals = gensim.corpora.Dictionary()

gensim_counts_animals = [gensim_dict_animals.doc2bow(doc, allow_update=True) for doc in tokenized_animals]

tfidf_model_animals = gensim.models.TfidfModel(gensim_counts_animals)

tfidf_space_animals = tfidf_model_animals[gensim_counts_animals]

print("raw counts")

for docindex, doc_counts in enumerate(gensim_counts_animals):
    print("document", docindex)
    for wordid, count in doc_counts:
        print("\t", gensim_dict_animals[wordid], count)

print("tf/idf weights")

for docindex, doc_weights in enumerate(tfidf_space_animals):
    print("document", docindex)
    for wordid, weight in doc_weights:
        print("\t", gensim_dict_animals[wordid], weight)