
Demo: count-based distributional models

####################
# Demo of how to compute a count-based distributional model
# Katrin Erk February 2020
# This code is meant to demonstrate a simple distributional model.
# It is *not* optimized to deal with large amounts of data!

###########
# Gathering counts for a count-based model from scratch
#
# Step 1:
# Word counting with the Natural Language Toolkit
# counting individual words
corpus = """I am Sam
Sam I am
I do not like green eggs and ham"""

words = corpus.split()

import nltk
unigram_counts = nltk.FreqDist(words)

print("Word counts in the Green Eggs corpus:", unigram_counts)

# Step 2: we need to get the context around a target word.
# The following is not intended for use at scale,
# but will demonstrate what a context looks like:
# a trigram is a sequence of 3 words. If the middle one is the target,
# the ones on either side are the context.
# (The same would work with 5-grams; see the sketch below.)
print("Trigrams of words in the Green Eggs corpus:\n", list(nltk.trigrams(words)))

# Step 3: For each target word, count context words
context_counts = nltk.ConditionalFreqDist()

for precontext, target, postcontext in nltk.trigrams(words):
    context_counts[target][precontext] += 1
    context_counts[target][postcontext] += 1

# since we counted each middle word of a trigram as the target, and
# counted its left and right neighbors as a context, we undercount:
# We are missing the very first word as a target, and the very last.
# Fix that.

# leftmost word as target, right neighbor as context
context_counts[words[0]][words[1]] += 1
# rightmost word as target, left neighbor as context
context_counts[words[-1]][words[-2]] += 1
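
# A quick look-up sketch (an addition to the original demo): indexing the
# ConditionalFreqDist with a target word gives a FreqDist of its context counts.
print("Context counts for 'Sam':", context_counts["Sam"].most_common())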


print("Contexts counts from the Green Eggs corpus:\n",list(context_counts.items()))

print("The same thing as a counts matrix")
colnames = sorted(unigram_counts.keys())
print("TARGET", end = "\t")
for c in colnames: print(c, end=" ")
print()
for target in context_counts.conditions():
    print(target, end = "\t")
    for c in colnames:
        print(context_counts[target][c], end = " ")
    print()

####################
# context counts in a somewhat larger corpus
import nltk

print("The first few trigrams from the Brown corpus:\n", list(nltk.trigrams(list(nltk.corpus.brown.words())))[:10])

brown_counts = nltk.ConditionalFreqDist()
words = list(nltk.corpus.brown.words())
for precontext, target, postcontext in nltk.trigrams(words):
    brown_counts[ target][precontext] += 1
    brown_counts[ target][postcontext] += 1

# again, fixing undercounts of very first and very last word as target
# leftmost word as target, right neighbor as context
brown_counts[words[0]][words[1]] += 1
# rightmost word as target, left neighbor as context
brown_counts[words[-1]][words[-2]] += 1

   
# 10 most frequent context words: similar across many items
# (what can we do about that?)
print("10 most frequent contexts for some targets:")
print("election:\n", brown_counts["election"].most_common(10))
print("love:\n", brown_counts["love"].most_common(10))
print("car:", brown_counts["car"].most_common(10))

# 100 most frequent context words: now we are starting to see differences.
# We also see that many of the 100 most frequent context words only have counts of one.
print("100 most frequent contexts for some targets:")
print("election:\n", brown_counts["election"].most_common(100))
print("love:\n", brown_counts["love"].most_common(100))
print("car:\n", brown_counts["car"].most_common(100))

# some ambiguous words
print("Some ambiguous words:")
print("bat:\n", brown_counts["bat"].most_common(100))
print("bank:\n", brown_counts["bank"].most_common(100))
print("bar:\n", brown_counts["bar"].most_common(100))
print("leave:\n", brown_counts["leave"].most_common(100))


##############
# Changing the weights to reflect degree of association
# rather than raw counts:
# down-weight words like "of", "the", up-weight words like
# "romantic" as context for "love"

# pointwise mutual information (PMI):
#                    P(t, c)
# PMI(t, c) = log --------------
#                   P(t) P(c)
#
#    #(t, c): the co-occurrence count of t with c
#    #(_, _): the sum of counts in the whole table, across all targets
#    #(t, _): the sum of counts in the row of target t
#    #(_, c): the sum of counts in the column of context item c
#
# then
# P(t, c) = #(t, c) / #(_, _)
# P(t) = #(t, _) / #(_, _)
# P(c) = #(_, c) / #(_, _)
#
# PPMI(t, c) = { PMI(t, c) if PMI(t, c) >= 0
#                0, else
#
import math
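
# A tiny worked example of the PMI formula above (a sketch with made-up
# numbers, not part of the original demo). Suppose this 2x2 count table:
#                   context "romantic"   context "the"
#   target "love"          2                  10
#   target "car"           0                  20
# Then #(_,_) = 32, #(love,_) = 12, #(_,romantic) = 2, #(_,the) = 30.
toy_total = 32
p_love_romantic = 2 / toy_total     # P(t, c)
p_love = 12 / toy_total             # P(t)
p_romantic = 2 / toy_total          # P(c)
print("PMI(love, romantic):", math.log(p_love_romantic / (p_love * p_romantic)))
p_love_the = 10 / toy_total
p_the = 30 / toy_total
print("PMI(love, the):", math.log(p_love_the / (p_love * p_the)))
# "romantic" gets a positive PMI with "love", while the frequent but
# uninformative "the" ends up below zero and would be clipped to 0 by PPMI:
# exactly the down-weighting we want.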

# #(t, _)
count_t = { }
for target in brown_counts.conditions():
    count_t[ target ] = brown_counts[target].N()

# #(_, _)
count_all = sum(count_t.values())

# #(_, c)
count_c = { }
for target in brown_counts.conditions():
    for context, count in brown_counts[target].items():
        count_c[ context ] = count_c.get(context, 0) + count

brown_ppmi = { }
for target in brown_counts.conditions():
    for context in brown_counts[target].keys():
        p_t_c = brown_counts[target][context] / count_all
        p_t = count_t.get(target, 0) / count_all
        p_c = count_c.get(context, 0) / count_all

        pmi = math.log( p_t_c / (p_t * p_c))
        if pmi >= 0:
            brown_ppmi[ (target, context) ] = pmi
        else:
            brown_ppmi[ (target, context) ] = 0.0



# checking on 'love'
love_weights = [ ]
for target, context in brown_ppmi.keys():
    if target == "love":
        love_weights.append( (brown_ppmi[ (target, context) ], context))
# sort contexts of "love" by ppmi
print("Contexts of 'love', by PPMI weight")
for weight, context in sorted(love_weights, reverse = True):
    print(context, weight)

##############
# Instead of having words as context, have documents as context

documents = [ "I am Sam", "Sam I am", "I do not like green eggs and ham"]
document_counts = nltk.ConditionalFreqDist()

for documentindex, document in enumerate(documents):
    for word in document.split():
        document_counts[word][documentindex] += 1

print("Document counts from the Green Eggs multi-document corpus:\n", list(document_counts.items()))

print("The same as a term/document matrix")
colnames = ["doc0", "doc1", "doc2"]
print("TARGET", end = "\t")
for c in colnames: print(c, end=" ")
print()
for target in document_counts.conditions():
    print(target, end = "\t")
    for docindex, doclabel in enumerate(colnames):
        print(document_counts[target][docindex], end = " ")
    print()

##################
# gensim can do document/term matrices for you
import gensim

# running gensim's preprocessing
tokenized_documents = [gensim.utils.simple_preprocess(doc) for doc in documents]
print("This is what gensim makes of our documents:\n", tokenized_documents)
# make an object that can index words in the documents
gensim_dict = gensim.corpora.Dictionary()
# and do the counting
gensim_counts = [gensim_dict.doc2bow(doc, allow_update=True) for doc in tokenized_documents]
# what did we get?
print("Here are the counts that gensim got for the Green Eggs multi-document corpus:\n", gensim_counts)
# er... what does that mean?
print("The whole thing more readably:")
for docindex, doc_counts in enumerate(gensim_counts):
    print("document", docindex)
    for wordid, count in doc_counts:
        print("\t", gensim_dict[ wordid], count)


###################
# TF/IDF in gensim

# making a TF/IDF model. This does not yet create the space!
tfidf_model = gensim.models.TfidfModel(gensim_counts)

tfidf_space = tfidf_model[gensim_counts]

for docindex, doc_weights in enumerate(tfidf_space):
    print("document", docindex)
    for wordid, weight in doc_weights:
        print("\t", gensim_dict[wordid], weight)

# This does not work well with our Green Eggs corpus.
# Let's use a different one
# where words differ in how many documents they occur in,
# and where they differ in how often they appear in a document.

animalcorpus = ["hippo armadillo badger", "badger elephant", "armadillo mouse mouse"]
tokenized_animals = [gensim.utils.simple_preprocess(doc) for doc in animalcorpus]
gensim_dict_animals = gensim.corpora.Dictionary()
gensim_counts_animals = [gensim_dict_animals.doc2bow(doc, allow_update=True) for doc in tokenized_animals]
tfidf_model_animals = gensim.models.TfidfModel(gensim_counts_animals)
tfidf_space_animals = tfidf_model_animals[gensim_counts_animals]

print("raw counts")
for docindex, doc_counts in enumerate(gensim_counts_animals):
    print("document", docindex)
    for wordid, count in doc_counts:
        print("\t", gensim_dict_animals[ wordid], count)

print("tf/idf weights")
for docindex, doc_weights in enumerate(tfidf_space_animals):
    print("document", docindex)
    for wordid, weight in doc_weights:
        print("\t", gensim_dict_animals[wordid], weight)


