
### Demo: count-based distributional models

```python
####################
# Demo of how to compute a count-based distributional model
# Katrin Erk, February 2020
# This code is meant to demonstrate a simple distributional model.
# It is *not* optimized to deal with large amounts of data!

###########
# Gathering counts for a count-based model from scratch
#
# Step 1:
# Word counting with the Natural Language Toolkit:
# counting individual words
import nltk

corpus = """I am Sam
Sam I am
I do not like green eggs and ham"""

words = corpus.split()
unigram_counts = nltk.FreqDist(words)
print("Word counts in the Green Eggs corpus:", unigram_counts)

# Step 2: we need to get the context around a target word.
# The following is not intended for use at scale,
# but will demonstrate what a context looks like:
# a trigram is a sequence of 3 words. If the middle one is the target,
# the ones on either side are the context.
# (The same would work with 5-grams; see the sketch below.)
# nltk.trigrams returns a generator, so we wrap it in list() for printing.
print("Trigrams of words in the Green Eggs corpus:\n", list(nltk.trigrams(words)))

# Step 3: for each target word, count context words
context_counts = nltk.ConditionalFreqDist()
for precontext, target, postcontext in nltk.trigrams(words):
    context_counts[target][precontext] += 1
    context_counts[target][postcontext] += 1

# Since we counted each middle word of a trigram as the target, and
# counted its left and right neighbors as context, we undercount:
# we are missing the very first word as a target, and the very last.
# Fix that.
# Leftmost word as target, right neighbor as context:
context_counts[words[0]][words[1]] += 1
# Rightmost word as target, left neighbor as context:
context_counts[words[-1]][words[-2]] += 1

print("Context counts from the Green Eggs corpus:\n", list(context_counts.items()))

print("The same thing as a counts matrix")
colnames = sorted(unigram_counts.keys())
print("TARGET", end="\t")
for c in colnames:
    print(c, end=" ")
print()
for target in context_counts.conditions():
    print(target, end="\t")
    for c in colnames:
        print(context_counts[target][c], end=" ")
    print()

####################
# Context counts in a somewhat larger corpus
# (the Brown corpus ships with NLTK; it may need a one-time
#  nltk.download("brown"))
words = list(nltk.corpus.brown.words())
print("The first few trigrams from the Brown corpus:\n", list(nltk.trigrams(words))[:10])

brown_counts = nltk.ConditionalFreqDist()
for precontext, target, postcontext in nltk.trigrams(words):
    brown_counts[target][precontext] += 1
    brown_counts[target][postcontext] += 1
# Again, fixing the undercounts of the very first and very last word as targets:
brown_counts[words[0]][words[1]] += 1
brown_counts[words[-1]][words[-2]] += 1

# 10 most frequent context words: similar across many items
# (what can we do about that?)
print("10 most frequent contexts for some targets:")
print("election:\n", brown_counts["election"].most_common(10))
print("love:\n", brown_counts["love"].most_common(10))
print("car:\n", brown_counts["car"].most_common(10))

# 100 most frequent context words: now we are starting to see differences.
# We also see that many of the 100 most frequent context words only have
# counts of one.
print("100 most frequent contexts for some targets:")
print("election:\n", brown_counts["election"].most_common(100))
print("love:\n", brown_counts["love"].most_common(100))
print("car:\n", brown_counts["car"].most_common(100))

# Some ambiguous words
print("Some ambiguous words:")
print("bat:\n", brown_counts["bat"].most_common(100))
print("bank:\n", brown_counts["bank"].most_common(100))
print("bar:\n", brown_counts["bar"].most_common(100))
print("leave:\n", brown_counts["leave"].most_common(100))
```
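So far the context window has been one word on each side, by way of trigrams. As the Step 2 comment notes, the same idea works with 5-grams. Here is a minimal sketch of a wider window using `nltk.ngrams`; `count_contexts` is a helper name introduced here for illustration, and edge positions are again undercounted (a full version would pad or special-case the first and last few words):

```python
# A sketch of a wider context window via n-grams: windowsize words
# on either side of the middle word. Edge words are undercounted,
# as with the trigram version above.
import nltk

def count_contexts(words, windowsize=2):
    counts = nltk.ConditionalFreqDist()
    n = 2 * windowsize + 1
    for gram in nltk.ngrams(words, n):
        target = gram[windowsize]
        for i, context in enumerate(gram):
            if i != windowsize:
                counts[target][context] += 1
    return counts

wide_counts = count_contexts(list(nltk.corpus.brown.words()), windowsize=2)
print("election, window of 2:\n", wide_counts["election"].most_common(10))
```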
brown_counts["bar"].most_common(100))print("leave:\n", brown_counts["leave"].most_common(100))############### Changing the weights to reflect degree of association# rather than raw counts:# down-weight words like "of", "the", up-weight words like# "romantic" as context for "love"# pointwise mutual information (PMI):#                    P(t, c)# PMI(t, c) = log --------------#                   P(t) P(c)##    #(t, c): the co-occurrence count of t with c#    #(_, _): the sum of counts in the whole table, across all targets#    #(t, _): the sum of counts in the row of target t#    #(_, c): the sum of counts in the column of context item c## then# P(t, c) = #(t, c) / #(_, _)# P(t) = #(t, _) / #(_, _)# P(c) = #(_, c) / #(_, _)## PPMI(t, c) = { PMI(t, c) if PMI(t, c) >= 0#                0, else#import math# #(t, _)count_t = { }for target in brown_counts.conditions():    count_t[ target ] = brown_counts[target].N()# #(_, _)count_all = sum(count_t.values())# #(_, c)count_c = { }for target in brown_counts.conditions():    for context, count in brown_counts[target].items():        count_c[ context ] = count_c.get(context, 0) + 1brown_ppmi = { }for target in brown_counts.conditions():    for context in brown_counts[target].keys():        p_t_c = brown_counts[target][context] / count_all        p_t = count_t.get(target, 0) / count_all        p_c = count_c.get(context, 0) / count_all        pmi = math.log( p_t_c / (p_t * p_c))        if pmi >= 0:            brown_ppmi[ (target, context) ] = pmi        else:            brown_ppmi[ (target, context) ] = 0.0# checking on 'love'love_weights = [ ]for target, context in brown_ppmi.keys():    if target == "love":        love_weights.append( (brown_ppmi[ (target, context) ], context))# sort contexts of "love" by ppmiprint("Contexts of 'love', by PPMI weight")for weight, context in sorted(love_weights, reverse = True):    print(context, weight)#############################3# Instead of having words as context, have documents as contextdocuments = [ "I am Sam", "Sam I am", "I do not like green eggs and ham"]document_counts = nltk.ConditionalFreqDist()for documentindex, document in enumerate(documents):    for word in document.split():        document_counts[word][documentindex] += 1print("Document counts from the Green Eggs multi-document corpus:\n", list(document_counts.items()))print("The same as a term/document matrix")colnames = ["doc0", "doc1", "doc2"]print("TARGET", end = "\t")for c in colnames: print(c, end=" ")print()for target in document_counts.conditions():    print(target, end = "\t")    for docindex, doclabel in enumerate(colnames):        print(document_counts[target][docindex], end = " ")    print()##################3# gensim can do document/term matrices for youimport gensim# running gensim's preprocessingtokenized_documents = [gensim.utils.simple_preprocess(doc) for doc in documents]print("This is what gensim makes of our documents:\n", tokenized_documents)# make an object that can index words in the documentsgensim_dict = gensim.corpora.Dictionary()# and do the countinggensim_counts = [gensim_dict.doc2bow(doc, allow_update=True) for doc in tokenized_documents]# what did we get?print("Here are the counts that gensim got for the Green Eggs multi-document corpus:\n", gensim_counts)# er... 
```python
#############################
# Instead of having words as context, have documents as context
documents = ["I am Sam", "Sam I am", "I do not like green eggs and ham"]

document_counts = nltk.ConditionalFreqDist()
for documentindex, document in enumerate(documents):
    for word in document.split():
        document_counts[word][documentindex] += 1

print("Document counts from the Green Eggs multi-document corpus:\n",
      list(document_counts.items()))

print("The same as a term/document matrix")
colnames = ["doc0", "doc1", "doc2"]
print("TARGET", end="\t")
for c in colnames:
    print(c, end=" ")
print()
for target in document_counts.conditions():
    print(target, end="\t")
    for docindex, doclabel in enumerate(colnames):
        print(document_counts[target][docindex], end=" ")
    print()

##################
# gensim can do document/term matrices for you
import gensim

# Running gensim's preprocessing:
tokenized_documents = [gensim.utils.simple_preprocess(doc) for doc in documents]
print("This is what gensim makes of our documents:\n", tokenized_documents)

# Make an object that can index words in the documents...
gensim_dict = gensim.corpora.Dictionary()
# ...and do the counting: each document becomes a list of
# (word ID, count) pairs, a "bag of words"
gensim_counts = [gensim_dict.doc2bow(doc, allow_update=True)
                 for doc in tokenized_documents]

# What did we get?
print("Here are the counts that gensim got for the Green Eggs multi-document corpus:\n",
      gensim_counts)

# Er... what does that mean? Use the dictionary to map word IDs back to words:
print("The whole thing more readably:")
for docindex, doc_counts in enumerate(gensim_counts):
    print("document", docindex)
    for wordid, count in doc_counts:
        print("\t", gensim_dict[wordid], count)

###################
# TF/IDF in gensim
# Making a TF/IDF model. This does not yet create the space!
tfidf_model = gensim.models.TfidfModel(gensim_counts)
# Applying the model to the counts creates the space:
tfidf_space = tfidf_model[gensim_counts]
for docindex, doc_weights in enumerate(tfidf_space):
    print("document", docindex)
    for wordid, weight in doc_weights:
        print("\t", gensim_dict[wordid], weight)

# This does not work well with our Green Eggs corpus.
# Let's use a different one,
# where words differ in how many documents they occur in,
# and where they differ in how often they appear in a document.
animalcorpus = ["hippo armadillo badger", "badger elephant", "armadillo mouse mouse"]

tokenized_animals = [gensim.utils.simple_preprocess(doc) for doc in animalcorpus]
gensim_dict_animals = gensim.corpora.Dictionary()
gensim_counts_animals = [gensim_dict_animals.doc2bow(doc, allow_update=True)
                         for doc in tokenized_animals]
tfidf_model_animals = gensim.models.TfidfModel(gensim_counts_animals)
tfidf_space_animals = tfidf_model_animals[gensim_counts_animals]

print("raw counts")
for docindex, doc_counts in enumerate(gensim_counts_animals):
    print("document", docindex)
    for wordid, count in doc_counts:
        print("\t", gensim_dict_animals[wordid], count)

print("tf/idf weights")
for docindex, doc_weights in enumerate(tfidf_space_animals):
    print("document", docindex)
    for wordid, weight in doc_weights:
        print("\t", gensim_dict_animals[wordid], weight)
```
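What weights is gensim computing here? Assuming gensim's defaults (raw term frequency, idf = log base 2 of the number of documents over the document frequency, and L2 normalization of each document vector), we can re-compute the animal-corpus weights by hand. This is a sketch for checking our understanding, not a replacement for `TfidfModel`; if those defaults hold, the numbers should match gensim's output above:

```python
# A minimal re-computation of what we take to be gensim's default
# TF/IDF weighting: weight = tf * log2(n_docs / df), then each
# document vector is L2-normalized. Zero-weight entries (words in
# every document) are dropped, as gensim drops them too.
import math

# split() matches simple_preprocess here because all tokens are
# already lowercase words
docs = [doc.split() for doc in animalcorpus]
n_docs = len(docs)
vocab = sorted(set(word for doc in docs for word in doc))
# document frequency: in how many documents does each word occur?
df = {word: sum(1 for doc in docs if word in doc) for word in vocab}

for docindex, doc in enumerate(docs):
    weights = {}
    for word in set(doc):
        tf = doc.count(word)
        idf = math.log(n_docs / df[word], 2)
        if idf != 0:
            weights[word] = tf * idf
    norm = math.sqrt(sum(w * w for w in weights.values()))
    print("document", docindex)
    for word, w in sorted(weights.items()):
        print("\t", word, w / norm)
```

This also explains why TF/IDF did nothing interesting for the Green Eggs corpus: words like "I" occur in every document, so their idf is log2(3/3) = 0 and they drop out entirely.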