
Demo: a very simple distributional model

The function below builds a simple distributional word space: for each word in the corpus, it collects a frequency distribution over the words that occur within context_size positions to its left and right.

import nltk

def compute_space(context_size, corpus):
    space = nltk.ConditionalFreqDist()
    #
    for index in range(len(corpus)):
        # current word
        current = corpus[index]
        #
        # context before the current word: count each item,
        # but there is no preceding context for index 0
        if index > 0:
            # don't start from a cxword_index < 0 in case index < context_size
            for cxword_index in range(max(index - context_size, 0), index):
                cxword = corpus[cxword_index]
                # In a ConditionalFreqDist, if 'current' is not a condition yet,
                # then accessing it creates a new empty FreqDist for 'current'.
                # Calling the FreqDist method update() with a one-element list
                # increments the count for that item by one.
                space[current].update([cxword])
        #
        # context after the current word: count each item,
        # but there is no succeeding context for the last item
        # (index == len(corpus) - 1)
        if index < len(corpus) - 1:
            # don't run up to a cxword_index >= len(corpus) in case
            # index + context_size + 1 > len(corpus)
            for cxword_index in range(index + 1, min(index + context_size + 1, len(corpus))):
                cxword = corpus[cxword_index]
                # same as above: update() adds one to the count for cxword
                space[current].update([cxword])
    #
    return space
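
# A quick sanity check on a toy corpus (an illustration, not part of the
# original demo): with context_size 1, each occurrence of a word
# contributes its immediate left and right neighbors.
toy_corpus = ["the", "cat", "sat", "on", "the", "mat"]
toy_space = compute_space(1, toy_corpus)
# "the" occurs at indices 0 and 4: its contexts are "cat" (after index 0),
# "on" (before index 4), and "mat" (after index 4), so this prints
# [('cat', 1), ('on', 1), ('mat', 1)] (ties in some order)
print(toy_space["the"].most_common())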

###############

print( "reading Brown corpus...")
brown_words = list(nltk.corpus.brown.words())

print( "computing space...")

sp = compute_space(2, brown_words)

# 10 most frequent context words: similar across many items
# (what can we do about that? one answer is sketched at the end of the demo)
print("election:\n", sp["election"].most_common(10))
print("love:\n", sp["love"].most_common(10))
print("car:", sp["car"].most_common(10))

# 100 most frequent context words: now we are starting to see differences
print("election:\n", sp["election"].most_common(100))
print("love:\n", sp["love"].most_common(100))
print("car:\n", sp["car"].most_common(100))

# some ambiguous words
print("bat:\n", sp["bat"].most_common(100))
print("bank:\n", sp["bank"].most_common(100))
print("bar:\n", sp["bar"].most_common(100))
print("leave:\n", sp["leave"].most_common(100))

