Using Python dictionaries: mapping strings to lists

The following code collects, for each part-of-speech tag, the list of words that have been observed to occur with it.

# we obtain a list of word/tag pairs from the Brown corpus
# using news data only

import nltk
brown_news = nltk.corpus.brown.tagged_words(categories="news")

# Group the observed words by tag: every key of this dictionary is a
# part-of-speech tag, and its value is the list of words seen with that
# tag, in the order they were encountered (repeated words are kept).
tag_wordlist = {}

for word, tag in brown_news:
    # setdefault stores a fresh empty list the first time a tag shows up
    # and hands back the already-stored list on every later occurrence,
    # so we can append to the result directly
    tag_wordlist.setdefault(tag, []).append(word)

# which words did we see tagged as nouns ("NN")?
print(tag_wordlist["NN"])





The following code maps prepositions to the words that precede them:



# For the prepositions "in", "on", "up", we collect the words
# that precede them.
# Ideally, we would collect the verbs that form particle verbs
# with these prepositions, like "check in", "take on", "look up",
# but we assume we don't have part-of-speech tags available,
# so collecting preceding words is the next best thing.

import nltk
import sys
import string

prepositions = ["in", "on", "up"]

# we ask the user for a filename from which we can read text
print("Please enter a filename")
filename = input()

# we try to open the file,
# but are prepared for the case
# that the user may have mistyped
try:
    f = open(filename)
except OSError:
    print("sorry, could not open", filename)
    sys.exit(0)  # this leaves the program

# we have successfully opened the file, now we read it;
# the with-statement closes the file even if reading fails
with f:
    contents = f.read()

# split the text on whitespace, then lowercase each token and strip
# punctuation from both of its ends
words = [w.strip(string.punctuation).lower() for w in contents.split()]
bigrams = nltk.bigrams(words)

# maps each preposition to the list of words that directly preceded it,
# in the order they occurred in the text (repeated words are kept)
prepositions_preceding = {}

for w1, w2 in bigrams:
    # A token that consisted only of punctuation (for example "--") has
    # been stripped down to the empty string; that is not a word, so we
    # do not record it as a preceding word.
    if w2 in prepositions and w1:
        # store w1 as a word that preceded a preposition; setdefault
        # creates the list the first time we meet this preposition
        prepositions_preceding.setdefault(w2, []).append(w1)

for preposition, preceding in prepositions_preceding.items():
    print(preposition, preceding)


Comments