NLTK demo code

import nltk

####
# text to work with
bltext = """It was a dark and stormy night; the rain fell in torrents, except at
occasional intervals, when it was checked by a violent gust of wind
which swept up the streets (for it is in London that our scene lies),
rattling along the house-tops, and fiercely agitating the scanty flame
of the lamps that struggled against the darkness. Through one of the
obscurest quarters of London, and among haunts little loved by the
gentlemen of the police, a man, evidently of the lowest orders, was
wending his solitary way. He stopped twice or thrice at different shops
and houses of a description correspondent with the appearance of the
quartier in which they were situated, and tended inquiry for some
article or another which did not seem easily to be met with. All the
answers he received were couched in the negative; and as he turned from
each door he muttered to himself, in no very elegant phraseology, his
disappointment and discontent. At length, at one house, the landlord, a
sturdy butcher, after rendering the same reply the inquirer had hitherto
received, added, "But if this vill do as vell, Dummie, it is quite at
your sarvice!" Pausing reflectively for a moment, Dummie responded that
he thought the thing proffered might do as well; and thrusting it into
his ample pocket, he strode away with as rapid a motion as the wind and
the rain would allow. He soon came to a nest of low and dingy buildings,
at the entrance to which, in half-effaced characters, was written
"Thames Court." Halting at the most conspicuous of these buildings, an
inn or alehouse, through the half-closed windows of which blazed out in
ruddy comfort the beams of the hospitable hearth, he knocked hastily at
the door. He was admitted by a lady of a certain age, and endowed with a
comely rotundity of face and person."""

# split up into sentences
sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
blsentences = sent_detector.tokenize(bltext.strip())

print "Split into sentences."
print blsentences[0]
raw_input()

###
# split up each sentence into words, taking punctuation into account
blsent_words = [nltk.word_tokenize(sent) for sent in blsentences]

print "Split into words."
print blsent_words[0]
raw_input()

###
# add part-of-speech tags: a list of sentences, each of which is a list of word/tag pairs
blsent_tagged = [nltk.pos_tag(sent) for sent in blsent_words]

print "Added part-of-speech tags."
print blsent_tagged[0]
raw_input()

###
# add named-entity tags
# blsent_ne: list of sentences, each of which is a list of word/tag/NEtag triples
# (we do not go on using these below)
blsent_ne = [nltk.chunk.tree2conlltags(nltk.ne_chunk(s)) for s in blsent_tagged]

print "Added named-entity tags."
print blsent_ne[0]
raw_input()

###
# chunking
# IOB format
# code from the NLTK book
class UnigramChunker(nltk.ChunkParserI):
    def __init__(self, train_sents):
        train_data = [[(t,c) for w,t,c in nltk.chunk.tree2conlltags(sent)]
                      for sent in train_sents]
        self.tagger = nltk.UnigramTagger(train_data)
            
    def parse(self, sentence):
        return nltk.chunk.conlltags2tree(self.parse_iob(sentence))
   
    def parse_iob(self, sentence):
        pos_tags = [pos for (word,pos) in sentence]
        tagged_pos_tags = self.tagger.tag(pos_tags)
        chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
        conlltags = [(word, pos, chunktag) for ((word,pos),chunktag)
                     in zip(sentence, chunktags)]
        return conlltags

from nltk.corpus import conll2000
train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])
chunker = UnigramChunker(train_sents)

# blsent_chunked: triples (word, postag, chunktag)
print "Added chunk tags to word/tag pairs (version 1)"
blsent_chunked = [ chunker.parse_iob(s) for s in blsent_tagged ]
print blsent_chunked[0]
raw_input()

# or, second option for chunking: via regular expressions
grammar = r"""
  NP: {<DT|PP\$>?<JJ.*>*<NN.*>+}   # chunk determiner/possessive, adjectives and noun sequences
"""
chunker2 = nltk.RegexpParser(grammar)

# blsent_chunked2: triples (word, postag, chunktag)
blsent_chunked2= [nltk.chunk.tree2conlltags(chunker2.parse(s)) for s in blsent_tagged ]

print "Added chunk tags to word/tag pairs (version 2)"
print blsent_chunked2[0]
raw_input()

###
#lemmatization
lobj = nltk.stem.wordnet.WordNetLemmatizer()

# a list of sentences, each of which is a list of  word/tag/chunktag/lemma tuples
blsent_lemmatized = [ ]

for sent in blsent_chunked:
    onesentence = [ ]
   
    for word, postag, chunktag in sent:
        if postag.startswith("V"): shorttag = "v"
        elif postag.startswith("N"): shorttag = "n"
        elif postag.startswith("J"): shorttag = "a"
        else: shorttag = "n"
        onesentence.append( (word, postag, chunktag, lobj.lemmatize(word.lower(), shorttag)))
       
    blsent_lemmatized.append(onesentence)

print "Added lemmas."
print blsent_lemmatized[0]
raw_input()
   
###
# positive and negative sentiment tags
def read_sentiment_lexicon(filename):
    f = open(filename)
    words = [ ]
    for line in f:
        if not(line.startswith(";")) and not(line.strip() == ""):
            words.append(line.strip())
    f.close()
    return set(words)

poswords = read_sentiment_lexicon("/Users/katrinerk/Data/sentiment/Bing_Liu_opinion-lexicon-English/positive-words.txt")
negwords = read_sentiment_lexicon("/Users/katrinerk/Data/sentiment/Bing_Liu_opinion-lexicon-English/negative-words.txt")

blsent_sentiment = [ ]
for sent in blsent_lemmatized:
    onesentence = [ ]
   
    for word, postag, chunktag, lemma in sent:
        if lemma in poswords:
            onesentence.append( (word, postag, chunktag, lemma, "POS"))
        elif lemma in negwords:
            onesentence.append( (word, postag, chunktag, lemma, "NEG"))
        else:
            onesentence.append( (word, postag, chunktag, lemma, "O"))

    blsent_sentiment.append(onesentence)

print "Added pos/neg sentiment tags"
print blsent_sentiment[0]
raw_input()

#how many positive and negative sentiment words in the text?
num_pos = len([wtuple for sent in blsent_sentiment for wtuple in sent if wtuple[-1] == "POS"])
num_neg = len([wtuple for sent in blsent_sentiment for wtuple in sent if wtuple[-1] == "NEG"])
print "Count of positive sentiment words:", num_pos
print "Count of negative sentiment words:", num_neg



Comments