# NLTK demo code

import nltk


# Text to work with: the passage famous for "It was a dark and stormy
# night" (presumably the opening of Bulwer-Lytton's novel -- confirm if
# the provenance matters).
# NOTE: the blank lines inside the triple-quoted literal are part of the
# string itself; the sentence tokenizer below copes with them.
bltext = """It was a dark and stormy night; the rain fell in torrents, except at

occasional intervals, when it was checked by a violent gust of wind

which swept up the streets (for it is in London that our scene lies),

rattling along the house-tops, and fiercely agitating the scanty flame

of the lamps that struggled against the darkness. Through one of the

obscurest quarters of London, and among haunts little loved by the

gentlemen of the police, a man, evidently of the lowest orders, was

wending his solitary way. He stopped twice or thrice at different shops

and houses of a description correspondent with the appearance of the

quartier in which they were situated, and tended inquiry for some

article or another which did not seem easily to be met with. All the

answers he received were couched in the negative; and as he turned from

each door he muttered to himself, in no very elegant phraseology, his

disappointment and discontent. At length, at one house, the landlord, a

sturdy butcher, after rendering the same reply the inquirer had hitherto

received, added, "But if this vill do as vell, Dummie, it is quite at

your sarvice!" Pausing reflectively for a moment, Dummie responded that

he thought the thing proffered might do as well; and thrusting it into

his ample pocket, he strode away with as rapid a motion as the wind and

the rain would allow. He soon came to a nest of low and dingy buildings,

at the entrance to which, in half-effaced characters, was written

"Thames Court." Halting at the most conspicuous of these buildings, an

inn or alehouse, through the half-closed windows of which blazed out in

ruddy comfort the beams of the hospitable hearth, he knocked hastily at

the door. He was admitted by a lady of a certain age, and endowed with a

comely rotundity of face and person."""

# split up into sentences

sent_detector ='tokenizers/punkt/english.pickle')

blsentences = sent_detector.tokenize(bltext.strip())

print "Split into sentences."

print blsentences[0]



# split up each sentence into words, taking punctuation into account

blsent_words = [nltk.word_tokenize(sent) for sent in blsentences]

print "Split into words."

print blsent_words[0]



# add part-of-speech tags: a list of sentences, each of which is a list of word/tag pairs

blsent_tagged = [nltk.pos_tag(sent) for sent in blsent_words]

print "Added part-of-speech tags."

print blsent_tagged[0]



# add named-entity tags

# blsent_ne: list of sentences, each of which is a list of word/tag/NEtag triples

# (we do not go on using these below)

blsent_ne = [nltk.chunk.tree2conlltags(nltk.ne_chunk(s)) for s in blsent_tagged]

print "Added named-entity tags."

print blsent_ne[0]



# Chunking, IOB format.  Code from the NLTK book.
# BUG FIX: the pasted original had lost all indentation, which made the
# class definition invalid Python; the structure is restored here.
class UnigramChunker(nltk.ChunkParserI):
    """NP chunker that recasts chunking as a tagging problem.

    A unigram tagger is trained to predict an IOB chunk tag from a
    word's POS tag alone (the words themselves are ignored).
    """

    def __init__(self, train_sents):
        """Train on chunked sentences given as nltk Trees (e.g. conll2000)."""
        # Reduce each training sentence to (POS tag, chunk tag) pairs.
        train_data = [[(t, c) for w, t, c in nltk.chunk.tree2conlltags(sent)]
                      for sent in train_sents]
        self.tagger = nltk.UnigramTagger(train_data)

    def parse(self, sentence):
        """Chunk a POS-tagged sentence; return the result as an nltk Tree."""
        return nltk.chunk.conlltags2tree(self.parse_iob(sentence))

    def parse_iob(self, sentence):
        """Chunk a POS-tagged sentence; return (word, pos, chunktag) triples."""
        pos_tags = [pos for (word, pos) in sentence]
        tagged_pos_tags = self.tagger.tag(pos_tags)
        chunktags = [chunktag for (pos, chunktag) in tagged_pos_tags]
        # Re-attach the words to the predicted chunk tags.
        conlltags = [(word, pos, chunktag) for ((word, pos), chunktag)
                     in zip(sentence, chunktags)]
        return conlltags

from nltk.corpus import conll2000

train_sents = conll2000.chunked_sents('train.txt', chunk_types=['NP'])

chunker = UnigramChunker(train_sents)

# blsent_chunked: triples (word, postag, chunktag)

print "Added chunk tags to word/tag pairs (version 1)"

blsent_chunked = [ chunker.parse_iob(s) for s in blsent_tagged ]

print blsent_chunked[0]


# or, second option for chunking: via regular expressions

grammar = r"""

NP: {<DT|PP\$>?<JJ.*>*<NN.*>+} # chunk determiner/possessive, adjectives and noun sequences


chunker2 = nltk.RegexpParser(grammar)

# blsent_chunked2: triples (word, postag, chunktag)

blsent_chunked2= [nltk.chunk.tree2conlltags(chunker2.parse(s)) for s in blsent_tagged ]

print "Added chunk tags to word/tag pairs (version 2)"

print blsent_chunked2[0]




lobj = nltk.stem.wordnet.WordNetLemmatizer()

# a list of sentences, each of which is a list of word/tag/chunktag/lemma tuples

blsent_lemmatized = [ ]

for sent in blsent_chunked:

onesentence = [ ]

for word, postag, chunktag in sent:

if postag.startswith("V"): shorttag = "v"

elif postag.startswith("N"): shorttag = "n"

elif postag.startswith("J"): shorttag = "a"

else: shorttag = "n"

onesentence.append( (word, postag, chunktag, lobj.lemmatize(word.lower(), shorttag)))


print "Added lemmas."

print blsent_lemmatized[0]



# Positive and negative sentiment tags.
def read_sentiment_lexicon(filename):
    """Read a sentiment lexicon file and return its words as a set.

    Expects one word per line; lines starting with ";" are comments and
    blank lines are skipped (the Bing Liu opinion lexicon format).
    BUG FIX: the body of the if-statement was missing in the original,
    so no words were ever collected; also use a with-block so the file
    is closed reliably.
    """
    words = []
    with open(filename) as f:
        for line in f:
            if not line.startswith(";") and not line.strip() == "":
                words.append(line.strip())
    return set(words)

poswords = read_sentiment_lexicon("/Users/katrinerk/Data/sentiment/Bing_Liu_opinion-lexicon-English/positive-words.txt")

negwords = read_sentiment_lexicon("/Users/katrinerk/Data/sentiment/Bing_Liu_opinion-lexicon-English/negative-words.txt")

blsent_sentiment = [ ]

for sent in blsent_lemmatized:

onesentence = [ ]

for word, postag, chunktag, lemma in sent:

if lemma in poswords:

onesentence.append( (word, postag, chunktag, lemma, "POS"))

elif lemma in negwords:

onesentence.append( (word, postag, chunktag, lemma, "NEG"))


onesentence.append( (word, postag, chunktag, lemma, "O"))


print "Added pos/neg sentiment tags"

print blsent_sentiment[0]


#how many positive and negative sentiment words in the text?

num_pos = len([wtuple for sent in blsent_sentiment for wtuple in sent if wtuple[-1] == "POS"])

num_neg = len([wtuple for sent in blsent_sentiment for wtuple in sent if wtuple[-1] == "NEG"])

print "Count of positive sentiment words:", num_pos

print "Count of negative sentiment words:", num_neg