"""Demonstrate word n-grams and conditional probabilities on the Brown corpus.

Here is a small Python program that demonstrates word n-grams and
conditional probabilities. To run it, you need Python 3 and the Natural
Language Toolkit (NLTK). Please make sure also to download the NLTK data.

Several lines below are bare expressions (for example ``cfd["The"]``).
They only display a value when entered in an interactive session; when the
file is run as a script they are evaluated and discarded.
"""

import nltk
from nltk.corpus import brown

# quick look at typical word pairs: via pre-given collocations function
brown_nltk = nltk.Text(brown.words())
brown_nltk.collocations()

# bigrams in the Brown corpus:
# fd is a data structure that tabulates frequencies of strings.
# in this case, frequencies of word bigrams from Brown
brown_bigrams = [a + " " + b for a, b in nltk.bigrams(brown.words())]
fd = nltk.FreqDist(brown_bigrams)

# frequent stuff
fd.tabulate(10)

# infrequent stuff
for h in fd.hapaxes():
    print(h)

# P(word2 | word1) = frequency of "word1 word2" / frequency of "word1 SOMETHING"
# out of all times we have seen bigrams starting in word1,
# what percentage was word2?

# cfd is a data structure that tabulates the frequencies of pairs:
# In our case, it maps words word1 to words word2 that appeared after them,
# and records how often each word2 was seen to follow word1
cfd = nltk.ConditionalFreqDist(nltk.bigrams(brown.words()))

# this is a data structure that tabulates frequencies of words that
# followed "The".
# Note that the words word2 are ordered by frequency:
cfd["The"]

# overall, we have seen "The" 7258 times
cfd["The"].N()

# ... and we have seen "The first" 96 times.
cfd["The"]["first"]

# The probability P(first | The) is 96 / 7258
cfd["The"]["first"] / cfd["The"].N()

# Let's type a text by starting at "The" and then
# always using the most frequent word that could follow.
# You may have done this on your phone.
# But your phone is certainly not trained on the Brown corpus.
cfd["The"].max()
# ...

# or, for short, like this:
word = "The"
for _ in range(20):
    print(word)
    # greedy choice: always continue with the most frequent follower
    word = cfd[word].max()
# whoops, this got us a never-ending sentence