# Demo: conditional probability

# Word counts in the Brown corpus
# Demo for conditional probability
import nltk
from nltk.corpus import brown

# Fetch the Brown word sequence once; every statistic in this section
# reads from this same object instead of requesting it again.
brown_words = brown.words()

# number of words (tokens) in the Brown corpus
brown_numwords = len(brown_words)

# number of word pairs in the Brown corpus
# word pairs = "bigrams": each word together with the word that follows it.
# The pairs are counted as they stream past, so no list holding every
# bigram has to be built just to take its length.
brown_num_bigrams = sum(1 for _ in nltk.bigrams(brown_words))

# quick look at typical word pairs: via pre-given collocations function
# (call brown_nltk.collocations() to display them)
brown_nltk = nltk.Text(brown_words)

# Next, zoom in on a single word pair:
# having just read "years", what is the chance that the
# following word is "ago"?

# Step one: how often does each of the two words occur by itself?
count_years, count_ago = (
    brown.words().count(target) for target in ("years", "ago")
)

# The relative frequency of "years" serves as its probability estimate
prob_years = count_years / brown_numwords

# Now we determine the frequency of the bigram "years ago":
# count the adjacent word pairs whose first word is "years" and whose
# second word is "ago".
count_years_ago = sum(
    1
    for first, second in nltk.bigrams(brown.words())
    if first == "years" and second == "ago"
)

# Again, we estimate the probability as relative frequency
prob_years_ago = count_years_ago / brown_num_bigrams

# Probability of "ago" given "years": P("years ago") / P("years").
# The value is bound to a name and printed so it stays visible when this
# file is run as a script, where a bare expression's result is discarded.
prob_ago_given_years = prob_years_ago / prob_years
print('P("ago" | "years") =', prob_ago_given_years)

# Cross-check: out of all occurrences of "years _" (bigrams whose first
# word is "years"), what fraction is "years ago"?
# This comes out almost identical to the ratio of relative frequencies
# computed from prob_years_ago and prob_years; the two can differ
# marginally because the number of bigrams is not exactly the number of
# words.
count_years_something = sum(
    1 for first, _ in nltk.bigrams(brown.words()) if first == "years"
)

# Bound to a name and printed so the result is not lost when this file is
# run as a script rather than typed into an interactive session.
prob_ago_after_years = count_years_ago / count_years_something
print('count("years ago") / count("years _") =', prob_ago_after_years)