# Courses > Python worksheets
#
# Demo: generative models

#
##
# Generating data from a mixture of
# Gaussians


import numpy as np

# numpy can generate random numbers
# from a 2-dimensional Gaussian.
# we choose three different ones.

# Three 2-D Gaussians, each given as (mean, covariance matrix).
components = [
    ((1, 1), ((0.1, 0), (0, 0.1))),
    ((5, 2), ((0.5, 0), (0, 0.4))),
    ((1, 4), ((1, 0), (0, 0.1))),
]

# number of points drawn from each component
numpoints = 20

# draw numpoints samples from each Gaussian, in order,
# and stack them into one (3*numpoints, 2) array
samples = [np.random.multivariate_normal(mean, cov, numpoints)
           for mean, cov in components]
all_points = np.concatenate(samples)

# split into x and y coordinates for plotting
x = all_points[:, 0]
y = all_points[:, 1]

# Scatter-plot the samples: first Gaussian in green,
# second in red, third in blue.
import matplotlib.pyplot as plt

colors = []
for colorname in ("green", "red", "blue"):
    colors.extend([colorname] * numpoints)
plt.scatter(x, y, c=colors)
plt.show()

# Now the same points in a single color: could you still guess
# which points came from the same underlying Gaussian?
plt.scatter(x, y)
plt.show()

# The way to figure this out is to
# reason back to the *most likely* Gaussians
# to have generated the data.
# For a demo, see https://lukapopijac.github.io/gaussian-mixture-model/

#########################
# Now let's do distributions over words instead
# of a distribution over two-dimensional points.

# first word list: kid birthday
wordgroup1 = ["balloon", "happy", "run", "merry", "cake", "jump", "pizza"]

# a categorial distribution describes what happens when you roll
# a die once: how likely is each side to come up?
# if the die is fair, each side is equally likely.
print("for a fair die with 6 sides, the probability of each side is", 1/6)
print("and the list of probabilities of all sides is", [1/6] * 6)

# (fixed typo: "a fair a die" -> "a fair die")
print("If we could build a fair die with 7 sides, then")
print("the list of probabilities of all sides would be", [1/7] * 7)

# So if we had a die with the words of wordgroup1 written on the sides,
# the probabilities would be

prob1 = [1 / len(wordgroup1)] * len(wordgroup1)

# For simplicity, we make all words in wordgroup1
# equally likely. We could also give different
# weights to different words.


# Rolling a die once in numpy: there is no direct "categorial draw",
# so we use a multinomial distribution — which says what happens when
# you roll a die n times — and set n = 1.

onedraw = np.random.multinomial(1, prob1, 1)
print("Rolling a fair 7-sided die once, we got", onedraw)

# onedraw is a list of outcome counts:
# how often side 1 came up, how often side 2 came up, and so on.
# Since we rolled only once, it is all 0's with a single 1.
# argmax() finds the position of that 1.
# Fix: compute the argmax once and reuse it
# (the original called np.argmax(onedraw) twice).
index = np.argmax(onedraw)

print("The index we rolled is ", index)

# index is the position of the word we rolled;
# look it up in the word list to get the actual word.
print("And the word on the top side of the die was:", wordgroup1[index])


# One draw from a fair die with num_faces sides:
# returns the index of the side that came up (0-based).
def one_categorial_draw(num_faces):
    # every face is equally likely
    uniform = [1 / num_faces for _ in range(num_faces)]
    # a multinomial with n=1 is a single categorial draw
    rolled = np.random.multinomial(1, uniform, 1)
    return np.argmax(rolled)

# roll the die once more and look up the corresponding word
second_roll = one_categorial_draw(len(wordgroup1))
print("Rolling the die again, we got:", wordgroup1[second_roll])

# Roll a fair num_faces-sided die num_draws times and return the
# list of rolled indices — e.g. to generate a whole "document"
# about a kids' birthday party.
def n_categorial_draws(num_faces, num_draws):
    uniform = [1 / num_faces for _ in range(num_faces)]
    rolls = np.random.multinomial(1, uniform, num_draws)
    # each row of rolls is all 0's with a single 1; argmax finds it
    return list(map(np.argmax, rolls))

document1_indices = n_categorial_draws(len(wordgroup1), 20)
print("The word indices we got are", document1_indices)

# turn the indices back into words and print the document
print("here is document 1:")
print(" ".join(wordgroup1[i] for i in document1_indices), end=" ")


# second word group: Lord of the Rings
wordgroup2 = ["spider", "ring", "drums", "orc",
              "battle", "merry", "run", "frodo"]

# generate a Lord of the Rings document the same way
document2_indices = n_categorial_draws(len(wordgroup2), 20)
print("The word indices we got are", document2_indices)

# turn the indices back into words and print the document
print("here is document 2:")
print(" ".join(wordgroup2[i] for i in document2_indices), end=" ")

# A mixture of the two "topics": again draw 20 words, but for each
# word first decide probabilistically which word group to draw from,
# then draw the word itself.

for _ in range(20):
    # sample the topic, 0 or 1 — a categorial draw with two faces
    # is just a coin flip
    which_topic = one_categorial_draw(2)

    # draw one word from the word list of the chosen topic
    topic_words = wordgroup1 if which_topic == 0 else wordgroup2
    wordindex = one_categorial_draw(len(topic_words))
    print(topic_words[wordindex], end=" ")


# So, now we can ask the same question as above: If you saw only the
# 'document' with the mix of words,
# how would you be able to guess that it came from a mixture of a
# kids' birthday topic and a Lord of the Rings topic?
# The method is the same as above: We assume we know the process
# that was used to generate the data (for each word, first randomly choose a topic,
# then randomly choose a word from that topic), now we just need to find
# the topics and probabilities that will make the data most likely.

# Comments