Courses > Python worksheets

Topic modeling example

# demo of topic modeling with LDA
# We first use a tiny document collection that comes with gensim
import gensim
from gensim.test.utils import common_texts

# Show the sample documents (gensim's bundled common_texts) before any
# processing: header on one line, the corpus on the next.
print("This is the tiny corpus we use:", common_texts, sep="\n")

# Transformation into Gensim's internal format

# Build the mapping from words to integer IDs over all documents.
gensim_dictionary = gensim.corpora.Dictionary(common_texts)
# Let gensim convert every document with the dictionary's doc2bow().
# The results are collected in the same order as common_texts, so
# gensim_corpus[i] is the converted form of common_texts[i].
gensim_corpus = [
    gensim_dictionary.doc2bow(document) for document in common_texts
]

# Now comes the topic modeling: fit an LDA model with 3 topics on the
# converted corpus, using the dictionary to map IDs back to words.
# NOTE(review): no random_state is passed, so the topics are presumably
# not reproducible from run to run -- confirm against the gensim docs.
ldamodel = gensim.models.ldamodel.LdaModel(
    corpus=gensim_corpus,
    id2word=gensim_dictionary,
    num_topics=3,
)


# Let's inspect the topics
print("\n\nThese are the topics that LDA found:")
# Request 8 words per topic, then print one topic per line.
topic_summaries = ldamodel.print_topics(num_words=8)
for topic_summary in topic_summaries:
    print(topic_summary)

# We can also see the degree to which each document
# draws on each topic.
# common_texts and gensim_corpus are index-aligned (the corpus was built
# by converting the documents in order), so walk them in lockstep.
for document, bag_of_words in zip(common_texts, gensim_corpus):
    print("Document:", document, sep="\n")
    print("Topics:", end=" ")
    topic_mixture = ldamodel.get_document_topics(bag_of_words)
    # Each entry unpacks into a topic number and its probability.
    for topic_id, weight in topic_mixture:
        print(topic_id, ":", weight, end=", ")
    # End the "Topics:" line and leave a blank line between documents.
    print("\n")


Comments