Topic modeling: a demo
library(tm)
library(topicmodels)
# Reading in the 20-newsgroup text:
# We have one big file, with one line per article
news.raw = scan("/Users/kee252/Teaching/2018/analyzing linguistic data/materials/reuters20newsgroups/r8-train-data.txt", what="character", sep='\n')
# making this into a "Corpus" object for tm
news.corpus = Corpus(VectorSource(news.raw))
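# A quick check: since we read one article per line, the corpus should
# contain one document per line of the input file.
length(news.corpus)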
# extracting counts for how often each term appears in each document
# This yields a DocumentTermMatrix
# We lowercase, remove punctuation, and remove stopwords.
news.dtm = DocumentTermMatrix(news.corpus, control = list(tolower = TRUE, removePunctuation = TRUE, stopwords = TRUE))
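# To get a feel for the resulting matrix, we can look at its dimensions
# (documents x vocabulary terms) and, for example, at terms that appear
# at least 100 times in the collection (the cutoff of 100 is arbitrary):
dim(news.dtm)
findFreqTerms(news.dtm, lowfreq = 100)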
# Computing a topic model:
# This learns topics as distributions over words
# such that words that tend to co-occur in the same documents
# will get high probabilities on the same topics.
# At the same time, topics are constructed in such a way
# that each document participates in only a few topics.
#
# This call makes 20 topics.
news.lda = LDA(news.dtm, k = 20, control = list(alpha = 0.1))
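# There is nothing special about k = 20; one rough way to compare models
# with different numbers of topics is the log-likelihood on the training
# data (higher is better, all else being equal):
logLik(news.lda)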
# Inspecting the topic model:
# The 20 highest-probability terms for each topic
terms(news.lda, 20)
# ... and, for each document, its highest-probability topics (at most five),
# keeping only topics with a probability of 0.1 or higher
topics(news.lda, 5, threshold = 0.1)
# Or you can get all the information:
# for each term, what is its probability under each topic,
# and for each document, what is the probability of each topic.
news.inf <- posterior(news.lda, news.dtm)
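# A quick sanity check on the shapes: both components are matrices, and
# each of their rows is a probability distribution, so it should sum to 1.
dim(news.inf$terms)    # k topics x vocabulary size
dim(news.inf$topics)   # number of documents x k topics
rowSums(news.inf$terms)[1:3]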
# news.inf$terms is a gigantic matrix
# with one column for each word in the vocabulary
# and one row for each of the k topics.
# Entry in row i and column j is the probability of term j under topic i.
# Here we see the probabilities of words 1-10 under topic 1:
news.inf$terms[1, 1:10]
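# The same information, sorted: the ten highest-probability words in topic 1.
# Up to ties, this should agree with the first column of terms(news.lda, 10).
head(sort(news.inf$terms[1, ], decreasing = TRUE), 10)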
# news.inf$topics is a matrix with a row for each document
# and a column for each topic. Entry in row i and column j is
# probability of topic j for document i.
# For example, here is how we get the probability of each of the
# 20 topics for document number 1:
news.inf$topics[1, 1:20]
# We see that almost all the probability mass is on topic 17:
# 1 2 3 4
# 0.0004160435 0.0004160435 0.0004160435 0.0004160435
# 5 6 7 8
# 0.0004160435 0.0004160435 0.0004160435 0.0004160435
# 9 10 11 12
# 0.0004160435 0.0004160435 0.0004160435 0.0004160435
# 13 14 15 16
# 0.0891427517 0.0004160435 0.0004160435 0.0004160435
# 17 18 19 20
# 0.9033684652 0.0004160435 0.0004160435 0.0004160435
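# Instead of reading the strongest topic off the printed numbers,
# we can ask for it directly:
which.max(news.inf$topics[1, ])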
##
# We also have information about the newsgroup from which
# each of the documents in our collection was taken:
news.meta = read.table("/Users/kee252/Teaching/2018/analyzing linguistic data/materials/reuters20newsgroups/r8-train-meta.txt")
# The first document comes from the "earn" newsgroup,
# the second document from the "acq" newsgroup, and so on.
head(news.meta)
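# How many documents each newsgroup contributes
# (the newsgroup label is in the first column of news.meta):
table(news.meta[, 1])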
# We obtain the single strongest topic for each of the five-thousand-odd
# documents in our collection:
strongest.topic = topics(news.lda, 1)
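# ... and, conversely, how many documents each topic is the strongest topic for:
table(strongest.topic)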
# for example, for document 1 the strongest topic is 17:
# > strongest.topic[1]
# 1
# 17
# we combine this with the meta-data to
# link each document's newsgroup to the strongest topic
# in that document
topic.newsgroup = data.frame(topic = strongest.topic, newsgroup = news.meta)
colnames(topic.newsgroup) = c("topic", "newsgroup")
# and look at how often each topic was the strongest topic
# for each of the newsgroups
xtabs( ~ newsgroup + topic, data = topic.newsgroup)
# This is easier to read as a table of percentages:
# what percentage of documents from this newsgroup
# had this topic as their strongest topic?
# In the table, we round the numbers to two decimal places.
round(prop.table(xtabs( ~ newsgroup + topic, data = topic.newsgroup), 1), 2)
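# One way to condense this further: for each newsgroup, pick the topic
# that is most often the strongest topic in its documents.
newsgroup.by.topic = prop.table(xtabs( ~ newsgroup + topic, data = topic.newsgroup), 1)
apply(newsgroup.by.topic, 1, function(row) names(which.max(row)))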