# Topic modeling: a demo

library(tm)

library(topicmodels)

# Reading in the 20-newsgroup text:

# We have one big file, with one line per article

news.raw = scan("/Users/kee252/Teaching/2018/analyzing linguistic data/materials/reuters20newsgroups/r8-train-data.txt", what="character", sep='\n')
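
# A quick sanity check (optional): how many articles did we read in,

# and what does the beginning of the first one look like?

length(news.raw)

substr(news.raw[1], 1, 80)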

# making this into a "Corpus" object for tm

news.corpus = Corpus(VectorSource(news.raw))

# extracting counts for how often each term appears in each document

# This yields a DocumentTermMatrix

# We lowercase, remove punctuation, and remove stopwords.

news.dtm = DocumentTermMatrix(news.corpus, control = list(tolower = TRUE, removePunctuation = TRUE, stopwords = TRUE))
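
# A quick look at the resulting matrix (optional check):

# the number of documents and terms, and a small corner of the counts.

dim(news.dtm)

inspect(news.dtm[1:5, 1:5])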

# Computing a topic model:

# This learns topics as distributions over words

# such that words that tend to co-occur in the same documents

# will get high probabilities on the same topics.

# At the same time, topics are constructed in such a way

# that each document only participates in a few topics.

#

# This call makes 20 topics.

news.lda = LDA(news.dtm, k = 20, control = list(alpha = 0.1))
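
# Note: LDA starts from a random initialization, so the topic numbering

# will differ from run to run. For a reproducible run, you can also pass

# a seed in the control list (the seed value here is just an example):

# news.lda = LDA(news.dtm, k = 20, control = list(alpha = 0.1, seed = 123))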

# Inspecting the topic model:

# The 20 highest-probability terms for each topic

terms(news.lda, 20)

# ... and the five highest-probability topics for each document

# as long as each of them has a probability of 0.1 or higher

topics(news.lda, 5, threshold = 0.1)

# Or you can get all the information:

# for each term, what is its probability under each topic,

# and for each document, what is the probability of each topic.

news.inf <- posterior(news.lda, news.dtm)

# news.inf$terms is a gigantic matrix

# with one column for each word in the vocabulary

# and one row for each of the k topics.

# Entry in row i and column j is the probability of term j under topic i.

# Here we see the probabilities of words 1-10 under topic 1:

news.inf$terms[1, 1:10]
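
# Since each row of news.inf$terms is a probability distribution over

# the vocabulary, we can also sort a row to see the most probable words

# for a topic, e.g. the ten most probable words for topic 1:

sort(news.inf$terms[1, ], decreasing = TRUE)[1:10]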

# news.inf$topics is a matrix with a row for each document

# and a column for each topic. Entry in row i and column j is

# probability of topic j for document i.

# For example, here is how we get the probability of each of the

# 20 topics for document number 1:

news.inf$topics[1,1:20]

# We see that almost all the probability mass is on topic 17:

#            1            2            3            4

# 0.0004160435 0.0004160435 0.0004160435 0.0004160435

#            5            6            7            8

# 0.0004160435 0.0004160435 0.0004160435 0.0004160435

#            9           10           11           12

# 0.0004160435 0.0004160435 0.0004160435 0.0004160435

#           13           14           15           16

# 0.0891427517 0.0004160435 0.0004160435 0.0004160435

#           17           18           19           20

# 0.9033684652 0.0004160435 0.0004160435 0.0004160435
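
# We can also read off the strongest topic for document 1 directly;

# given the output above, this should be topic 17:

which.max(news.inf$topics[1, ])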

##

# We also have information about the newsgroup from which

# each of the documents in our collection was taken:

news.meta = read.table("/Users/kee252/Teaching/2018/analyzing linguistic data/materials/reuters20newsgroups/r8-train-meta.txt")

# The first document comes from the "earn" newsgroup,

# the second document from the "acq" newsgroup, and so on.

head(news.meta)
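
# As a quick check (assuming the metadata file has a single column of

# newsgroup labels), we can count how many documents come from each newsgroup:

table(news.meta[, 1])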

# we obtain the strongest topic for each of the five-thousand-odd

# documents in our collection:

strongest.topic = topics(news.lda, 1)

# for example, for document 1 the strongest topic is 17:

# > strongest.topic[1]

#  1

# 17
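
# As an overview, we can also count how many documents have each topic

# as their strongest topic:

table(strongest.topic)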

# we combine this with the meta-data to

# link each document's newsgroup to the strongest topic

# in that document

topic.newsgroup = data.frame(topic = strongest.topic, newsgroup = news.meta)

colnames(topic.newsgroup) = c("topic", "newsgroup")

# and look how often each topic was the strongest topic

# for each of the newsgroups

xtabs( ~ newsgroup + topic, data = topic.newsgroup)

# This is easier to read as a table of percentages:

# what percentage of documents from this newsgroup

# had this topic as their strongest topic?

# In the table, we round numbers to two decimal places

round(prop.table(xtabs( ~ newsgroup + topic, data = topic.newsgroup), 1), 2)
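
# To zoom in on a single newsgroup, we can save that table and sort one

# of its rows, e.g. for the "earn" newsgroup mentioned above:

newsgroup.by.topic = round(prop.table(xtabs( ~ newsgroup + topic, data = topic.newsgroup), 1), 2)

sort(newsgroup.by.topic["earn", ], decreasing = TRUE)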