library(tm)
library(topicmodels)

# Reading in the 20-newsgroup text:
# We have one big file, with one line per article.
news.raw = scan("/Users/kee252/Teaching/2018/analyzing linguistic data/materials/reuters20newsgroups/r8-train-data.txt",
                what = "character", sep = '\n')

# Making this into a "Corpus" object for tm.
news.corpus = Corpus(VectorSource(news.raw))

# Extracting counts for how often each term appears in each document.
# This yields a DocumentTermMatrix.
# We lowercase, remove punctuation, and remove stopwords.
news.dtm = DocumentTermMatrix(news.corpus,
                              control = list(tolower = TRUE,
                                             removePunctuation = TRUE,
                                             stopwords = TRUE))

# Computing a topic model:
# This learns topics as distributions over words,
# such that words that tend to co-occur in the same documents
# will get high probabilities on the same topics.
# At the same time, topics are constructed in such a way
# that each document only participates in a few topics.
#
# This call makes 20 topics.
news.lda = LDA(news.dtm, k = 20, control = list(alpha = 0.1))

# Inspecting the topic model:
# The 20 highest-probability terms for each topic...
terms(news.lda, 20)

# ... and the five highest-probability topics for each document,
# as long as each of them has a probability of 0.1 or higher.
topics(news.lda, 5, threshold = 0.1)

# Or you can get all the information:
# for each term, what is its probability under each topic,
# and for each document, what is the probability of each topic.
news.inf <- posterior(news.lda, news.dtm)

# news.inf$terms is a gigantic matrix
# with one column for each word in the vocabulary
# and one row for each of the k topics.
# The entry in row i and column j is the probability of term j under topic i.
# Here we see the probabilities of words 1-10 under topic 1:
news.inf$terms[1, 1:10]

# news.inf$topics is a matrix with a row for each document
# and a column for each topic. The entry in row i and column j is
# the probability of topic j for document i.
# For example, here is how we get the probability of each of the
# 20 topics for document number 1:
news.inf$topics[1, 1:20]

# We see that almost all the probability mass is on topic 17:
#            1            2            3            4
# 0.0004160435 0.0004160435 0.0004160435 0.0004160435
#            5            6            7            8
# 0.0004160435 0.0004160435 0.0004160435 0.0004160435
#            9           10           11           12
# 0.0004160435 0.0004160435 0.0004160435 0.0004160435
#           13           14           15           16
# 0.0891427517 0.0004160435 0.0004160435 0.0004160435
#           17           18           19           20
# 0.9033684652 0.0004160435 0.0004160435 0.0004160435
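# An optional sketch, not part of the original worksheet: with the posterior
# matrices computed above we can look at topic 17 directly, sorting its row of
# news.inf$terms to list its ten most probable words, and draw a barplot of
# document 1's topic distribution. Only base R functions and objects defined
# above are used.
sort(news.inf$terms[17, ], decreasing = TRUE)[1:10]
barplot(news.inf$topics[1, ], xlab = "topic", ylab = "probability")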
##
# We also have information about the newsgroup from which
# each of the documents in our collection was taken:
news.meta = read.table("/Users/kee252/Teaching/2018/analyzing linguistic data/materials/reuters20newsgroups/r8-train-meta.txt")

# The first document comes from the "earn" newsgroup,
# the second document from the "acq" newsgroup, and so on.
head(news.meta)

# We obtain the strongest topic for each of the five thousand and some
# documents in our collection:
strongest.topic = topics(news.lda, 1)

# For example, for document 1 the strongest topic is 17:
# > strongest.topic[1]
#  1
# 17

# We combine this with the meta-data to
# link each document's newsgroup to the strongest topic
# in that document.
topic.newsgroup = data.frame(topic = strongest.topic, newsgroup = news.meta)
colnames(topic.newsgroup) = c("topic", "newsgroup")

# ... and look at how often each topic was the strongest topic
# for each of the newsgroups.
xtabs( ~ newsgroup + topic, data = topic.newsgroup)

# This is easier to read as a table of percentages:
# what percentage of documents from this newsgroup
# had this topic as their strongest topic?
# In the table, we round numbers to two decimal places.
round(prop.table(xtabs( ~ newsgroup + topic, data = topic.newsgroup), 1), 2)
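# An optional follow-up, not from the original worksheet: from the
# newsgroup-by-topic count table we can read off, for each newsgroup, which
# topic is most often its strongest topic. which.max() returns a column
# position, so we map it back to the topic label through colnames().
# The names newsgroup.by.topic and best.column are introduced here only for
# this illustration.
newsgroup.by.topic = xtabs( ~ newsgroup + topic, data = topic.newsgroup)
best.column = apply(newsgroup.by.topic, 1, which.max)
data.frame(newsgroup = rownames(newsgroup.by.topic),
           top.topic = colnames(newsgroup.by.topic)[best.column])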