# Demo: Document analysis

###
# making a term-document matrix by hand:
import numpy

def testme_termdoc_space():
    documents = ["Human machine interface for lab abc computer applications",
                "A survey of user opinion of computer system response time",
                "The EPS user interface management system",
                "System and human system engineering testing of EPS",
                "Relation of user perceived response time to error measurement",
                "The generation of random binary unordered trees",
                "The intersection graph of paths in trees",
                "Graph minors IV Widths of trees and well quasi ordering",
                "Graph minors A survey"]

   

    # getting a list of all the words we have.
    # We start out with a set, so no word gets included twice
    allwords_set = set()
    for doc in documents:
        for word in doc.split():
            allwords_set.add(word)
       
    # and now we make our words list
    allwords = list(allwords_set)
    alldocs = range(len(documents))

    # making the space

    # wordspace: a mapping from relevant_words to an array of integers (raw counts)
    wordspace = { }
    # fill the space with all zeros.
    for word in allwords:
        wordspace[ word ] = numpy.zeros(len(alldocs), dtype = int)

    # now we count how often each target word
    # appears in each document
    for docindex, doc in enumerate(documents):
        for word in doc.split():
            wordspace[word][docindex] += 1
           
    # "and" occurs in documents 3 and 7
    print("document counts for 'and':", wordspace["and"])
    # "computer" occurs in documents 0 and 1
    print("document counts for 'computer':", wordspace["computer"])
    # "interface" occurs in documents 0 and 2
    print("document counts for 'interface':", wordspace["interface"])
   
#############################
# We can use cosine similarity, as before, to compute
# the similarity of two words in terms of the documents
# in which they appear
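# As a reminder, the formula implemented below is
#   cosine(v, w) = (v . w) / (|v| * |w|)
# where . is the dot product and |v| is the Euclidean length of v.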
import math

def veclen(vector):
    return math.sqrt(numpy.sum(numpy.square(vector)))

def cosine(word1, word2, space):
    vec1 = space[ word1 ]
    vec2 = space[word2]

    veclen1 = veclen(vec1)
    veclen2 = veclen(vec2)

    if veclen1 == 0.0 or veclen2 == 0.0:
        # one of the vectors is empty. make the cosine zero.
        return 0.0

    else:
        # we could also simply do:
        # dotproduct = numpy.dot(vec1, vec2)
        dotproduct = numpy.sum(vec1 * vec2)

        return dotproduct / (veclen1 * veclen2) 
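

###
# A quick sanity check of cosine() on a tiny made-up space.
# The words and counts below are invented purely for illustration:
# parallel count vectors should give cosine 1.0, and vectors with
# no shared documents should give 0.0.
def testme_cosine_sanity():
    toyspace = { "cat":    numpy.array([1, 2, 0]),
                 "feline": numpy.array([2, 4, 0]),
                 "graph":  numpy.array([0, 0, 3]) }
    # "cat" and "feline" have parallel count vectors: cosine should be 1.0
    print("cosine of 'cat' and 'feline':", cosine("cat", "feline", toyspace))
    # "cat" and "graph" share no documents: cosine should be 0.0
    print("cosine of 'cat' and 'graph':", cosine("cat", "graph", toyspace))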


def testme_sim():
    documents = ["Human machine interface for lab abc computer applications",
                "A survey of user opinion of computer system response time",
                "The EPS user interface management system",
                "System and human system engineering testing of EPS",
                "Relation of user perceived response time to error measurement",
                "The generation of random binary unordered trees",
                "The intersection graph of paths in trees",
                "Graph minors IV Widths of trees and well quasi ordering",
                "Graph minors A survey"]

   

    # getting a list of all the words we have.
    # We start out with a set, so no word gets included twice
    allwords_set = set()
    for doc in documents:
        for word in doc.split():
            allwords_set.add(word)
       
    # and now we make our words list
    allwords = list(allwords_set)
    alldocs = range(len(documents))

    # making the space
    # wordspace: a mapping from relevant_words to an array of integers (raw counts)
    wordspace = { }
    # fill the space with all zeros.
    for word in allwords:
        wordspace[ word ] = numpy.zeros(len(alldocs), dtype = int)

    # now we count how often each target word
    # appears in each document
    for docindex, doc in enumerate(documents):
        for word in doc.split():
            wordspace[word][docindex] += 1
   
    print("similarity of 'computer' and 'interface':", cosine("computer", "interface", wordspace))

   

#####
# a document/term matrix has target documents, and the vector for a document
# counts how often each word has appeared in it.

# space: a mapping from relevant_words to an array of integers (raw counts)
def testme_docterm_space():
    documents = ["Human machine interface for lab abc computer applications",
                "A survey of user opinion of computer system response time",
                "The EPS user interface management system",
                "System and human system engineering testing of EPS",
                "Relation of user perceived response time to error measurement",
                "The generation of random binary unordered trees",
                "The intersection graph of paths in trees",
                "Graph minors IV Widths of trees and well quasi ordering",
                "Graph minors A survey"]

   
    # getting a list of all the words we have.
    # We start out with a set, so no word gets included twice
    allwords_set = set()
    for doc in documents:
        for word in doc.split():
            allwords_set.add(word)
       
    # and now we make our words list
    allwords = list(allwords_set)
    alldocs = range(len(documents))

    docspace = { }
    # fill the space with all zeros.
    for docindex in range(len(documents)):
        docspace[ docindex ] = numpy.zeros(len(allwords), dtype = int)


    # now we count how often each target word
    # appears in each document
    for docindex, doc in enumerate(documents):
        for word in doc.split():
            wordindex = allwords.index(word)
            docspace[docindex][wordindex] += 1


    # and we can apply cosine to this space just like the other
    print("similarity of documents 0 and 1:", cosine(0, 1, docspace))
    print("similarity of documents 2 and 3:", cosine(2, 3, docspace))

################
# Actual matrices:
# So far, we have represented a vector space as a dictionary
# that maps from targets to vectors (except when we were doing SVD).
# If we represent the word space as an actual term/document matrix,
# with a row for each target word and a column for each document,
# then we can get the document/term matrix
# by simply swapping rows and columns in the term/document matrix.
def testme_matrices():
    documents = ["Human machine interface for lab abc computer applications",
                "A survey of user opinion of computer system response time",
                "The EPS user interface management system",
                "System and human system engineering testing of EPS",
                "Relation of user perceived response time to error measurement",
                "The generation of random binary unordered trees",
                "The intersection graph of paths in trees",
                "Graph minors IV Widths of trees and well quasi ordering",
                "Graph minors A survey"]

    allwords_set = set()
    for doc in documents:
        for word in doc.split():
            allwords_set.add(word)
       
    # and now we make our words list
    allwords = list(allwords_set)
    alldocs = range(len(documents))

    termdocmatrix = numpy.zeros((len(allwords), len(alldocs)), dtype = int)

    for docindex, doc in enumerate(documents):
        for word in doc.split():
            wordindex = allwords.index(word)
            termdocmatrix[ wordindex, docindex ] += 1

    # The term/document matrix has a row for each word
    # and a column for each document
    print("this is the term/document matrix:\n", termdocmatrix, "\n")
   
    # and now we swap rows and columns:
    # The document/term matrix has a row for each document
    # and a column for each term
    doctermmatrix = termdocmatrix.transpose()
    print("this is the document/term matrix:\n", doctermmatrix, "\n")

###################################################
# doing the same with gensim
# This is straight from the gensim tutorial

# We can safely ignore the following warning, as we are not currently using word2vec
# for more info on the word2vec issue, see
# https://radimrehurek.com/gensim/models/word2vec.html
import gensim

def testme_gensim_initialdemo():
    documents = ["Human machine interface for lab abc computer applications",
                "A survey of user opinion of computer system response time",
                "The EPS user interface management system",
                "System and human system engineering testing of EPS",
                "Relation of user perceived response time to error measurement",
                "The generation of random binary unordered trees",
                "The intersection graph of paths in trees",
                "Graph minors IV Widths of trees and well quasi ordering",
                "Graph minors A survey"]
   
    # lowercasing, stopword removal
    stoplist = set('for a of the and to in'.split())
    # texts: a list of word lists, one for each document
    # all_tokens: a list of words, the concatenation of all texts
    #   (built here but not used further in this demo)
    texts = [ ]
    all_tokens = [ ]
    for document in documents:
        docwords = [word for word in document.lower().split() if word not in stoplist]
        texts.append(docwords)
        all_tokens = all_tokens + docwords
       
    # make a dictionary of all the words that appear in the texts, and save it.
    # this assigns each word a unique ID
    gdict = gensim.corpora.Dictionary(texts)
    # this prints all the terms along with the unique numeric ID that has been assigned to each
    print("Here is the dictionary that gensim made. It maps words to indices.")
    print(gdict.token2id, "\n")
   
    # Here is how to save the dictionary to a file
    gdict.save('/Users/katrinerk/Desktop/gensimdemo.dict')
    # here is how to load a dictionary from file
    # (not that that makes sense here, as we already have the
    # dictionary in memory; this is just to show you how it works.)
    gdict = gensim.corpora.Dictionary.load('/Users/katrinerk/Desktop/gensimdemo.dict')
   
    # this dictionary object can then be used to
    # represent documents in the space of these words
    # (documents represented through terms)
    new_doc = "Human computer interaction"
    new_text = new_doc.lower().split()
    new_vec = gdict.doc2bow(new_text)
    print("Representation, based on the dictionary gdict, of")
    print("the new document 'human computer interaction':")
    print(new_vec, "\n")
    # I got:
    # [(1, 1), (5, 1)]
    # This says: word 1 (computer) occurs once, word 5 (human) occurs once.
    # no other word from the dictionary appeared.
   
    # convert all our documents to this format
    corpus = [ ]
    for text in texts:
        corpus.append(gdict.doc2bow(text))
   
    ##
    # Here is how you can write a gensim-created space to a file.
    # store the data in Matrix Market format:
    # storing only the nonzero entries in a line of format
    # rowno colno entry
    gensim.corpora.MmCorpus.serialize('/Users/katrinerk/Desktop/gensimdemo.mm', corpus)
    # and load it back in
    # (again, we would not have had to do this; this is just to demonstrate
    # how to store intermediate results in files)
    corpus = gensim.corpora.MmCorpus('/Users/katrinerk/Desktop/gensimdemo.mm')
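    # For reference, the resulting .mm file is plain text. In Matrix Market
    # "coordinate" format it should look roughly like this (the exact counts
    # depend on the corpus; indices are 1-based, one nonzero entry per line):
    #   %%MatrixMarket matrix coordinate real general
    #   <num_documents> <num_terms> <num_nonzero_entries>
    #   1 1 1.0
    #   1 2 1.0
    #   ...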
   

    ###
    # tf/idf conversion: This is similar in its effect
    # to doing a PPMI transformation.
    # The gensim TfidfModel() does not convert the corpus itself;
    # it just generates an object that can do the conversion
    tfidf = gensim.models.TfidfModel(corpus)
    # to see the effect on one document, compare the raw counts in corpus[0]
    # with their tf/idf-weighted counterparts:
    print("document 0, raw counts:", corpus[0])
    print("document 0, tf/idf weights:", tfidf[corpus[0]], "\n")
    # Here is how you do the conversion for the whole corpus.
    corpus_tfidf = tfidf[corpus]
   
    # we can iterate over the documents in the transformed corpus, and over the individual term weights in each document
    print("TF/IDF transformed space:")
    for doc_id, doc in enumerate(corpus_tfidf):
        for term_id, value in doc:
            print(doc_id, term_id, value)
    print("\n")
   
    # creating an object that does SVD
    # (here called LSI, for Latent Semantic Indexing).
    # We only make 2 latent dimensions.
    # This is a wrapper around the tfidf wrapper that creates the numbers on the fly.
    lsi = gensim.models.LsiModel(corpus_tfidf, id2word=gdict, num_topics=2)
    # and then apply it.
    corpus_lsi = lsi[corpus_tfidf]

    print("SVD transformed space:")
    for doc in corpus_lsi:
        print( doc )
    print("\n")

    ###
    # now we do the same thing in numpy, without gensim.
    # storing the document/term matrix in a numpy matrix:
    dtm = numpy.zeros((len(corpus), len(gdict)))
    for docid, doc in enumerate(corpus_tfidf):
        for termid, val in doc:
            dtm[docid, termid] = val
   
    # SVD transformation
    def svd_transform(space, keepnumdimensions):
        umatrix, sigmavector, vmatrix = numpy.linalg.svd(space)
   
        # remove the last few dimensions of u and sigma
        utrunc = umatrix[:, :keepnumdimensions]
        sigmatrunc = sigmavector[ :keepnumdimensions]
   
        # new space: the matrix product of U and diag(Sigma)
        return numpy.dot(utrunc, numpy.diag(sigmatrunc))
   
    # this does not give us exactly the same thing as the LSI transform:
    # the sign of a singular vector is arbitrary, so here the first
    # dimension comes out inverted
    dtm_svd = svd_transform(dtm, 2)
   
    print("And here is an SVD-transformed space done by numpy rather than gensim")
    for row in dtm_svd:
        print(row)
    print("\n")
   
    ####
    # The dimensions of the SVD-transformed space
    # can be viewed as "latent semantic classes".
    # When our targets are documents, this is something like topics,
    # though not very good topics. (The LDA approach below
    # gives you much more interpretable topics.)
    print("Inspecting the SVD dimensions (here called topics):")
    lsi.print_topics()
    for topicindex, topic in enumerate(lsi.print_topics()):
        print( "topic", topicindex, ":")
        print("\t", topic, "\n")


   
############
# LDA topic models
# This is a probabilistic approach
# that generally gives you much more readable topics.
import nltk
import string

##
# preprocessing a collection of texts,
# that is, a list of word lists
def preprocess(texts):
    stopword_filename = "/Users/katrinerk/Teaching/classes/compsemantics/demos/stopwords-augmented.txt"
    f = open(stopword_filename)
    stopwords = set(f.read().split())
    f.close()
    stopwords.add("--")
    stopwords.add("``")
    stopwords.add("''")
    for punct in string.punctuation:
        stopwords.add(punct)

    newtexts = [ ]
    for text in texts:
        newtexts.append( [w.lower() for w in text if w.lower() not in stopwords])
    return newtexts


##
# making and displaying an LDA topic model
# input: a collection of texts,
# that is, a list of word lists
def make_and_show_lda_model(texts, numtopics, textlabels, show_docs = True, show_sims = True):
    # map terms to IDs
    gdict = gensim.corpora.Dictionary(texts)
    # and represent the corpus in sparse matrix format, bag-of-words
    corpus = [gdict.doc2bow(text) for text in texts]
    # now we make an LDA object.
    # in case we have a larger text collection (such as the Brown corpus),
    # make sure to set "passes" to a reasonably high number in order not to have all topics
    # come out equal. 20 seems to work.
    lda_obj = gensim.models.ldamodel.LdaModel(corpus, id2word=gdict, num_topics=numtopics, passes = 20)


    # apply the LDA model to all documents: for each document,
    # we get a list of (topic, weight) pairs.
    # (this is computed outside the if-blocks because both the document
    # display and the similarity computation below need it)
    lda_corpus = lda_obj[corpus]

    # how do our texts look: how important is each topic there?
    if show_docs:
        print("Showing how important each topic is for each document")
        for docindex, doc in enumerate(lda_corpus):
            print( "Document:", docindex, end = " ")
            for word in texts[docindex][:20]: print( word, end = " ")
            print("\n")
            for topic, weight in doc:
                print( "Topic", str(topic) + ":", "weight", round(weight, 2))
            print("\n\n")

    # similarities between texts?
    if show_sims:
        sim_obj = gensim.similarities.MatrixSimilarity(lda_corpus)

        print("Showing, for each document, the 2 most similar other documents")
        for docindex, doc in enumerate(lda_corpus):
            # determine degree of similarity of this document to all other documents
            sims = sim_obj[ doc ]
            # pair similarities with document labels, sort by similarity,
            # highest first
            sims_and_labels = sorted(zip(sims, textlabels), reverse=True)
       
            print( "Similarities for", textlabels[ docindex])
            # print the two most similar other documents; we look at the top 3
            # candidates because the document itself, which is maximally
            # similar to itself, gets skipped, as do zero similarities
            for sim, textlabel in sims_and_labels[:3]:
                if textlabel != textlabels[docindex] and sim > 0.0:
                    print( "\t", textlabel, sim)
            print("\n")

    # a look at the topics
    print("Here is what the topics look like:")
    for index, t in enumerate(lda_obj.print_topics(numtopics, 20)):
        print( "topic", index, t)
    print("\n")



def testme_lda_mini():
    documents = ["Human machine interface for lab abc computer applications",
                "A survey of user opinion of computer system response time",
                "The EPS user interface management system",
                "System and human system engineering testing of EPS",
                "Relation of user perceived response time to error measurement",
                "The generation of random binary unordered trees",
                "The intersection graph of paths in trees",
                "Graph minors IV Widths of trees and well quasi ordering",
                "Graph minors A survey"]
   
    split_docs = [d.split() for d in documents]
    texts = preprocess(split_docs)
    # making: 2 topics, document labels are indices
    make_and_show_lda_model(texts, 2, list(range(len(documents))))
                      
def testme_lda_inaugural():
    ###########
    # making an LDA topic model from the inaugural addresses
    texts = preprocess([nltk.corpus.inaugural.words(fileid) for fileid in nltk.corpus.inaugural.fileids()])
    # skip address 54 because of a non-ASCII character in it
    fileids = nltk.corpus.inaugural.fileids()
    # try doing the following step twice: You will get different models each time.
    make_and_show_lda_model(texts[:54] + texts[55:], 10, fileids[:54] + fileids[55:])

def testme_lda_gutenberg():
    # make an LDA topic model for 18 Project Gutenberg books:
    # 'austen-emma.txt',
    # 'austen-persuasion.txt',
    # 'austen-sense.txt',
    # 'bible-kjv.txt',
    # 'blake-poems.txt',
    # 'bryant-stories.txt',
    # 'burgess-busterbrown.txt',
    # 'carroll-alice.txt',
    # 'chesterton-ball.txt',
    # 'chesterton-brown.txt',
    # 'chesterton-thursday.txt',
    # 'edgeworth-parents.txt',
    # 'melville-moby_dick.txt',
    # 'milton-paradise.txt',
    # 'shakespeare-caesar.txt',
    # 'shakespeare-hamlet.txt',
    # 'shakespeare-macbeth.txt',
    # 'whitman-leaves.txt'
    texts = preprocess([nltk.corpus.gutenberg.words(fileid) for fileid in nltk.corpus.gutenberg.fileids()])
    make_and_show_lda_model(texts, 6, nltk.corpus.gutenberg.fileids())
 

def testme_lda_pirates():
    # turn each paragraph from "Pirates of the Caribbean" into its
    # own document
    pirates = nltk.corpus.webtext.paras(fileids = "pirates.txt")[0]
    texts = preprocess(pirates)
    make_and_show_lda_model(texts, 10, range(len(pirates)))
      


def testme_lda_brown():
    #############
    # making an LDA topic model from the whole Brown corpus
    # Don't do this unless you have a machine in good working order,
    # with sufficient memory,
    # and the patience to wait for the output.
   
    # categories?
    print( "Brown categories:" ,nltk.corpus.brown.categories())
    # let's build a corpus out of them
    texts = preprocess([list(nltk.corpus.brown.words(fileid)) for fileid in nltk.corpus.brown.fileids()])
    make_and_show_lda_model(texts, 50, nltk.corpus.brown.fileids(), show_docs = False, show_sims = False)
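
############
# The testme_* functions above are meant to be called one at a time, for
# instance from an interactive session. As a convenience, the entry point
# below (added purely for illustration) runs the demos that need no local
# files; the gensim and LDA demos are left out because they rely on
# hard-coded paths (the .dict/.mm output files and the stopword list).
if __name__ == "__main__":
    testme_termdoc_space()
    testme_cosine_sanity()
    testme_sim()
    testme_docterm_space()
    testme_matrices()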
   
   


