"""Compare WordSim353 human similarity ratings with a tiny distributional model.

This demo shows how to compare the human ratings on word similarity from the
WordSim353 dataset to predictions from a (tiny) distributional model. It uses
the Brown corpus, available with NLTK, and the Gensim package to train a
distributional model. It also showcases the pandas package, which is
convenient for handling data that comes in the shape of a table.
"""

# handling data that comes in tables
import pandas

# read wordsim353. what separates two columns is whitespace
# (raw string, so the regex \s is not treated as a Python string escape)
filename = "/Users/kee252/Data/VS_Datasets/WordSim353/combined.tab"
wordsim353 = pandas.read_csv(filename, sep=r"\s+")

# accessing this table
print(wordsim353)
# 1st column: words
print(wordsim353["Word1"])
# 2nd column: words
print(wordsim353["Word2"])
# 3rd column: numbers
print(wordsim353["Human_mean"])

# now we build a toy distributional space
from gensim.models import Word2Vec
from nltk.corpus import brown

# NOTE(review): `iter` and `size` are the gensim 3.x keyword names; gensim 4
# renamed them to `epochs` and `vector_size` -- confirm against the installed
# gensim version and adjust if needed.
brownmodel1 = Word2Vec(brown.sents(), iter=10, min_count=10, size=300, workers=4)
brownmodel2 = Word2Vec(brown.sents(), iter=100, min_count=10, size=100, workers=4)


# pulling similarity ratings from the model:
# if a word is missing, we want to just return a similarity of zero
def sim_or_zero(word1, word2, model):
    """Return the model's similarity for a word pair, or 0.0 if a word is missing.

    Args:
        word1: first word of the pair.
        word2: second word of the pair.
        model: a trained model exposing its word vectors as ``model.wv``.

    Returns:
        ``model.wv.similarity(word1, word2)`` when both words are in the
        model's vocabulary, otherwise ``0.0``.
    """
    # test membership on the same object that computes the similarity;
    # `word in model` is deprecated in gensim 3.x and gone in gensim 4
    vectors = model.wv
    if word1 in vectors and word2 in vectors:
        return vectors.similarity(word1, word2)
    return 0.0


# making predictions for the wordsim353 data,
# storing them in the columns "modelpredict1" and "modelpredict2"
# as one-liners:
# wordsim353["modelpredict1"] = [sim_or_zero(row["Word1"], row["Word2"], brownmodel1) for index, row in wordsim353.iterrows()]
# wordsim353["modelpredict2"] = [sim_or_zero(row["Word1"], row["Word2"], brownmodel2) for index, row in wordsim353.iterrows()]
# or less compactly:
modelpredict1 = []
modelpredict2 = []
for _, row in wordsim353.iterrows():
    modelpredict1.append(sim_or_zero(row["Word1"], row["Word2"], brownmodel1))
    modelpredict2.append(sim_or_zero(row["Word1"], row["Word2"], brownmodel2))

wordsim353["modelpredict1"] = modelpredict1
wordsim353["modelpredict2"] = modelpredict2
# computing correlation:
# pandas can do that, but doesn't return a p-value, so let's use scipy.
# The stats submodule is imported explicitly: a bare `import scipy` is not
# guaranteed to make `scipy.stats` available on every SciPy version.
import scipy.stats

# we print pairs of correlation and pvalue
# brownmodel1 is beyond miserable
print(scipy.stats.spearmanr(wordsim353["Human_mean"], wordsim353["modelpredict1"]))

# brownmodel2 has looked at the data a greater number of times,
# and compresses its information into fewer dimensions.
# It does better, though also not great.
# the words with missing entries are really harming the model
# (printed explicitly so the result also shows up when run as a script)
print(scipy.stats.spearmanr(wordsim353["Human_mean"], wordsim353["modelpredict2"]))
Courses > Python worksheets >