Python demo: predicting word similarity
This demo shows how to compare the human similarity ratings from the WordSim353 dataset to the predictions of a (tiny) distributional model. It uses the Brown corpus, which ships with NLTK, and the Gensim package to train the model. It also showcases the pandas package, which is convenient for handling tabular data.
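To run the demo yourself, you need pandas, Gensim, NLTK, and SciPy installed, plus the Brown corpus. A minimal setup sketch (assuming pip and a recent Python) looks like this:
# one-time setup, e.g. via: pip install pandas gensim nltk scipy
import nltk
nltk.download("brown")  # fetches the Brown corpus used below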
# handling data that comes in tables
import pandas
# read WordSim353; the columns are separated by whitespace
filename = "/Users/kee252/Data/VS_Datasets/WordSim353/combined.tab"
wordsim353 = pandas.read_csv(filename, sep=r"\s+")
# accessing this table
print(wordsim353)
# 1st column: words
print(wordsim353["Word1"])
# 2nd column: words
print(wordsim353["Word2"])
# 3rd column: numbers
print(wordsim353["Human_mean"])
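# a couple of optional pandas sanity checks (not essential to the demo):
# head() shows the first few rows, shape is (number of rows, number of columns)
print(wordsim353.head())
print(wordsim353.shape)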
# now we build a toy distributional space
from gensim.models import Word2Vec
from nltk.corpus import brown
# (parameter names follow Gensim 4; older Gensim versions used iter= and size=)
brownmodel1 = Word2Vec(brown.sents(), epochs=10, min_count=10, vector_size=300, workers=4)
brownmodel2 = Word2Vec(brown.sents(), epochs=100, min_count=10, vector_size=100, workers=4)
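# optional sanity check on a trained model: vocabulary size and the nearest
# neighbors of a sample word ("money" is just an illustrative frequent word)
print(len(brownmodel1.wv.key_to_index))
print(brownmodel1.wv.most_similar("money", topn=5))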
# pulling similarity ratings from the model:
# if a word is missing, we want to just return a similarity of zero
def sim_or_zero(word1, word2, model):
    # membership checks go through model.wv (the keyed vectors)
    if word1 in model.wv and word2 in model.wv:
        return model.wv.similarity(word1, word2)
    else:
        return 0.0
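# quick check of sim_or_zero: one pair the model should know,
# and one pair with a made-up word, which falls back to 0.0
print(sim_or_zero("money", "bank", brownmodel1))
print(sim_or_zero("money", "xyzzy", brownmodel1))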
# making predictions for the wordsim353 data,
# storing them in the columns "modelpredict1" and "modelpredict2"
# as one-liners:
# wordsim353["modelpredict1"] = [sim_or_zero(row["Word1"], row["Word2"], brownmodel1) for index, row in wordsim353.iterrows()]
# wordsim353["modelpredict2"] = [sim_or_zero(row["Word1"], row["Word2"], brownmodel2) for index, row in wordsim353.iterrows()]
# or less compactly:
modelpredict1 = []
modelpredict2 = []
for index, row in wordsim353.iterrows():
    modelpredict1.append(sim_or_zero(row["Word1"], row["Word2"], brownmodel1))
    modelpredict2.append(sim_or_zero(row["Word1"], row["Word2"], brownmodel2))
wordsim353["modelpredict1"] = modelpredict1
wordsim353["modelpredict2"] = modelpredict2
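# extra diagnostic (not in the original demo): how many pairs contain
# a word that is missing from the model, and hence were scored 0.0?
n_missing = sum(1 for _, row in wordsim353.iterrows()
                if row["Word1"] not in brownmodel1.wv or row["Word2"] not in brownmodel1.wv)
print(n_missing, "of", len(wordsim353), "pairs contain an out-of-vocabulary word")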
# computing correlation:
# pandas can compute the correlation too, but doesn't return a p-value, so we use scipy
import scipy.stats
# spearmanr returns a (correlation, p-value) pair, which we print
# brownmodel1 is beyond miserable
print(scipy.stats.spearmanr(wordsim353["Human_mean"], wordsim353["modelpredict1"]))
# brownmodel2 has seen the data more times
# and compresses its information into fewer dimensions.
# It does better, though also not great.
# the words with missing entries are really harming the model
print(scipy.stats.spearmanr(wordsim353["Human_mean"], wordsim353["modelpredict2"]))
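# as a follow-up sketch (not part of the original demo), restrict the
# evaluation to pairs where both words are in the vocabulary, to see
# how much the zero fallback drags the correlation down
invocab = wordsim353[wordsim353.apply(
    lambda row: row["Word1"] in brownmodel2.wv and row["Word2"] in brownmodel2.wv,
    axis=1)]
print(scipy.stats.spearmanr(invocab["Human_mean"], invocab["modelpredict2"]))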