# Python demo: predicting word similarity
#
# This demo shows how to compare the human word-similarity ratings from the WordSim353 dataset with the predictions of a (tiny) distributional model. It uses the Brown corpus, available with NLTK, and the Gensim package to train the distributional model. It also showcases the pandas package, which is convenient for handling data that comes in the shape of a table.

# handling data that comes in tables

import pandas

# Read WordSim353. Columns are separated by runs of whitespace.

# NOTE(review): this path ends in "/" and so names a directory; read_csv
# needs the path of the data file itself -- confirm the file name.
filename = "/Users/kee252/Data/VS_Datasets/WordSim353/"

# Raw string: "\s" inside a plain string literal is an invalid escape
# sequence (Python warns about it); the regex itself is unchanged.
wordsim353 = pandas.read_csv(filename, sep=r"\s+")

# accessing this table


# 1st column: words


# 2nd column:words


# 3rd column: numbers


# now we build a toy distributional space

from gensim.models import Word2Vec

from nltk.corpus import brown

# Two small word2vec models trained on the sentences of the Brown corpus.
# Both ignore words that occur fewer than 10 times and train with 4 workers.
# NOTE: `iter` and `size` are the gensim 3.x keyword names; gensim 4 renamed
# them to `epochs` and `vector_size`.

# model 1: 300-dimensional vectors, 10 passes over the corpus
brownmodel1 = Word2Vec(
    brown.sents(),
    size=300,
    iter=10,
    min_count=10,
    workers=4,
)

# model 2: 100-dimensional vectors, 100 passes over the corpus
brownmodel2 = Word2Vec(
    brown.sents(),
    size=100,
    iter=100,
    min_count=10,
    workers=4,
)

# pulling similarity ratings from the model:

# if a word is missing, we want to just return a similarity of zero

def sim_or_zero(word1, word2, model):
    """Return the model's similarity for a word pair, or 0.0 if a word is unknown.

    Args:
        word1: First word of the pair.
        word2: Second word of the pair.
        model: Trained model that exposes its word vectors as ``model.wv``
            (supporting ``in`` and ``similarity``).

    Returns:
        ``model.wv.similarity(word1, word2)`` when both words are known to
        the model, otherwise ``0.0``.
    """
    vectors = model.wv

    # Test membership on the same object that computes the similarity.
    if word1 in vectors and word2 in vectors:
        return vectors.similarity(word1, word2)

    # At least one word is missing from the model: fall back to zero.
    # This return must sit outside the `if`, otherwise it can never run
    # and the function would return None for unknown words.
    return 0.0

# making predictions for the wordsim353 data,

# storing them in the column "modelpredict"

# as one-liners:

# wordsim353["modelpredict1"] = [sim_or_zero(row["Word1"], row["Word2"], brownmodel1) for index, row in wordsim353.iterrows()]

# wordsim353["modelpredict2"] = [sim_or_zero(row["Word1"], row["Word2"], brownmodel2) for index, row in wordsim353.iterrows()]

# or less compactly:

# One prediction per row of the table, collected separately for each model.
modelpredict1 = []
modelpredict2 = []

# Walk the two word columns in lockstep, one word pair per table row.
for first_word, second_word in zip(wordsim353["Word1"], wordsim353["Word2"]):
    modelpredict1.append(sim_or_zero(first_word, second_word, brownmodel1))
    modelpredict2.append(sim_or_zero(first_word, second_word, brownmodel2))

# Store the predictions as new columns of the table.
wordsim353["modelpredict1"] = modelpredict1
wordsim353["modelpredict2"] = modelpredict2

# computing correlation:

# pandas can do that, but doesn't return a p-value, so let's use scipy

# `import scipy` alone is not guaranteed to load the stats subpackage,
# so import it explicitly; the name `scipy` is still bound as before.
import scipy.stats

# we print pairs of correlation and p-value

# brownmodel1 is beyond miserable
print(scipy.stats.spearmanr(wordsim353["Human_mean"], wordsim353["modelpredict1"]))

# brownmodel2 has looked at the data a greater number of times,
# and compresses its information into fewer dimensions.
# It does better, though also not great.
# the words with missing entries are really harming the model
# (printed explicitly: a bare expression produces no output when run as a script)
print(scipy.stats.spearmanr(wordsim353["Human_mean"], wordsim353["modelpredict2"]))