# Evaluate two word-embedding spaces against the WordSim353 human similarity
# ratings: first with gensim's built-in evaluation, then again by hand.

import gensim.downloader as gensim_api
import scipy.stats  # import the submodule explicitly; plain `import scipy` may not load it
from gensim.models import Word2Vec
from nltk.corpus import brown

# let's get spaces.
# first, a handmade one, tiny.
# NOTE: `iter` and `size` are the gensim 3.x keyword names; gensim 4 renamed
# them to `epochs` and `vector_size`. Adjust if you run a newer gensim.
space_romance = Word2Vec(
    brown.sents(categories="romance"),
    iter=10,
    min_count=10,
    size=300,
    sg=1,
).wv

# and a precomputed one
# (the only word2vec in English is too large to download right now,
# so I'm using GloVe)
# Bound to `space_large` because every later use refers to it by that name.
space_large = gensim_api.load("glove-wiki-gigaword-300")

# please download wordsim353 from Canvas
# (The official link to Wordsim353 seems to have gone away)
# then we can test correlation between
# human ratings and similarity predictions of each of our spaces
# very comfortably with gensim:
pearson, spearman, oov_ratio = space_romance.evaluate_word_pairs("wordsim353.txt")
pearson_r, pearson_p = pearson
spearman_rho, spearman_p = spearman
print("Evaluating our tiny romance space against WordSim353")
print("Pearson r:", pearson_r, "with p-value", pearson_p)
print("Spearman's rho:", spearman_rho, "with p-value", spearman_p)
print("Ratio of out-of-vocabulary words is gigantic:", oov_ratio)

pearson, spearman, oov_ratio = space_large.evaluate_word_pairs("wordsim353.txt")
pearson_r, pearson_p = pearson
spearman_rho, spearman_p = spearman
print("Evaluating the large space against WordSim353")
print("Pearson r:", pearson_r, "with p-value", pearson_p)
print("Spearman's rho:", spearman_rho, "with p-value", spearman_p)
print("Ratio of out-of-vocabulary words is now much smaller:", oov_ratio)

##########
# or doing the same by hand

# list of (word1, word2, human_rating) triples
wordsim353 = []
with open("wordsim353.txt", encoding="utf-8") as f:
    # discard legend
    f.readline()
    # and read all other lines
    for line in f:
        if not line.strip():
            # tolerate blank lines (e.g. a trailing empty line at end of file)
            continue
        word1, word2, rating_string = line.split()
        wordsim353.append((word1, word2, float(rating_string)))


# pulling similarity ratings from the model:
# if a word is missing, we want to just return a similarity of zero
def sim_or_zero(word1, word2, model):
    """Return the model's similarity for a word pair, or 0.0 if either is missing.

    Args:
        word1: first word of the pair.
        word2: second word of the pair.
        model: object supporting ``word in model`` and
            ``model.similarity(word1, word2)``.

    Returns:
        ``model.similarity(word1, word2)`` when both words are in the model,
        otherwise ``0.0``.
    """
    if word1 in model and word2 in model:
        return model.similarity(word1, word2)
    return 0.0


# making predictions for the wordsim353 data
predictions = [sim_or_zero(w1, w2, space_large) for w1, w2, _ in wordsim353]

# here are the gold values
gold = [rating for _, _, rating in wordsim353]

# computing correlation:
print("Performance of the large space on predicting wordsim353 similarities:")
print("Pearson:", scipy.stats.pearsonr(gold, predictions))
print("Spearman:", scipy.stats.spearmanr(gold, predictions))
# Courses > Python worksheets >