# Demo: using a pre-trained space in gensim

# what corpora and pre-trained models does gensim have?

import gensim.downloader as gensim_api

# The whole info: not easy to read.
# Query the downloader once and reuse the result for every step below
# instead of calling info() again each time.
catalogue = gensim_api.info()
print(catalogue)

# This is, in fact, a dictionary with two entries:
# corpora, and models
print("Two types of information in the gensim downloader:", end=" ")
for infotype in catalogue:
    print(infotype, end=" ")
# Terminate the line that the end=" " prints above left open.
# (A bare `print` without parentheses does nothing in Python 3.)
print()

# Let's look in 'models'. This is, again, a dictionary, where the
# keys are model labels
print("Gensim has the following models")
# Iterating over a dictionary yields its keys, i.e. the model labels
for modelname in gensim_api.info()["models"]:
    print(modelname)
# Blank line to separate this listing from the output that follows
print()

# The entry for each model is, again, a dictionary.
# Let's look at one: print every field of its description,
# one "key : value" pair per line
print("Info for glove-wiki-gigaword-50:")
for entry, value in gensim_api.info()["models"]["glove-wiki-gigaword-50"].items():
    print(entry, ":", value)

# The gensim datasets differ in their sizes.
# This is relevant when you want to choose one for download.
# Here are the sizes:
print("Sizes of all models available with gensim:")
info_obj = gensim_api.info()
for modelname, model_info in info_obj["models"].items():
    # Print the label first and stay on the same line for the size
    print(modelname, "\tsize: ", end="")
    if "file_size" in model_info:
        # file_size is divided by one million to report it in "M"
        print(int(model_info["file_size"]) / 1000000, "M")
    else:
        # No size recorded for this model: just end the current line
        print()

###
# We now load one space.
# Feel free to choose a different one
# if this one is too slow to load on your machine.
# The result is bound to `space`, which all the exploration code
# further down in this file uses.
# NOTE(review): presumably this downloads the model on first use -- confirm
# against the gensim downloader documentation.
space = gensim_api.load("glove-wiki-gigaword-300")

###########
# Exploring this space
#
# Every result is passed to print(): a bare expression is only echoed in an
# interactive session, so without print() nothing would be shown when this
# file is run as a script.

# Computing cosine similarity
print(space.similarity("amiable", "grouchy"))
print(space.similarity("amiable", "affable"))
print(space.similarity("amiable", "obliging"))

# A good way to explore the meaning of a word:
# via its nearest neighbors in space
print(space.most_similar("amiable"))
print(space.most_similar("grouchy", topn=20))

# You can get direct access to the vector associated with each word
# using square brackets, as if the space was a dictionary:
print(space["amiable"])

# then we can re-compute cosine by hand:

# (gensim's is probably faster though)

import math

import numpy as np

def veclen(vec):
    """Return the Euclidean length of *vec*.

    The length is the square root of the sum of the squared components.

    Args:
        vec: A numeric vector (anything ``np.square`` accepts, e.g. a
            NumPy array or a list of numbers).

    Returns:
        The length as a Python float.
    """
    return math.sqrt(np.sum(np.square(vec)))

def cosine(vec1, vec2):
    """Return the cosine of the angle between *vec1* and *vec2*.

    Computed by hand as the sum of the element-wise products of the two
    vectors, divided by the product of their lengths (see ``veclen``).

    Args:
        vec1: First vector; must support element-wise ``*`` with *vec2*
            (e.g. NumPy arrays of the same shape).
        vec2: Second vector.

    Returns:
        The cosine similarity of the two vectors.
    """
    return np.sum(vec1 * vec2) / (veclen(vec1) * veclen(vec2))

# Compare the library's similarity with the hand-written cosine for the
# same word pair; the two printed numbers are meant to be read side by side.
gensim_cosine = space.similarity("amiable", "grumpy")
print("gensim's cosine of 'amiable' and 'grumpy':", gensim_cosine)

our_cosine = cosine(space["amiable"], space["grumpy"])
print("and ours:", our_cosine)