Courses > Python worksheets >

Demo: using a pre-trained space in gensim

# what corpora and pre-trained models does gensim have?
import gensim.downloader as gensim_api

# Fetch the downloader catalogue once and reuse it for every lookup below,
# instead of calling gensim_api.info() again each time.
info_obj = gensim_api.info()

# the whole info: not easy to read
print(info_obj)

# This is, in fact, a dictionary with two entries:
# corpora, and models
print("Two types of information in the gensim downloader:", end=" ")
for infotype in info_obj:
    print(infotype, end=" ")
# Terminate the line. Note the parentheses: a bare `print` is just a
# reference to the function in Python 3 and outputs nothing.
print()

# Let's look in 'models'. This is, again, a dictionary, where the
# keys are model labels
print("Gensim has the following models")
for modelname in info_obj["models"]:
    print(modelname)
print()

# The entry for each model is, again, a dictionary.
# Let's look at one
print("Info for glove-wiki-gigaword-50:")
for entry, value in info_obj["models"]["glove-wiki-gigaword-50"].items():
    print(entry, ":", value)

# The gensim datasets differ in their sizes.
# This is relevant when you want to choose one for download.
# Here are the sizes:
print("Sizes of all models available with gensim:")
info_obj = gensim_api.info()
for modelname, modelinfo in info_obj["models"].items():
    # Print the label first and stay on the same line for the size.
    print(modelname, "\tsize: ", end="")
    if "file_size" in modelinfo:
        # file_size divided by one million, shown with an "M" suffix
        print(int(modelinfo["file_size"]) / 1000000, "M")
    else:
        # No size recorded for this model: just end the line.
        # (print("\n") would emit two newlines and leave a blank line.)
        print()
###
# We now load one space
# Feel free to choose a different one
# if this one is too slow to load on your machine
space = gensim_api.load("glove-wiki-gigaword-300")

###########
# exploring this space
# computing cosine similarity
# (Results are printed explicitly: a bare expression only shows its value
# in an interactive session, not when this file is run as a script.)
print(space.similarity("amiable", "grouchy"))
print(space.similarity("amiable", "affable"))
print(space.similarity("amiable", "obliging"))

# A good way to explore the meaning of a word:
# via its nearest neighbors in space
print(space.most_similar("amiable"))
print(space.most_similar("grouchy", topn=20))

# you can get direct access to the vector associated with each word
# using straight brackets, as if the space was a dictionary:
print(space["amiable"])

# then we can re-compute cosine by hand:
# (gensim's is probably faster though)
import math
import numpy as np
def veclen(vec):
    """Return the Euclidean length of `vec` as a Python float.

    The length is the square root of the sum of the squared components.
    """
    sum_of_squares = np.square(vec).sum()
    return math.sqrt(sum_of_squares)

def cosine(vec1, vec2):
    """Return the cosine similarity of `vec1` and `vec2`.

    This is the dot product of the two vectors divided by the product
    of their lengths (as computed by `veclen`).
    """
    dot_product = np.sum(vec1 * vec2)
    length_product = veclen(vec1) * veclen(vec2)
    return dot_product / length_product

# Compare gensim's built-in similarity with our hand-written cosine
# for the same word pair.
gensim_cosine = space.similarity("amiable", "grumpy")
print("gensim's cosine of 'amiable' and 'grumpy':", gensim_cosine)
our_cosine = cosine(space["amiable"], space["grumpy"])
print("and ours:", our_cosine)



Comments