# What corpora and pre-trained models does gensim have?

# math is needed by the hand-written cosine computation further down.
import math

import gensim.downloader as gensim_api

# Read the catalogue once and reuse it everywhere below, instead of calling
# gensim_api.info() again for every lookup.
info_obj = gensim_api.info()

# The whole info: not easy to read.
print(info_obj)

# This is, in fact, a dictionary with two entries: corpora and models.
print("Two types of information in the gensim downloader:", end=" ")
for infotype in info_obj:
    print(infotype, end=" ")
# End the line started above (a bare `print` without parentheses does nothing).
print()

# Let's look in 'models'. This is, again, a dictionary, where the
# keys are model labels.
models = info_obj["models"]
print("Gensim has the following models")
for modelname in models:
    print(modelname)
print()

# The entry for each model is, again, a dictionary.
# Let's look at one.
print("Info for glove-wiki-gigaword-50:")
for entry, value in models["glove-wiki-gigaword-50"].items():
    print(entry, ":", value)

# The gensim datasets differ in their sizes.
# This is relevant when you want to choose one for download.
# Here are the sizes:
print("Sizes of all models available with gensim:")
for modelname, modelinfo in models.items():
    print(modelname, "\tsize: ", end="")
    if "file_size" in modelinfo:
        # file_size is divided by one million and labelled "M".
        print(int(modelinfo["file_size"]) / 1000000, "M")
    else:
        # No size listed: just end the current line.
        print()

###
# We now load one space.
# Feel free to choose a different one
# if this one is too slow to load on your machine.
space = gensim_api.load("glove-wiki-gigaword-300")

###########
# Exploring this space.
# The results are printed explicitly so they are also visible when this file
# is run as a script rather than typed into an interactive session.

# Computing cosine similarity.
print(space.similarity("amiable", "grouchy"))
print(space.similarity("amiable", "affable"))
print(space.similarity("amiable", "obliging"))

# A good way to explore the meaning of a word:
# via its nearest neighbors in space.
print(space.most_similar("amiable"))
print(space.most_similar("grouchy", topn=20))

# You can get direct access to the vector associated with each word
# using straight brackets, as if the space was a dictionary:
print(space["amiable"])

# Then we can re-compute cosine by hand:
# (gensim's is probably faster though)
import numpy as np


def veclen(vec):
    """Return the Euclidean length of ``vec`` as a Python float.

    The length is the square root of the sum of the squared components.
    """
    sum_of_squares = np.sum(np.square(vec))
    return math.sqrt(sum_of_squares)


def cosine(vec1, vec2):
    """Return the cosine similarity of ``vec1`` and ``vec2``.

    This is the sum of the element-wise products of the two vectors,
    divided by the product of their lengths.
    """
    dot_product = np.sum(vec1 * vec2)
    length_product = veclen(vec1) * veclen(vec2)
    return dot_product / length_product


# Compare gensim's built-in cosine with the hand-written one.
gensim_cosine = space.similarity("amiable", "grumpy")
print("gensim's cosine of 'amiable' and 'grumpy':", gensim_cosine)

our_cosine = cosine(space["amiable"], space["grumpy"])
print("and ours:", our_cosine)
# Courses > Python worksheets >