Source code for qualia2.text.text_util

# -*- coding: utf-8 -*- 
from ..core import *
from ..autograd import Tensor
from ..functions import cosine_similarity

def most_similar(query, word2idx, wordvecs, n=5):
    ''' most_similar

    Look up the most similar words in the embedding.

    Args:
        query (str): query text
        word2idx (dict): word to index map
        wordvecs (Embedding): vector representation of words
        n (int): top n similar words to show
    '''
    if query not in word2idx:
        raise Exception('[*] \'{}\' is unknown.'.format(query))
    print('[*] query: ' + query)
    idx2word = {v: k for k, v in word2idx.items()}
    query_vec = wordvecs(word2idx[query])
    similarity = np.array([float(cosine_similarity(wordvecs(i), query_vec).data) for i in range(len(idx2word))])
    idx = similarity.argsort()[::-1]
    for i in range(n+1):
        if idx2word[int(idx[i])] == query:
            continue
        print('{}: {}'.format(idx2word[int(idx[i])], similarity[int(idx[i])]))
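# A minimal usage sketch (not part of this module): the vocabulary, the
# word2idx map, and the trained Embedding layer below are hypothetical
# placeholders, assuming the embedding was fit elsewhere in qualia2.
#
#     word2idx = {'king': 0, 'queen': 1, 'man': 2, 'woman': 3, 'apple': 4}
#     # wordvecs = Embedding(len(word2idx), embedding_dim)  # trained beforehand
#
#     # print the top 5 words closest to 'king' by cosine similarity
#     most_similar('king', word2idx, wordvecs, n=5)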
def analogy(a, b, c, word2idx, wordvec, n=5):
    ''' analogy

    Predicts a word relationship of the form a:b = c:?

    Args:
        a (str): input string
        b (str): input string
        c (str): input string
        word2idx (dict): word to index map
        wordvec (Embedding): vector representation of words
        n (int): top n similar words to show
    '''
    assert a in word2idx
    assert b in word2idx
    assert c in word2idx
    print('[*] {}:{} = {}:?'.format(a, b, c))
    idx2word = {v: k for k, v in word2idx.items()}
    a_vec, b_vec, c_vec = wordvec(word2idx[a]), wordvec(word2idx[b]), wordvec(word2idx[c])
    query_vec = b_vec - a_vec + c_vec
    similarity = np.array([float(cosine_similarity(wordvec(i), query_vec).data) for i in range(len(idx2word))])
    idx = similarity.argsort()[::-1]
    for i in range(n):
        print('{}: {}'.format(idx2word[int(idx[i])], similarity[int(idx[i])]))
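# A minimal usage sketch (not part of this module): reusing the same
# hypothetical word2idx map and trained embedding as in the most_similar
# example above, solve the analogy 'king' : 'queen' = 'man' : ?
#
#     analogy('king', 'queen', 'man', word2idx, wordvecs, n=5)
#
# The query vector is b_vec - a_vec + c_vec, so the top-ranked words are
# those closest (by cosine similarity) to the offset-completed vector.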