# -*- coding: utf-8 -*-
from ..core import *
from ..autograd import Tensor
from ..functions import cosine_similarity
def most_similar(query, word2idx, wordvecs, n=5):
    '''most_similar

    Print the top-n words most similar to *query* in the embedding space,
    ranked by cosine similarity. The query word itself is excluded from
    the printed results.

    Args:
        query (str): query word
        word2idx (dict): word to index map
        wordvecs (Embedding): vector representation of words
        n (int): top n similar words to show

    Raises:
        Exception: if *query* is not in the vocabulary
    '''
    if query not in word2idx:
        raise Exception('[*] \'{}\' is unknown.'.format(query))
    print('[*] query: ' + query)
    idx2word = {v: k for k, v in word2idx.items()}
    query_vec = wordvecs(word2idx[query])
    # Cosine similarity of every vocabulary vector against the query vector.
    similarity = np.array([float(cosine_similarity(wordvecs(i), query_vec).data)
                           for i in range(len(idx2word))])
    # Vocabulary indices sorted by descending similarity.
    ranked = similarity.argsort()[::-1]
    # Print exactly n words, skipping the query itself (which typically
    # ranks first); the original `range(n+1)` loop printed n+1 words
    # whenever the query did not land in the top n+1 ranks.
    shown = 0
    for i in ranked:
        word = idx2word[int(i)]
        if word == query:
            continue
        print('{}: {}'.format(word, similarity[int(i)]))
        shown += 1
        if shown == n:
            break
def analogy(a, b, c, word2idx, wordvec, n=5):
    '''analogy

    Predict word relationships of the form a:b = c:? and print the
    top-n candidate words. The query vector is b - a + c; candidates
    are ranked by cosine similarity against it.

    Args:
        a (str): input string
        b (str): input string
        c (str): input string
        word2idx (dict): word to index map
        wordvec (Embedding): vector representation of words
        n (int): top n similar words to show

    Raises:
        Exception: if any of a, b, c is not in the vocabulary
    '''
    # Validate inputs with explicit errors rather than `assert`
    # (asserts are stripped under `python -O`); matches the error
    # style of most_similar.
    for word in (a, b, c):
        if word not in word2idx:
            raise Exception('[*] \'{}\' is unknown.'.format(word))
    print('[*] {}:{} = {}:?'.format(a, b, c))
    idx2word = {v: k for k, v in word2idx.items()}
    a_vec = wordvec(word2idx[a])
    b_vec = wordvec(word2idx[b])
    c_vec = wordvec(word2idx[c])
    query_vec = b_vec - a_vec + c_vec
    # Cosine similarity of every vocabulary vector against the analogy vector.
    similarity = np.array([float(cosine_similarity(wordvec(i), query_vec).data)
                           for i in range(len(idx2word))])
    # Vocabulary indices sorted by descending similarity; print the top n.
    ranked = similarity.argsort()[::-1]
    for i in ranked[:n]:
        print('{}: {}'.format(idx2word[int(i)], similarity[int(i)]))