import gensim.models
import numpy as np
import scipy.optimize
from fractions import Fraction

# Load the pretrained Google News word2vec embeddings (300-dimensional)
word2vec_bin_path = '/Users/ginoprasad/ai_stuff/word2vec/GoogleNews-vectors-negative300.bin'
word2vec = gensim.models.keyedvectors.KeyedVectors.load_word2vec_format(word2vec_bin_path, binary=True)
print(word2vec["hello"][:20])
[-0.05419922 0.01708984 -0.00527954 0.33203125 -0.25 -0.01397705 -0.15039062 -0.265625 0.01647949 0.3828125 -0.03295898 -0.09716797 -0.16308594 -0.04443359 0.00946045 0.18457031 0.03637695 0.16601562 0.36328125 -0.25585938]
query = "What food should I buy tomorrow for dinner"
database = list(map(lambda x: x.replace(".", '').replace(",", ''),
"""
He had concluded that pigs must be able to fly in Hog Heaven.
I want a giraffe, but I'm a turtle eating waffles.
I've never seen a more beautiful brandy glass filled with wine.
The view from the lighthouse excited even the most seasoned traveler.
Eating eggs on Thursday for choir practice was recommended.
The Guinea fowl flies through the air with all the grace of a turtle.
Iguanas were falling out of the trees.
He realized there had been several deaths on this road, but his concern rose when he saw the exact number.
I'm worried by the fact that my daughter looks to the local carpet seller as a role model.
He went on a whiskey diet and immediately lost three days.
She used her own hair in the soup to give it more flavor.
The near death experience brought new ideas to light.
Red is greener than purple, for sure.
I caught my squirrel rustling through my gym bag.
Dinner recipes for clam chowder with a soup base.
At that moment he was not listening to music, he was living an experience.
She could hear him in the shower singing with a joy she hoped he would retain after she delivered the news.
The swirled lollipop had issues with the pop rock candy.
Book of groceries that everyone should purchase.
Guide to cooking with Garam Masala for lunch
""".strip().split('\n')))
def get_document_frequency(word, from_db):
    # Number of database sentences containing the word; subtract one if the
    # sentence being scored is itself part of the database
    return sum([word in sentence for sentence in database]) - from_db

def get_idf(word, from_db):
    # Inverse document frequency (log base 2), smoothed by adding 1 to the document count
    return np.log(len(database) / (get_document_frequency(word, from_db) + 1)) / np.log(2)

def get_term_frequency(word, sentence):
    # Number of times the word occurs in the sentence
    return sum([word == word_ for word_ in sentence.split(' ')])

def get_tf_idf(sentence, from_db, decimals=5):
    arr = []
    for word in sentence.split(' '):
        arr.append(get_term_frequency(word, sentence) * get_idf(word, from_db))
    arr = np.array(arr)
    arr /= arr.sum()
    arr = np.around(arr, decimals)
    # Convert to exact Fractions and absorb the rounding error into the first weight,
    # so the weights sum to exactly 1
    arr = [Fraction(int(x * (10 ** decimals)), (10 ** decimals)) for x in arr]
    arr[0] += 1 - sum(arr)
    return arr

def get_positions_weights(sentence, from_db):
    positions = []
    weights = []
    for word, tf_idf in zip(sentence.split(' '), get_tf_idf(sentence, from_db)):
        # Capitalize the first letter before the embedding lookup
        word = word[0].upper() + word[1:]
        positions.append(word2vec[word])
        weights.append(tf_idf)
    positions, weights = map(np.array, (positions, weights))
    return positions, weights
Graph balances: +query_weights concatenated with -key_weights (query words supply weight, key words absorb it)
Let $x_i$ be a word in the query
Let $y_j$ be a word in the key
Let $b$ be the balance function and $d$ the distance function (here, the Euclidean distance between word embeddings)
Let $f(x_i, y_j)$ be the $b$-flow, i.e. the amount of weight moved from $x_i$ to $y_j$
$\mathrm{cost}(f) = \sum_{i,j} f(x_i, y_j)\, d(x_i, y_j)$
$b(x_i) = \sum_j f(x_i, y_j)$
$b(y_j) = -\sum_i f(x_i, y_j)$
$f(x_i, y_j) \geq 0$ for all $i, j$
The Earth Mover's Distance is the minimum of $\mathrm{cost}(f)$ over all flows $f$ satisfying these balance constraints.
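As a quick illustration of this linear program, here is a minimal sketch on a made-up 2x2 instance. The 1-D "embeddings" and uniform weights below are hypothetical, chosen so the optimum is easy to verify by hand: each half-unit of weight travels a distance of 1, so the minimum cost is 1.0.
import numpy as np
import scipy.optimize

# Toy instance: two 1-D "embeddings" per side, uniform weights (illustrative values only)
toy_query_positions = np.array([[0.0], [4.0]])
toy_key_positions = np.array([[1.0], [5.0]])
toy_query_weights = np.array([0.5, 0.5])
toy_key_weights = np.array([0.5, 0.5])
n, m = len(toy_query_weights), len(toy_key_weights)

# Cost vector: pairwise Euclidean distances, flattened with the query index varying slowest
costs = np.array([np.linalg.norm(q - k) for q in toy_query_positions for k in toy_key_positions])

# Balance constraints: row i fixes the flow leaving query word i,
# row n + j fixes the flow entering key word j
A_eq = np.zeros((n + m, n * m))
for i in range(n):
    A_eq[i, m * i:m * (i + 1)] = 1
for j in range(m):
    A_eq[n + j, j::m] = 1
b_eq = np.concatenate([toy_query_weights, toy_key_weights])

print(scipy.optimize.linprog(costs, A_eq=A_eq, b_eq=b_eq).fun)  # 1.0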
def earth_movers_distance(query_positions, query_weights, key_positions, key_weights):
    assert sum(query_weights) == 1
    assert sum(key_weights) == 1
    # One flow variable per (query word, key word) pair, flattened with the query index varying slowest
    A_eq = np.zeros((len(query_weights)+len(key_weights), len(query_weights)*len(key_weights)))
    b_eq = np.concatenate([query_weights, key_weights])
    # Cost of each unit of flow is the Euclidean distance between the two word embeddings
    distances = np.array([np.linalg.norm(query_word_embedding - key_word_embedding)
                          for query_word_embedding in query_positions
                          for key_word_embedding in key_positions])
    # Row i: total flow leaving query word i equals its weight
    for i, _ in enumerate(query_weights):
        A_eq[i, len(key_weights)*i:len(key_weights)*(i+1)] = 1
    # Row len(query_weights)+j: total flow entering key word j equals its weight
    for j, _ in enumerate(key_weights):
        A_eq[len(query_weights)+j, j::len(key_weights)] = 1
    return scipy.optimize.linprog(distances, A_eq=A_eq, b_eq=b_eq).fun
query_positions, query_weights = get_positions_weights(query, from_db=False)
similarity_scores = []
for key in database:
    key_positions, key_weights = get_positions_weights(key, from_db=True)
    similarity = earth_movers_distance(query_positions, query_weights, key_positions, key_weights)
    similarity_scores.append((similarity, key))

print(f"Query: {query}")
for similarity, key in sorted(similarity_scores)[:3]:
    print(f"Earth Mover's Distance: {similarity}\t{key}")
Query: What food should I buy tomorrow for dinner
Earth Mover's Distance: 3.1012875219130516	Book of groceries that everyone should purchase
Earth Mover's Distance: 3.286581309950352	Dinner recipes for clam chowder with a soup base
Earth Mover's Distance: 3.4387479479813576	Guide to cooking with Garam Masala for lunch