initial commit of project

This commit is contained in:
2021-04-11 19:51:12 +02:00
commit a21a8186d9
110 changed files with 16326178 additions and 0 deletions

View File

@@ -0,0 +1,45 @@
# adapted from:
# Pellegrini., C., E. Özsoy., M. Wintergerst., and G. Groh. (2021).
# “Exploiting Food Embeddings for Ingredient Substitution.”
# In: Proceedings of the 14th International Joint Conference on Biomedical
# Engineering Systems and Technologies - Volume 5: HEALTHINF, INSTICC.
# SciTePress, pp. 67-77. isbn: 978-989-758-490-9. doi: 10.5220/0010202000670077.
from pathlib import Path
import numpy as np
from annoy import AnnoyIndex
from tqdm import tqdm
# Full guide https://github.com/spotify/annoy
class ApproxKNNClassifier:
    """Annoy-backed approximate nearest-neighbour search over ingredient embeddings.

    The index is persisted to ``save_path``: an existing file is mmap-loaded,
    otherwise the index is built from ``all_ingredient_embeddings`` and saved.
    """

    def __init__(self, all_ingredient_embeddings, max_embedding_count,
                 save_path=Path('data/eval/approx_knn_classifier.ann'), n_trees=10):
        embedding_dim = all_ingredient_embeddings.shape[-1]
        self.max_embedding_count = max_embedding_count
        # Angular distance on vectors of length embedding_dim
        self.approx_knn_classifier = AnnoyIndex(embedding_dim, 'angular')
        if save_path.exists():
            print('Loading Existing Approx Classifier')
            # Loading just mmaps the file, so this is very fast
            self.approx_knn_classifier.load(str(save_path))
        else:
            total = len(all_ingredient_embeddings)
            for item_id, embedding in tqdm(enumerate(all_ingredient_embeddings), total=total,
                                           desc='Creating Approx Classifier'):
                self.approx_knn_classifier.add_item(item_id, embedding)
            self.approx_knn_classifier.build(n_trees)
            print('Saving Approx Classifier')
            self.approx_knn_classifier.save(str(save_path))

    def k_nearest_neighbors(self, ingredient_embeddings):
        """Return (distances, indices) arrays of the nearest neighbours per query row.

        Queries ``max_embedding_count + 200`` neighbours so callers can discard
        hits belonging to the query ingredient itself and still have enough left.
        """
        neighbor_distances, neighbor_indices = [], []
        for query_embedding in ingredient_embeddings:
            indices, distances = self.approx_knn_classifier.get_nns_by_vector(
                query_embedding, self.max_embedding_count + 200, include_distances=True)
            neighbor_indices.append(indices)
            neighbor_distances.append(distances)
        return np.stack(neighbor_distances), np.stack(neighbor_indices)

View File

@@ -0,0 +1,152 @@
# adapted from:
# Pellegrini., C., E. Özsoy., M. Wintergerst., and G. Groh. (2021).
# “Exploiting Food Embeddings for Ingredient Substitution.”
# In: Proceedings of the 14th International Joint Conference on Biomedical
# Engineering Systems and Technologies - Volume 5: HEALTHINF, INSTICC.
# SciTePress, pp. 67-77. isbn: 978-989-758-490-9. doi: 10.5220/0010202000670077.
import json
import pickle
import random
import re
from collections import defaultdict
from pathlib import Path
import numpy as np
import torch
from tqdm import tqdm
from evaluation.helpers.prediction_model import PredictionModel
def _generate_food_sentence_dict(model_path):
    """Collect, for every known food item, the instruction sentences mentioning it.

    Reads the ingredient vocabulary from data/mult_ingredients_nice.json and the
    train/test sentence files under ``model_path``, drops sentences longer than
    100 words, and returns a dict mapping food token -> list of sentences in
    which that token occurs.
    """
    with open('data/mult_ingredients_nice.json', "r") as f:
        food_items = json.load(f)
    food_items_set = set(food_items.keys())
    with open(model_path + 'training_data.txt', "r") as f:
        train_instruction_sentences = f.read().splitlines()
    # remove overlong sentences
    train_instruction_sentences = [s for s in train_instruction_sentences if len(s.split()) <= 100]
    with open(model_path + 'testing_data.txt', "r") as f:
        test_instruction_sentences = f.read().splitlines()
    # remove overlong sentences
    test_instruction_sentences = [s for s in test_instruction_sentences if len(s.split()) <= 100]
    instruction_sentences = train_instruction_sentences + test_instruction_sentences
    food_to_sentences_dict = defaultdict(list)
    for sentence in instruction_sentences:
        # BUGFIX: the original pattern "[^\w]-'" matched the literal three-char
        # sequence <non-word>-' rather than the intended character class, so
        # punctuation was almost never stripped and e.g. "tomato," failed the
        # vocabulary lookup. Strip everything except word chars, hyphens and
        # apostrophes (raw string avoids the invalid \w escape warning).
        words = re.sub(r"[^\w\-']", " ", sentence).split()
        for word in words:
            if word in food_items_set:
                food_to_sentences_dict[word].append(sentence)
    return food_to_sentences_dict
def _random_sample_with_min_count(population, k):
if len(population) <= k:
return population
else:
return random.sample(population, k)
def sample_random_sentence_dict(model_path, max_sentence_count):
    """Build the food -> sentences mapping, capping every sentence list at
    ``max_sentence_count`` randomly selected entries."""
    food_to_sentences_dict = _generate_food_sentence_dict(model_path=model_path)
    return {
        food: _random_sample_with_min_count(sentences, max_sentence_count)
        for food, sentences in food_to_sentences_dict.items()
    }
def _map_ingredients_to_input_ids(model_path):
    """Map every known ingredient token to its tokenizer input id.

    Loads the ingredient vocabulary and uses the PredictionModel's tokenizer
    to convert each token; returns {ingredient: input_id}.
    """
    with open('data/mult_ingredients_nice.json', "r") as f:
        ingredients = json.load(f).keys()
    tokenizer = PredictionModel(model_path).tokenizer
    input_ids = tokenizer.convert_tokens_to_ids(ingredients)
    return dict(zip(ingredients, input_ids))
def _merge_synonmys(food_to_embeddings_dict, max_sentence_count):
synonmy_replacements_path = Path('foodbert_embeddings/data/synonmy_replacements.json')
synonmy_replacements = {}
merged_dict = defaultdict(list)
# Merge ingredients
for key, value in food_to_embeddings_dict.items():
if key in synonmy_replacements:
key_to_use = synonmy_replacements[key]
else:
key_to_use = key
merged_dict[key_to_use].append(value)
merged_dict = {k: np.concatenate(v) for k, v in merged_dict.items()}
# When embedding count exceeds maximum allowed, reduce back to requested count
for key, value in merged_dict.items():
if len(value) > max_sentence_count:
index = np.random.choice(value.shape[0], max_sentence_count, replace=False)
new_value = value[index]
merged_dict[key] = new_value
return merged_dict
def generate_food_embedding_dict(max_sentence_count, model_path, eval_path='data/eval/', dataset_path="output"):
    '''
    Creates a dict where the keys are the ingredients and the values are a list of embeddings with length max_sentence_count or less if there are less occurences
    These embeddings are used in generate_substitutes.py to predict substitutes

    The result is cached as a pickle under eval_path; if the cache file exists
    it is loaded and returned unchanged, nothing is recomputed.
    dataset_path points at the directory with training_data.txt/testing_data.txt,
    model_path at the trained BERT model + vocab.
    '''
    food_to_embeddings_dict_path = Path(eval_path + 'food_embeddings_dict.pkl')
    if food_to_embeddings_dict_path.exists():
        with food_to_embeddings_dict_path.open('rb') as f:
            food_to_embeddings_dict = pickle.load(f)
        # # delete keys if we deleted ingredients
        # old_ingredients = set(food_to_embeddings_dict.keys())
        # with open('train_model/vocab/used_ingredients.json', "r") as f:
        #     new_ingredients = set(json.load(f))
        #
        # keys_to_delete = old_ingredients.difference(new_ingredients)
        # for key in keys_to_delete:
        #     food_to_embeddings_dict.pop(key, None)  # delete key if it exists
        #
        # # merge new synonyms
        # food_to_embeddings_dict = _merge_synonmys(food_to_embeddings_dict, max_sentence_count)
        #
        # with food_to_embeddings_dict_path.open('wb') as f:
        #     pickle.dump(food_to_embeddings_dict, f)  # Overwrite dict with cleaned version
        return food_to_embeddings_dict
    print('Sampling Random Sentences')
    food_to_sentences_dict_random_samples = sample_random_sentence_dict(model_path=dataset_path, max_sentence_count=max_sentence_count)
    food_to_embeddings_dict = defaultdict(list)
    print('Mapping Ingredients to Input Ids')
    all_ingredient_ids = _map_ingredients_to_input_ids(model_path=model_path)
    prediction_model = PredictionModel(model_path=model_path)
    for food, sentences in tqdm(food_to_sentences_dict_random_samples.items(), total=len(food_to_sentences_dict_random_samples),
                                desc='Calculating Embeddings for Food items'):
        embeddings, ingredient_ids = prediction_model.predict_embeddings(sentences)
        # get embedding of food word
        # NOTE(review): 768 assumes a BERT-base hidden size — confirm for this model
        embeddings_flat = embeddings.view((-1, 768))
        ingredient_ids_flat = torch.stack(ingredient_ids).flatten()
        food_id = all_ingredient_ids[food]
        # keep only the token positions whose input id equals this food's token id
        food_embeddings = embeddings_flat[ingredient_ids_flat == food_id].cpu().numpy()
        food_to_embeddings_dict[food].extend(food_embeddings)
    food_to_embeddings_dict = {k: np.stack(v) for k, v in food_to_embeddings_dict.items()}
    # Clean synonmy
    food_to_embeddings_dict = _merge_synonmys(food_to_embeddings_dict, max_sentence_count)
    with food_to_embeddings_dict_path.open('wb') as f:
        pickle.dump(food_to_embeddings_dict, f)
    return food_to_embeddings_dict

View File

@@ -0,0 +1,38 @@
# adapted from:
# Pellegrini., C., E. Özsoy., M. Wintergerst., and G. Groh. (2021).
# “Exploiting Food Embeddings for Ingredient Substitution.”
# In: Proceedings of the 14th International Joint Conference on Biomedical
# Engineering Systems and Technologies - Volume 5: HEALTHINF, INSTICC.
# SciTePress, pp. 67-77. isbn: 978-989-758-490-9. doi: 10.5220/0010202000670077.
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset
class InstructionsDataset(Dataset):
    """Tokenized instruction sentences exposed as a single padded id tensor."""

    def __init__(self, tokenizer, sentences):
        self.tokenizer = tokenizer
        encoded = tokenizer.batch_encode_plus(sentences, add_special_tokens=True, max_length=512, truncation=True)
        id_tensors = [torch.tensor(ids) for ids in encoded["input_ids"]]
        self.examples = self._tensorize_batch(id_tensors)

    def _tensorize_batch(self, examples) -> torch.Tensor:
        """Stack equal-length sequences, otherwise right-pad with the pad token id."""
        first_length = examples[0].size(0)
        if all(t.size(0) == first_length for t in examples):
            return torch.stack(examples, dim=0)
        if self.tokenizer._pad_token is None:
            raise ValueError(
                "You are attempting to pad samples but the tokenizer you are using"
                f" ({self.tokenizer.__class__.__name__}) does not have one."
            )
        return pad_sequence(examples, batch_first=True, padding_value=self.tokenizer.pad_token_id)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        return self.examples[i]

View File

@@ -0,0 +1,36 @@
# adapted from:
# Pellegrini., C., E. Özsoy., M. Wintergerst., and G. Groh. (2021).
# “Exploiting Food Embeddings for Ingredient Substitution.”
# In: Proceedings of the 14th International Joint Conference on Biomedical
# Engineering Systems and Technologies - Volume 5: HEALTHINF, INSTICC.
# SciTePress, pp. 67-77. isbn: 978-989-758-490-9. doi: 10.5220/0010202000670077.
from pathlib import Path
import joblib
from sklearn.neighbors import NearestNeighbors
class KNNClassifier:
    """Exact k-nearest-neighbour search over ingredient embeddings, cached via joblib."""

    def __init__(self, all_ingredient_embeddings, max_embedding_count,
                 save_path=Path('data/eval/knn_classifier.joblib')):
        if save_path.exists():
            print('Loading Existing Classifier')
            self.knn_classifier: NearestNeighbors = joblib.load(save_path)
        else:
            print('Training New Classifier')
            # +200 extra neighbours so callers can drop hits that belong to the
            # query ingredient itself and still keep max_embedding_count results
            self.knn_classifier: NearestNeighbors = NearestNeighbors(
                n_neighbors=max_embedding_count + 200, n_jobs=12,
                algorithm='brute')  # kd_tree, ball_tree or brute
            self.knn_classifier.fit(all_ingredient_embeddings)
            print('Saving Classifier')
            joblib.dump(self.knn_classifier, save_path)
        print(f'\nKNN with: {self.knn_classifier._fit_method} and leaf size: {self.knn_classifier.leaf_size}\n')

    def k_nearest_neighbors(self, ingredient_embeddings):
        """Return (distances, indices) of the fitted neighbours for each query row."""
        return self.knn_classifier.kneighbors(ingredient_embeddings, return_distance=True)

View File

@@ -0,0 +1,53 @@
# adapted from:
# Pellegrini., C., E. Özsoy., M. Wintergerst., and G. Groh. (2021).
# “Exploiting Food Embeddings for Ingredient Substitution.”
# In: Proceedings of the 14th International Joint Conference on Biomedical
# Engineering Systems and Technologies - Volume 5: HEALTHINF, INSTICC.
# SciTePress, pp. 67-77. isbn: 978-989-758-490-9. doi: 10.5220/0010202000670077.
import json
import torch
from torch.utils.data import DataLoader
from transformers import BertModel, BertTokenizer
from evaluation.helpers.instructions_dataset import InstructionsDataset
class PredictionModel:
    """Wraps a trained BERT model and computes per-token embeddings for sentences."""

    def __init__(self, model_path=''):
        self.model: BertModel = BertModel.from_pretrained(
            pretrained_model_name_or_path=model_path)
        with open('train_model/vocab/used_ingredients.json', 'r') as f:
            used_ingredients = json.load(f)
        # never_split keeps ingredient tokens intact during tokenization
        self.tokenizer = BertTokenizer(vocab_file='train_model/vocab/bert_vocab.txt', do_lower_case=False,
                                       max_len=512, never_split=used_ingredients, truncation=True)
        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        self.model.to(self.device)

    def predict_embeddings(self, sentences):
        """Return (stacked token embeddings, list of input-id tensors) for the sentences."""
        dataset = InstructionsDataset(tokenizer=self.tokenizer, sentences=sentences)
        loader = DataLoader(dataset, batch_size=100, pin_memory=True)
        collected_embeddings, collected_input_ids = [], []
        for batch in loader:
            batch = batch.to(self.device)
            with torch.no_grad():
                model_output = self.model(batch)
            # model_output[0] is the last hidden state (token embeddings)
            collected_embeddings.extend(model_output[0])
            collected_input_ids.extend(batch)
        return torch.stack(collected_embeddings), collected_input_ids

    def compute_embedding_for_ingredient(self, sentence, ingredient_name):
        """Return the embedding of the first occurrence of ingredient_name in sentence."""
        embeddings, ingredient_ids = self.predict_embeddings([sentence])
        # NOTE(review): 768 assumes a BERT-base hidden size — confirm for this model
        flat_embeddings = embeddings.view((-1, 768))
        flat_input_ids = torch.stack(ingredient_ids).flatten()
        target_id = self.tokenizer.convert_tokens_to_ids(ingredient_name)
        matching = flat_embeddings[flat_input_ids == target_id].cpu().numpy()
        return matching[0]

View File

@@ -0,0 +1,166 @@
import json
# Paths to the German synonym / ground-truth evaluation data
synonyms_path = "data/synonyms.json"
ground_truth_path = "data/ground_truth.json"
# Broad category names (German). Substitutes are only generalized to one of
# these category names when the queried ingredient is itself a category —
# see combined_substitutes_dict below.
category_subs = ["Paprika", "Apfel", "Gouda", "Huhn", "Gans", "Kaninchen", "Kalbfleisch", "Schweinefleisch", "Ente", "Lamm",
                 "Pute", "Wildfleisch", "Rindfleisch", "Speck", "Fisch", "Kohl", "Blattsalat", "Schokolade", "Kuvertüre", "Kuchenglasur",
                 "Honig", "Sirup", "Joghurt", "Essig", "Traubensaft", "Truthahn", "Wein"]
# NOTE(review): hard-coded model-output path, loaded at import time — the file
# must exist relative to the working directory for this module to import
with open("final_Versions/models/vers3/eval/complete_substitute_pairs_50.json") as f:
    sub_dict = json.load(f)
def engl_combined_substitutes_dict(found_substitutes_dict):
    """English variant: drop self/synonym substitutes and map synonyms to base names.

    For every ingredient, removes substitutes that equal the ingredient or are
    one of its synonyms, and replaces each substitute that is a synonym of
    exactly one base ingredient by that base name (ambiguous ones are printed).
    """
    with open("evaluation/engl_data/engl_synonyms.json", "r") as whole_json_file:
        synonyms_dict = json.load(whole_json_file)
    # NOTE(review): ground_truth_dict is loaded but never used below; the read
    # is kept so side effects (and failures on a missing file) stay identical
    with open("evaluation/engl_data/revised_engl_ground_truth.json", "r") as whole_json_file:
        ground_truth_dict = json.load(whole_json_file)
    reversed_synonyms_dict = get_reversed_syn_dict(is_engl=True)
    new_found_sub_dict = {}
    for ingredient, substitutes in found_substitutes_dict.items():
        new_found_sub_dict[ingredient] = []
        cleaned = set()
        for sub in substitutes:
            # a substitute identical to the ingredient is useless
            if sub == ingredient:
                continue
            # a substitute that is merely a synonym of the ingredient is useless
            if ingredient in synonyms_dict and sub in synonyms_dict[ingredient]:
                continue
            if sub in reversed_synonyms_dict:
                bases = reversed_synonyms_dict[sub]
                if len(bases) == 1:
                    # unambiguous: replace the synonym by its base name
                    cleaned.add(bases[0])
                else:
                    print(sub + " is in " + str(bases))
            else:
                cleaned.add(sub)
        new_found_sub_dict[ingredient] += list(cleaned)
    return new_found_sub_dict
# merges substitutes with their synonyms, replaces synonyms with base synonym
def combined_substitutes_dict(found_substitutes_dict):
    """Clean a {ingredient: [substitutes]} mapping against the German synonym data.

    Drops substitutes that equal the ingredient or are synonyms of it, and
    replaces substitutes that are synonyms of other ingredients by their base
    name. Base names in category_subs (broad categories) are only used as
    replacements when the ingredient itself is a category name; synonyms with
    more than two possible base names are printed and left unresolved.
    """
    with open(synonyms_path, "r") as whole_json_file:
        synonyms_dict = json.load(whole_json_file)
    # NOTE(review): ground_truth_dict is loaded but never used in this function
    with open(ground_truth_path, "r") as whole_json_file:
        ground_truth_dict = json.load(whole_json_file)
    reversed_synonyms_dict = get_reversed_syn_dict()
    new_found_sub_dict = {}
    for ingredient in found_substitutes_dict.keys():
        new_found_sub_dict[ingredient] = []
        current_subs = set()
        for sub in found_substitutes_dict[ingredient]:
            # delete substitute if it is the same as the ingredient
            if sub == ingredient:
                # found_substitutes_dict = found_substitutes_dict[ingredient].remove[sub]
                continue
            # delete substitute if it is a synonym of the ingredient
            if ingredient in synonyms_dict.keys() and sub in synonyms_dict[ingredient]:
                # found_substitutes_dict = found_substitutes_dict[ingredient].remove[sub]
                continue
            # if substitute is a synonym of sth
            if sub in reversed_synonyms_dict.keys():
                if len(reversed_synonyms_dict[sub]) == 1:
                    # single base name: use it, unless it is a category name and
                    # the ingredient is not itself a category
                    if reversed_synonyms_dict[sub][0] not in category_subs:
                        current_subs.add(reversed_synonyms_dict[sub][0])
                    else:
                        if ingredient in category_subs:
                            current_subs.add(reversed_synonyms_dict[sub][0])
                        else:
                            current_subs.add(sub)
                elif len(reversed_synonyms_dict[sub]) == 2:
                    # two base names: category ingredients get the category base,
                    # non-category ingredients get the non-category base
                    if ingredient in category_subs:
                        if reversed_synonyms_dict[sub][0] in category_subs and reversed_synonyms_dict[sub][1] not in category_subs:
                            current_subs.add(reversed_synonyms_dict[sub][0])
                        elif reversed_synonyms_dict[sub][1] in category_subs and reversed_synonyms_dict[sub][0] not in category_subs:
                            current_subs.add(reversed_synonyms_dict[sub][1])
                        else:
                            # both or neither base is a category: undecidable, report it
                            print(reversed_synonyms_dict[sub])
                    else:
                        if reversed_synonyms_dict[sub][0] in category_subs and reversed_synonyms_dict[sub][1] not in category_subs:
                            current_subs.add(reversed_synonyms_dict[sub][1])
                        elif reversed_synonyms_dict[sub][1] in category_subs and reversed_synonyms_dict[sub][0] not in category_subs:
                            current_subs.add(reversed_synonyms_dict[sub][0])
                        else:
                            print(reversed_synonyms_dict[sub])
                else:
                    # more than two base names: ambiguous, report only
                    print(sub + " is in " + str(reversed_synonyms_dict[sub]))
            else:
                current_subs.add(sub)
        new_found_sub_dict[ingredient] += list(current_subs)
    return new_found_sub_dict
# combine substitutes found for an ingredient and its synonyms
# also combine synonyms in substitutes
def combine_all_synonyms(found_substitutes_dict):
    """Merge the substitute lists of each ingredient and all its synonyms
    under the (non-category) base name, then clean via combined_substitutes_dict.

    BUGFIX: the original pre-initialised result sets only for ingredients that
    are NOT synonyms, then unioned synonym ingredients into their base name's
    set — raising KeyError whenever the base name was not itself a key of
    found_substitutes_dict. setdefault creates the set on first use instead.
    """
    reversed_synonyms_dict = get_reversed_syn_dict_no_cat()
    new_found_sub_dict = {}
    for ingredient in found_substitutes_dict.keys():
        if ingredient in reversed_synonyms_dict.keys():
            # fold this synonym's substitutes into its base ingredient
            base = reversed_synonyms_dict[ingredient][0]
        else:
            base = ingredient
        new_found_sub_dict.setdefault(base, set()).update(found_substitutes_dict[ingredient])
    new_found_sub_dict_list = {ingredient: list(subs) for ingredient, subs in new_found_sub_dict.items()}
    return combined_substitutes_dict(new_found_sub_dict_list)
def get_reversed_syn_dict(is_engl=False):
    """Invert the synonyms file: map each synonym to the list of base
    ingredients it belongs to (English file when is_engl is True)."""
    if is_engl:
        with open("evaluation/engl_data/engl_synonyms.json", "r") as whole_json_file:
            synonyms_dict = json.load(whole_json_file)
    else:
        with open(synonyms_path, "r") as whole_json_file:
            synonyms_dict = json.load(whole_json_file)
    reversed_synonyms_dict = {}
    for base, syns in synonyms_dict.items():
        for syn in syns:
            reversed_synonyms_dict.setdefault(syn, []).append(base)
    return reversed_synonyms_dict
def get_reversed_syn_dict_no_cat():
    """Like get_reversed_syn_dict, but skips base ingredients that are
    category names (category_subs)."""
    with open(synonyms_path, "r") as whole_json_file:
        synonyms_dict = json.load(whole_json_file)
    reversed_synonyms_dict = {}
    for base, syns in synonyms_dict.items():
        if base in category_subs:
            continue
        for syn in syns:
            reversed_synonyms_dict.setdefault(syn, []).append(base)
    return reversed_synonyms_dict
# Run the cleanup on the loaded substitute pairs at import time. The return
# value is discarded — presumably run for its printed diagnostics; TODO confirm
combined_substitutes_dict(sub_dict)