initial commit of project

This commit is contained in:
2021-04-11 19:51:12 +02:00
commit a21a8186d9
110 changed files with 16326178 additions and 0 deletions

View File

@@ -0,0 +1,45 @@
# adapted from:
# Pellegrini., C., E. Özsoy., M. Wintergerst., and G. Groh. (2021).
# “Exploiting Food Embeddings for Ingredient Substitution.”
# In: Proceedings of the 14th International Joint Conference on Biomedical
# Engineering Systems and Technologies - Volume 5: HEALTHINF, INSTICC.
# SciTePress, pp. 67-77. isbn: 978-989-758-490-9. doi: 10.5220/0010202000670077.
from pathlib import Path
import numpy as np
from annoy import AnnoyIndex
from tqdm import tqdm
# Full guide https://github.com/spotify/annoy
class ApproxKNNClassifier:
    """Annoy-backed approximate nearest-neighbour search over ingredient embeddings.

    The index is persisted to ``save_path``: an existing file is mmap-loaded,
    otherwise the index is built from ``all_ingredient_embeddings`` and saved.
    """

    def __init__(self, all_ingredient_embeddings, max_embedding_count,
                 save_path=Path('data/eval/approx_knn_classifier.ann'), n_trees=10):
        embedding_dim = all_ingredient_embeddings.shape[-1]
        self.max_embedding_count = max_embedding_count
        # Angular distance on vectors of length embedding_dim
        self.approx_knn_classifier = AnnoyIndex(embedding_dim, 'angular')
        if save_path.exists():
            print('Loading Existing Approx Classifier')
            # Loading just mmaps the file, so this is very fast
            self.approx_knn_classifier.load(str(save_path))
        else:
            total = len(all_ingredient_embeddings)
            for item_id, embedding in tqdm(enumerate(all_ingredient_embeddings), total=total,
                                           desc='Creating Approx Classifier'):
                self.approx_knn_classifier.add_item(item_id, embedding)
            self.approx_knn_classifier.build(n_trees)
            print('Saving Approx Classifier')
            self.approx_knn_classifier.save(str(save_path))

    def k_nearest_neighbors(self, ingredient_embeddings):
        """Return (distances, indices) arrays of the nearest neighbours per query row.

        Queries ``max_embedding_count + 200`` neighbours so callers can discard
        hits belonging to the query ingredient itself and still have enough left.
        """
        neighbor_distances, neighbor_indices = [], []
        for query_embedding in ingredient_embeddings:
            indices, distances = self.approx_knn_classifier.get_nns_by_vector(
                query_embedding, self.max_embedding_count + 200, include_distances=True)
            neighbor_indices.append(indices)
            neighbor_distances.append(distances)
        return np.stack(neighbor_distances), np.stack(neighbor_indices)

View File

@@ -0,0 +1,152 @@
# adapted from:
# Pellegrini., C., E. Özsoy., M. Wintergerst., and G. Groh. (2021).
# “Exploiting Food Embeddings for Ingredient Substitution.”
# In: Proceedings of the 14th International Joint Conference on Biomedical
# Engineering Systems and Technologies - Volume 5: HEALTHINF, INSTICC.
# SciTePress, pp. 67-77. isbn: 978-989-758-490-9. doi: 10.5220/0010202000670077.
import json
import pickle
import random
import re
from collections import defaultdict
from pathlib import Path
import numpy as np
import torch
from tqdm import tqdm
from evaluation.helpers.prediction_model import PredictionModel
def _generate_food_sentence_dict(model_path):
    """Collect, for every known food item, the instruction sentences mentioning it.

    Reads the ingredient vocabulary from data/mult_ingredients_nice.json and the
    train/test sentence files under ``model_path``, drops sentences longer than
    100 words, and returns a dict mapping food token -> list of sentences in
    which that token occurs.
    """
    with open('data/mult_ingredients_nice.json', "r") as f:
        food_items = json.load(f)
    food_items_set = set(food_items.keys())
    with open(model_path + 'training_data.txt', "r") as f:
        train_instruction_sentences = f.read().splitlines()
    # remove overlong sentences
    train_instruction_sentences = [s for s in train_instruction_sentences if len(s.split()) <= 100]
    with open(model_path + 'testing_data.txt', "r") as f:
        test_instruction_sentences = f.read().splitlines()
    # remove overlong sentences
    test_instruction_sentences = [s for s in test_instruction_sentences if len(s.split()) <= 100]
    instruction_sentences = train_instruction_sentences + test_instruction_sentences
    food_to_sentences_dict = defaultdict(list)
    for sentence in instruction_sentences:
        # BUGFIX: the original pattern "[^\w]-'" matched the literal three-char
        # sequence <non-word>-' rather than the intended character class, so
        # punctuation was almost never stripped and e.g. "tomato," failed the
        # vocabulary lookup. Strip everything except word chars, hyphens and
        # apostrophes (raw string avoids the invalid \w escape warning).
        words = re.sub(r"[^\w\-']", " ", sentence).split()
        for word in words:
            if word in food_items_set:
                food_to_sentences_dict[word].append(sentence)
    return food_to_sentences_dict
def _random_sample_with_min_count(population, k):
if len(population) <= k:
return population
else:
return random.sample(population, k)
def sample_random_sentence_dict(model_path, max_sentence_count):
    """Build the food -> sentences mapping, capping every sentence list at
    ``max_sentence_count`` randomly selected entries."""
    food_to_sentences_dict = _generate_food_sentence_dict(model_path=model_path)
    return {
        food: _random_sample_with_min_count(sentences, max_sentence_count)
        for food, sentences in food_to_sentences_dict.items()
    }
def _map_ingredients_to_input_ids(model_path):
    """Map every known ingredient token to its tokenizer input id.

    Loads the ingredient vocabulary and uses the PredictionModel's tokenizer
    to convert each token; returns {ingredient: input_id}.
    """
    with open('data/mult_ingredients_nice.json', "r") as f:
        ingredients = json.load(f).keys()
    tokenizer = PredictionModel(model_path).tokenizer
    input_ids = tokenizer.convert_tokens_to_ids(ingredients)
    return dict(zip(ingredients, input_ids))
def _merge_synonmys(food_to_embeddings_dict, max_sentence_count):
synonmy_replacements_path = Path('foodbert_embeddings/data/synonmy_replacements.json')
synonmy_replacements = {}
merged_dict = defaultdict(list)
# Merge ingredients
for key, value in food_to_embeddings_dict.items():
if key in synonmy_replacements:
key_to_use = synonmy_replacements[key]
else:
key_to_use = key
merged_dict[key_to_use].append(value)
merged_dict = {k: np.concatenate(v) for k, v in merged_dict.items()}
# When embedding count exceeds maximum allowed, reduce back to requested count
for key, value in merged_dict.items():
if len(value) > max_sentence_count:
index = np.random.choice(value.shape[0], max_sentence_count, replace=False)
new_value = value[index]
merged_dict[key] = new_value
return merged_dict
def generate_food_embedding_dict(max_sentence_count, model_path, eval_path='data/eval/', dataset_path="output"):
    '''
    Creates a dict where the keys are the ingredients and the values are a list of embeddings with length max_sentence_count or less if there are less occurences
    These embeddings are used in generate_substitutes.py to predict substitutes

    The result is cached as a pickle under eval_path; if the cache file exists
    it is loaded and returned unchanged, nothing is recomputed.
    dataset_path points at the directory with training_data.txt/testing_data.txt,
    model_path at the trained BERT model + vocab.
    '''
    food_to_embeddings_dict_path = Path(eval_path + 'food_embeddings_dict.pkl')
    if food_to_embeddings_dict_path.exists():
        with food_to_embeddings_dict_path.open('rb') as f:
            food_to_embeddings_dict = pickle.load(f)
        # # delete keys if we deleted ingredients
        # old_ingredients = set(food_to_embeddings_dict.keys())
        # with open('train_model/vocab/used_ingredients.json', "r") as f:
        #     new_ingredients = set(json.load(f))
        #
        # keys_to_delete = old_ingredients.difference(new_ingredients)
        # for key in keys_to_delete:
        #     food_to_embeddings_dict.pop(key, None)  # delete key if it exists
        #
        # # merge new synonyms
        # food_to_embeddings_dict = _merge_synonmys(food_to_embeddings_dict, max_sentence_count)
        #
        # with food_to_embeddings_dict_path.open('wb') as f:
        #     pickle.dump(food_to_embeddings_dict, f)  # Overwrite dict with cleaned version
        return food_to_embeddings_dict
    print('Sampling Random Sentences')
    food_to_sentences_dict_random_samples = sample_random_sentence_dict(model_path=dataset_path, max_sentence_count=max_sentence_count)
    food_to_embeddings_dict = defaultdict(list)
    print('Mapping Ingredients to Input Ids')
    all_ingredient_ids = _map_ingredients_to_input_ids(model_path=model_path)
    prediction_model = PredictionModel(model_path=model_path)
    for food, sentences in tqdm(food_to_sentences_dict_random_samples.items(), total=len(food_to_sentences_dict_random_samples),
                                desc='Calculating Embeddings for Food items'):
        embeddings, ingredient_ids = prediction_model.predict_embeddings(sentences)
        # get embedding of food word
        # NOTE(review): 768 assumes a BERT-base hidden size — confirm for this model
        embeddings_flat = embeddings.view((-1, 768))
        ingredient_ids_flat = torch.stack(ingredient_ids).flatten()
        food_id = all_ingredient_ids[food]
        # keep only the token positions whose input id equals this food's token id
        food_embeddings = embeddings_flat[ingredient_ids_flat == food_id].cpu().numpy()
        food_to_embeddings_dict[food].extend(food_embeddings)
    food_to_embeddings_dict = {k: np.stack(v) for k, v in food_to_embeddings_dict.items()}
    # Clean synonmy
    food_to_embeddings_dict = _merge_synonmys(food_to_embeddings_dict, max_sentence_count)
    with food_to_embeddings_dict_path.open('wb') as f:
        pickle.dump(food_to_embeddings_dict, f)
    return food_to_embeddings_dict

View File

@@ -0,0 +1,38 @@
# adapted from:
# Pellegrini., C., E. Özsoy., M. Wintergerst., and G. Groh. (2021).
# “Exploiting Food Embeddings for Ingredient Substitution.”
# In: Proceedings of the 14th International Joint Conference on Biomedical
# Engineering Systems and Technologies - Volume 5: HEALTHINF, INSTICC.
# SciTePress, pp. 67-77. isbn: 978-989-758-490-9. doi: 10.5220/0010202000670077.
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset
class InstructionsDataset(Dataset):
    """Tokenized instruction sentences exposed as a single padded id tensor."""

    def __init__(self, tokenizer, sentences):
        self.tokenizer = tokenizer
        encoded = tokenizer.batch_encode_plus(sentences, add_special_tokens=True, max_length=512, truncation=True)
        id_tensors = [torch.tensor(ids) for ids in encoded["input_ids"]]
        self.examples = self._tensorize_batch(id_tensors)

    def _tensorize_batch(self, examples) -> torch.Tensor:
        """Stack equal-length sequences, otherwise right-pad with the pad token id."""
        first_length = examples[0].size(0)
        if all(t.size(0) == first_length for t in examples):
            return torch.stack(examples, dim=0)
        if self.tokenizer._pad_token is None:
            raise ValueError(
                "You are attempting to pad samples but the tokenizer you are using"
                f" ({self.tokenizer.__class__.__name__}) does not have one."
            )
        return pad_sequence(examples, batch_first=True, padding_value=self.tokenizer.pad_token_id)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        return self.examples[i]

View File

@@ -0,0 +1,36 @@
# adapted from:
# Pellegrini., C., E. Özsoy., M. Wintergerst., and G. Groh. (2021).
# “Exploiting Food Embeddings for Ingredient Substitution.”
# In: Proceedings of the 14th International Joint Conference on Biomedical
# Engineering Systems and Technologies - Volume 5: HEALTHINF, INSTICC.
# SciTePress, pp. 67-77. isbn: 978-989-758-490-9. doi: 10.5220/0010202000670077.
from pathlib import Path
import joblib
from sklearn.neighbors import NearestNeighbors
class KNNClassifier:
    """Exact k-nearest-neighbour search over ingredient embeddings, cached via joblib."""

    def __init__(self, all_ingredient_embeddings, max_embedding_count,
                 save_path=Path('data/eval/knn_classifier.joblib')):
        if save_path.exists():
            print('Loading Existing Classifier')
            self.knn_classifier: NearestNeighbors = joblib.load(save_path)
        else:
            print('Training New Classifier')
            # +200 extra neighbours so callers can drop hits that belong to the
            # query ingredient itself and still keep max_embedding_count results
            self.knn_classifier: NearestNeighbors = NearestNeighbors(
                n_neighbors=max_embedding_count + 200, n_jobs=12,
                algorithm='brute')  # kd_tree, ball_tree or brute
            self.knn_classifier.fit(all_ingredient_embeddings)
            print('Saving Classifier')
            joblib.dump(self.knn_classifier, save_path)
        print(f'\nKNN with: {self.knn_classifier._fit_method} and leaf size: {self.knn_classifier.leaf_size}\n')

    def k_nearest_neighbors(self, ingredient_embeddings):
        """Return (distances, indices) of the fitted neighbours for each query row."""
        return self.knn_classifier.kneighbors(ingredient_embeddings, return_distance=True)

View File

@@ -0,0 +1,53 @@
# adapted from:
# Pellegrini., C., E. Özsoy., M. Wintergerst., and G. Groh. (2021).
# “Exploiting Food Embeddings for Ingredient Substitution.”
# In: Proceedings of the 14th International Joint Conference on Biomedical
# Engineering Systems and Technologies - Volume 5: HEALTHINF, INSTICC.
# SciTePress, pp. 67-77. isbn: 978-989-758-490-9. doi: 10.5220/0010202000670077.
import json
import torch
from torch.utils.data import DataLoader
from transformers import BertModel, BertTokenizer
from evaluation.helpers.instructions_dataset import InstructionsDataset
class PredictionModel:
    """Wraps a trained BERT model and computes per-token embeddings for sentences."""

    def __init__(self, model_path=''):
        self.model: BertModel = BertModel.from_pretrained(
            pretrained_model_name_or_path=model_path)
        with open('train_model/vocab/used_ingredients.json', 'r') as f:
            used_ingredients = json.load(f)
        # never_split keeps ingredient tokens intact during tokenization
        self.tokenizer = BertTokenizer(vocab_file='train_model/vocab/bert_vocab.txt', do_lower_case=False,
                                       max_len=512, never_split=used_ingredients, truncation=True)
        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        self.model.to(self.device)

    def predict_embeddings(self, sentences):
        """Return (stacked token embeddings, list of input-id tensors) for the sentences."""
        dataset = InstructionsDataset(tokenizer=self.tokenizer, sentences=sentences)
        loader = DataLoader(dataset, batch_size=100, pin_memory=True)
        collected_embeddings, collected_input_ids = [], []
        for batch in loader:
            batch = batch.to(self.device)
            with torch.no_grad():
                model_output = self.model(batch)
            # model_output[0] is the last hidden state (token embeddings)
            collected_embeddings.extend(model_output[0])
            collected_input_ids.extend(batch)
        return torch.stack(collected_embeddings), collected_input_ids

    def compute_embedding_for_ingredient(self, sentence, ingredient_name):
        """Return the embedding of the first occurrence of ingredient_name in sentence."""
        embeddings, ingredient_ids = self.predict_embeddings([sentence])
        # NOTE(review): 768 assumes a BERT-base hidden size — confirm for this model
        flat_embeddings = embeddings.view((-1, 768))
        flat_input_ids = torch.stack(ingredient_ids).flatten()
        target_id = self.tokenizer.convert_tokens_to_ids(ingredient_name)
        matching = flat_embeddings[flat_input_ids == target_id].cpu().numpy()
        return matching[0]

View File

@@ -0,0 +1,166 @@
import json
# Paths to the German synonym / ground-truth evaluation data
synonyms_path = "data/synonyms.json"
ground_truth_path = "data/ground_truth.json"
# Broad category names (German). Substitutes are only generalized to one of
# these category names when the queried ingredient is itself a category —
# see combined_substitutes_dict below.
category_subs = ["Paprika", "Apfel", "Gouda", "Huhn", "Gans", "Kaninchen", "Kalbfleisch", "Schweinefleisch", "Ente", "Lamm",
                 "Pute", "Wildfleisch", "Rindfleisch", "Speck", "Fisch", "Kohl", "Blattsalat", "Schokolade", "Kuvertüre", "Kuchenglasur",
                 "Honig", "Sirup", "Joghurt", "Essig", "Traubensaft", "Truthahn", "Wein"]
# NOTE(review): hard-coded model-output path, loaded at import time — the file
# must exist relative to the working directory for this module to import
with open("final_Versions/models/vers3/eval/complete_substitute_pairs_50.json") as f:
    sub_dict = json.load(f)
def engl_combined_substitutes_dict(found_substitutes_dict):
    """English variant: drop self/synonym substitutes and map synonyms to base names.

    For every ingredient, removes substitutes that equal the ingredient or are
    one of its synonyms, and replaces each substitute that is a synonym of
    exactly one base ingredient by that base name (ambiguous ones are printed).
    """
    with open("evaluation/engl_data/engl_synonyms.json", "r") as whole_json_file:
        synonyms_dict = json.load(whole_json_file)
    # NOTE(review): ground_truth_dict is loaded but never used below; the read
    # is kept so side effects (and failures on a missing file) stay identical
    with open("evaluation/engl_data/revised_engl_ground_truth.json", "r") as whole_json_file:
        ground_truth_dict = json.load(whole_json_file)
    reversed_synonyms_dict = get_reversed_syn_dict(is_engl=True)
    new_found_sub_dict = {}
    for ingredient, substitutes in found_substitutes_dict.items():
        new_found_sub_dict[ingredient] = []
        cleaned = set()
        for sub in substitutes:
            # a substitute identical to the ingredient is useless
            if sub == ingredient:
                continue
            # a substitute that is merely a synonym of the ingredient is useless
            if ingredient in synonyms_dict and sub in synonyms_dict[ingredient]:
                continue
            if sub in reversed_synonyms_dict:
                bases = reversed_synonyms_dict[sub]
                if len(bases) == 1:
                    # unambiguous: replace the synonym by its base name
                    cleaned.add(bases[0])
                else:
                    print(sub + " is in " + str(bases))
            else:
                cleaned.add(sub)
        new_found_sub_dict[ingredient] += list(cleaned)
    return new_found_sub_dict
# merges substitutes with their synonyms, replaces synonyms with base synonym
def combined_substitutes_dict(found_substitutes_dict):
    """Clean a {ingredient: [substitutes]} mapping against the German synonym data.

    Drops substitutes that equal the ingredient or are synonyms of it, and
    replaces substitutes that are synonyms of other ingredients by their base
    name. Base names in category_subs (broad categories) are only used as
    replacements when the ingredient itself is a category name; synonyms with
    more than two possible base names are printed and left unresolved.
    """
    with open(synonyms_path, "r") as whole_json_file:
        synonyms_dict = json.load(whole_json_file)
    # NOTE(review): ground_truth_dict is loaded but never used in this function
    with open(ground_truth_path, "r") as whole_json_file:
        ground_truth_dict = json.load(whole_json_file)
    reversed_synonyms_dict = get_reversed_syn_dict()
    new_found_sub_dict = {}
    for ingredient in found_substitutes_dict.keys():
        new_found_sub_dict[ingredient] = []
        current_subs = set()
        for sub in found_substitutes_dict[ingredient]:
            # delete substitute if it is the same as the ingredient
            if sub == ingredient:
                # found_substitutes_dict = found_substitutes_dict[ingredient].remove[sub]
                continue
            # delete substitute if it is a synonym of the ingredient
            if ingredient in synonyms_dict.keys() and sub in synonyms_dict[ingredient]:
                # found_substitutes_dict = found_substitutes_dict[ingredient].remove[sub]
                continue
            # if substitute is a synonym of sth
            if sub in reversed_synonyms_dict.keys():
                if len(reversed_synonyms_dict[sub]) == 1:
                    # single base name: use it, unless it is a category name and
                    # the ingredient is not itself a category
                    if reversed_synonyms_dict[sub][0] not in category_subs:
                        current_subs.add(reversed_synonyms_dict[sub][0])
                    else:
                        if ingredient in category_subs:
                            current_subs.add(reversed_synonyms_dict[sub][0])
                        else:
                            current_subs.add(sub)
                elif len(reversed_synonyms_dict[sub]) == 2:
                    # two base names: category ingredients get the category base,
                    # non-category ingredients get the non-category base
                    if ingredient in category_subs:
                        if reversed_synonyms_dict[sub][0] in category_subs and reversed_synonyms_dict[sub][1] not in category_subs:
                            current_subs.add(reversed_synonyms_dict[sub][0])
                        elif reversed_synonyms_dict[sub][1] in category_subs and reversed_synonyms_dict[sub][0] not in category_subs:
                            current_subs.add(reversed_synonyms_dict[sub][1])
                        else:
                            # both or neither base is a category: undecidable, report it
                            print(reversed_synonyms_dict[sub])
                    else:
                        if reversed_synonyms_dict[sub][0] in category_subs and reversed_synonyms_dict[sub][1] not in category_subs:
                            current_subs.add(reversed_synonyms_dict[sub][1])
                        elif reversed_synonyms_dict[sub][1] in category_subs and reversed_synonyms_dict[sub][0] not in category_subs:
                            current_subs.add(reversed_synonyms_dict[sub][0])
                        else:
                            print(reversed_synonyms_dict[sub])
                else:
                    # more than two base names: ambiguous, report only
                    print(sub + " is in " + str(reversed_synonyms_dict[sub]))
            else:
                current_subs.add(sub)
        new_found_sub_dict[ingredient] += list(current_subs)
    return new_found_sub_dict
# combine substitutes found for an ingredient and its synonyms
# also combine synonyms in substitutes
def combine_all_synonyms(found_substitutes_dict):
    """Merge the substitute lists of each ingredient and all its synonyms
    under the (non-category) base name, then clean via combined_substitutes_dict.

    BUGFIX: the original pre-initialised result sets only for ingredients that
    are NOT synonyms, then unioned synonym ingredients into their base name's
    set — raising KeyError whenever the base name was not itself a key of
    found_substitutes_dict. setdefault creates the set on first use instead.
    """
    reversed_synonyms_dict = get_reversed_syn_dict_no_cat()
    new_found_sub_dict = {}
    for ingredient in found_substitutes_dict.keys():
        if ingredient in reversed_synonyms_dict.keys():
            # fold this synonym's substitutes into its base ingredient
            base = reversed_synonyms_dict[ingredient][0]
        else:
            base = ingredient
        new_found_sub_dict.setdefault(base, set()).update(found_substitutes_dict[ingredient])
    new_found_sub_dict_list = {ingredient: list(subs) for ingredient, subs in new_found_sub_dict.items()}
    return combined_substitutes_dict(new_found_sub_dict_list)
def get_reversed_syn_dict(is_engl=False):
    """Invert the synonyms file: map each synonym to the list of base
    ingredients it belongs to (English file when is_engl is True)."""
    if is_engl:
        with open("evaluation/engl_data/engl_synonyms.json", "r") as whole_json_file:
            synonyms_dict = json.load(whole_json_file)
    else:
        with open(synonyms_path, "r") as whole_json_file:
            synonyms_dict = json.load(whole_json_file)
    reversed_synonyms_dict = {}
    for base, syns in synonyms_dict.items():
        for syn in syns:
            reversed_synonyms_dict.setdefault(syn, []).append(base)
    return reversed_synonyms_dict
def get_reversed_syn_dict_no_cat():
    """Like get_reversed_syn_dict, but skips base ingredients that are
    category names (category_subs)."""
    with open(synonyms_path, "r") as whole_json_file:
        synonyms_dict = json.load(whole_json_file)
    reversed_synonyms_dict = {}
    for base, syns in synonyms_dict.items():
        if base in category_subs:
            continue
        for syn in syns:
            reversed_synonyms_dict.setdefault(syn, []).append(base)
    return reversed_synonyms_dict
# Run the cleanup on the loaded substitute pairs at import time. The return
# value is discarded — presumably run for its printed diagnostics; TODO confirm
combined_substitutes_dict(sub_dict)