initial commit of project

This commit is contained in:
2021-04-11 19:51:12 +02:00
commit a21a8186d9
110 changed files with 16326178 additions and 0 deletions

View File

@@ -0,0 +1,152 @@
# adapted from:
# Pellegrini., C., E. Özsoy., M. Wintergerst., and G. Groh. (2021).
# “Exploiting Food Embeddings for Ingredient Substitution.”
# In: Proceedings of the 14th International Joint Conference on Biomedical
# Engineering Systems and Technologies - Volume 5: HEALTHINF, INSTICC.
# SciTePress, pp. 6777. isbn: 978-989-758-490-9. doi: 10.5220/0010202000670077.
import json
import pickle
import random
import re
from collections import defaultdict
from pathlib import Path
import numpy as np
import torch
from tqdm import tqdm
from evaluation.helpers.prediction_model import PredictionModel
def _generate_food_sentence_dict(model_path):
    """Collect, for every known ingredient, all instruction sentences that mention it.

    Reads the ingredient vocabulary from ``data/mult_ingredients_nice.json`` and the
    train/test instruction sentences from ``<model_path>training_data.txt`` /
    ``<model_path>testing_data.txt``.

    :param model_path: directory prefix (string, must end with a separator) that
        contains ``training_data.txt`` and ``testing_data.txt``
    :return: dict mapping ingredient token -> list of sentences containing it
    """
    with open('data/mult_ingredients_nice.json', "r") as f:
        food_items = json.load(f)
    food_items_set = set(food_items.keys())

    def _load_filtered_sentences(path):
        # Read one sentence per line and drop overlong sentences
        # (> 100 whitespace-separated tokens), as in the original pipeline.
        with open(path, "r") as f:
            lines = f.read().splitlines()
        return [s for s in lines if len(s.split()) <= 100]

    instruction_sentences = (_load_filtered_sentences(model_path + 'training_data.txt')
                             + _load_filtered_sentences(model_path + 'testing_data.txt'))

    # Raw string fixes the invalid "\w" escape in the original literal; the pattern
    # itself is byte-identical. Compiled once instead of per sentence.
    # NOTE(review): "[^\w]-'" matches a non-word char followed by the literal "-'" —
    # possibly r"[^\w\-']" was intended; kept as-is to preserve behavior. TODO confirm.
    separator_pattern = re.compile(r"[^\w]-'")

    food_to_sentences_dict = defaultdict(list)
    for sentence in instruction_sentences:
        words = separator_pattern.sub(" ", sentence).split()
        for word in words:
            if word in food_items_set:
                food_to_sentences_dict[word].append(sentence)

    return food_to_sentences_dict
def _random_sample_with_min_count(population, k):
if len(population) <= k:
return population
else:
return random.sample(population, k)
def sample_random_sentence_dict(model_path, max_sentence_count):
    """Build the ingredient -> sentences mapping, capped at ``max_sentence_count``
    randomly chosen sentences per ingredient.
    """
    full_sentence_dict = _generate_food_sentence_dict(model_path=model_path)
    sampled = {}
    for food, sentences in full_sentence_dict.items():
        sampled[food] = _random_sample_with_min_count(sentences, max_sentence_count)
    return sampled
def _map_ingredients_to_input_ids(model_path):
    """Map every ingredient token in the vocabulary file to its tokenizer input id.

    :param model_path: path handed to ``PredictionModel`` to load the tokenizer
    :return: dict {ingredient token -> input id}
    """
    with open('data/mult_ingredients_nice.json', "r") as f:
        ingredient_names = json.load(f).keys()
    tokenizer = PredictionModel(model_path).tokenizer
    input_ids = tokenizer.convert_tokens_to_ids(ingredient_names)
    return dict(zip(ingredient_names, input_ids))
def _merge_synonmys(food_to_embeddings_dict, max_sentence_count):
synonmy_replacements_path = Path('foodbert_embeddings/data/synonmy_replacements.json')
synonmy_replacements = {}
merged_dict = defaultdict(list)
# Merge ingredients
for key, value in food_to_embeddings_dict.items():
if key in synonmy_replacements:
key_to_use = synonmy_replacements[key]
else:
key_to_use = key
merged_dict[key_to_use].append(value)
merged_dict = {k: np.concatenate(v) for k, v in merged_dict.items()}
# When embedding count exceeds maximum allowed, reduce back to requested count
for key, value in merged_dict.items():
if len(value) > max_sentence_count:
index = np.random.choice(value.shape[0], max_sentence_count, replace=False)
new_value = value[index]
merged_dict[key] = new_value
return merged_dict
def generate_food_embedding_dict(max_sentence_count, model_path, eval_path='data/eval/', dataset_path="output"):
    '''
    Creates a dict where the keys are the ingredients and the values are a list of embeddings with length max_sentence_count or less if there are less occurences
    These embeddings are used in generate_substitutes.py to predict substitutes

    :param max_sentence_count: maximum number of embeddings kept per ingredient
    :param model_path: path used to load the PredictionModel and its tokenizer
    :param eval_path: directory where the pickled embeddings dict is cached
    :param dataset_path: directory prefix containing training_data.txt / testing_data.txt
    :return: dict {ingredient -> np.ndarray of embeddings, one row per occurrence}
    '''
    # Cached result: if the pickle already exists, load and return it unchanged.
    food_to_embeddings_dict_path = Path(eval_path + 'food_embeddings_dict.pkl')
    if food_to_embeddings_dict_path.exists():
        with food_to_embeddings_dict_path.open('rb') as f:
            food_to_embeddings_dict = pickle.load(f)
        # Retained maintenance code: one-off cleanup of a stale cache after the
        # ingredient vocabulary changed (delete removed ingredients, re-merge
        # synonyms, overwrite the pickle). Kept for reference, normally disabled.
        # # delete keys if we deleted ingredients
        # old_ingredients = set(food_to_embeddings_dict.keys())
        # with open('train_model/vocab/used_ingredients.json', "r") as f:
        #     new_ingredients = set(json.load(f))
        #
        # keys_to_delete = old_ingredients.difference(new_ingredients)
        # for key in keys_to_delete:
        #     food_to_embeddings_dict.pop(key, None)  # delete key if it exists
        #
        # # merge new synonyms
        # food_to_embeddings_dict = _merge_synonmys(food_to_embeddings_dict, max_sentence_count)
        #
        # with food_to_embeddings_dict_path.open('wb') as f:
        #     pickle.dump(food_to_embeddings_dict, f)  # Overwrite dict with cleaned version
        return food_to_embeddings_dict

    # No cache: sample up to max_sentence_count sentences per ingredient.
    print('Sampling Random Sentences')
    food_to_sentences_dict_random_samples = sample_random_sentence_dict(model_path=dataset_path, max_sentence_count=max_sentence_count)
    food_to_embeddings_dict = defaultdict(list)
    print('Mapping Ingredients to Input Ids')
    all_ingredient_ids = _map_ingredients_to_input_ids(model_path=model_path)
    prediction_model = PredictionModel(model_path=model_path)
    for food, sentences in tqdm(food_to_sentences_dict_random_samples.items(), total=len(food_to_sentences_dict_random_samples),
                                desc='Calculating Embeddings for Food items'):
        # Embed all sampled sentences for this ingredient in one batch.
        embeddings, ingredient_ids = prediction_model.predict_embeddings(sentences)
        # get embedding of food word: flatten (batch, seq, dim) to (batch*seq, 768)
        # and select only the token positions whose input id equals this food's id.
        # NOTE(review): assumes a hidden size of 768 — TODO confirm against the model.
        embeddings_flat = embeddings.view((-1, 768))
        ingredient_ids_flat = torch.stack(ingredient_ids).flatten()
        food_id = all_ingredient_ids[food]
        food_embeddings = embeddings_flat[ingredient_ids_flat == food_id].cpu().numpy()
        food_to_embeddings_dict[food].extend(food_embeddings)

    # Stack each ingredient's embeddings into a single (count, 768) array.
    food_to_embeddings_dict = {k: np.stack(v) for k, v in food_to_embeddings_dict.items()}

    # Clean synonmy: merge synonymous ingredients and re-cap at max_sentence_count.
    food_to_embeddings_dict = _merge_synonmys(food_to_embeddings_dict, max_sentence_count)

    # Persist the result so subsequent calls hit the cache branch above.
    with food_to_embeddings_dict_path.open('wb') as f:
        pickle.dump(food_to_embeddings_dict, f)

    return food_to_embeddings_dict