initial commit of project

This commit is contained in:
2021-04-11 19:51:12 +02:00
commit a21a8186d9
110 changed files with 16326178 additions and 0 deletions

0
evaluation/README.md Normal file
View File

View File

@@ -0,0 +1,29 @@
import json


def main(eval_path="final_Versions/models/vers2/eval/",
         file_name="substitute_pairs_65.json",
         data_path="data/",
         occurances_path="mult_ingredients_nice.json"):
    """Complete the model's substitute file so it covers every known ingredient.

    Reads the model-produced substitute pairs (``eval_path + file_name``) and
    the ingredient-occurrence table (``data_path + occurances_path``), then
    writes ``eval_path + "complete_" + file_name`` containing one entry per
    ingredient in the occurrence table: the model's substitute list where one
    exists, otherwise an empty list.

    All parameters default to the original hard-coded locations, so existing
    callers (``main()``) behave exactly as before.
    """
    with open(eval_path + file_name, "r") as whole_json_file:
        model_substitutes_dict = json.load(whole_json_file)
    with open(data_path + occurances_path, "r") as whole_json_file:
        occurrences_dict = json.load(whole_json_file)

    all_substitutes = {}
    for ingredient in occurrences_dict:
        # default to "no known substitutes" when the model produced none
        all_substitutes[ingredient] = model_substitutes_dict.get(ingredient, [])

    print(str(len(all_substitutes.keys())))  # number of ingredients written
    out_path = eval_path + "complete_" + file_name
    with open(out_path, 'w') as f:
        json.dump(all_substitutes, f, ensure_ascii=False, indent=4)


if __name__ == "__main__":  # guard so the module can be imported without side effects
    main()

110
evaluation/dataset_stats.py Normal file
View File

@@ -0,0 +1,110 @@
import json
import statistics
def dataset(full_dataset_path):
    """Print summary statistics for the scraped recipe dataset.

    Reports, over all recipes in the JSON file at ``full_dataset_path``:
    ingredient count (total/mean/median), how many recipes have a non-default
    picture, comment counts (total/mean/median plus recipes without comments)
    and instruction counts (total/mean/median).

    Args:
        full_dataset_path: path to a JSON object mapping recipe URL -> recipe
            dict with keys 'ingredients', 'image', 'comments', 'instructions'.
    """
    with open(full_dataset_path, "r") as whole_json_file:
        full_dataset = json.load(whole_json_file)

    ingredient_lengths = []
    comment_lengths = []
    instruction_lengths = []
    pic_counter = 0
    no_comments = 0
    counter = 0
    for recipe in full_dataset.values():
        ingredient_lengths.append(len(recipe['ingredients']))
        # the default "nopicture" placeholder counts as "no picture"
        if recipe['image'] != "https://img.chefkoch-cdn.de/img/default/layout/recipe-nopicture.jpg":
            pic_counter += 1
        if recipe['comments']:
            comment_lengths.append(len(recipe['comments']))
        else:
            comment_lengths.append(0)
            no_comments += 1
        instruction_lengths.append(len(recipe['instructions']))
        counter += 1
        print(counter)  # progress output, one line per recipe

    # totals derived from the per-recipe lengths (original kept parallel counters)
    ingredient_counter = sum(ingredient_lengths)
    comment_counter = sum(comment_lengths)
    instruction_counter = sum(instruction_lengths)
    n_recipes = len(full_dataset.keys())

    print("number of recipes: " + str(n_recipes))
    print("\n")
    print("average ingredient count: " + str(ingredient_counter / n_recipes))
    print("median ingredient count: " + str(statistics.median(ingredient_lengths)))
    print("\n")
    print("number of recipes with picture: " + str(pic_counter))
    print("\n")
    print("number of comments: " + str(comment_counter))
    print("number of recipes withOUT comments: " + str(no_comments))
    print("average amount of comments: " + str(comment_counter / n_recipes))
    print("median comment count: " + str(statistics.median(comment_lengths)))
    print("\n")
    print("total instruction count: " + str(instruction_counter))
    print("average instruction count: " + str(instruction_counter / n_recipes))
    print("median instruction count: " + str(statistics.median(instruction_lengths)))
def ingredients_before(full_dataset_path):
    """Print the number of distinct ingredient strings in the raw dataset.

    Args:
        full_dataset_path: path to a JSON object mapping recipe URL -> recipe
            dict with an 'ingredients' list of strings.
    """
    with open(full_dataset_path, "r") as whole_json_file:
        full_dataset = json.load(whole_json_file)
    # a set gives O(1) membership instead of the original O(n) list scan
    all_ingredients = set()
    counter = 0
    for url in full_dataset:
        counter += 1
        print(counter)  # progress output, one line per recipe
        all_ingredients.update(full_dataset[url]['ingredients'])
    print(str(len(all_ingredients)))
def ingredient_stats(ingredients_list_path="data/mult_ingredients_nice.json",
                     ingredients_instructions_path="data/cleaned_steps_occurrance.json"):
    """Print mean/median occurrence statistics for ingredients.

    Compares how often each ingredient appears in recipe ingredient lists
    versus in instruction texts, and finally prints the instruction counts
    sorted ascending.

    Args:
        ingredients_list_path: JSON mapping ingredient -> occurrence count in
            ingredient lists (defaults to the original hard-coded path).
        ingredients_instructions_path: JSON mapping ingredient -> occurrence
            count in instruction texts (defaults to the original path).
    """
    with open(ingredients_list_path, "r") as whole_json_file:
        ingred_lists = json.load(whole_json_file)
    with open(ingredients_instructions_path, "r") as whole_json_file:
        ingred_instruct = json.load(whole_json_file)

    print("in ingredient lists: ")
    ingred_counts = list(ingred_lists.values())
    print("average: " + str(sum(ingred_counts) / len(ingred_lists.keys())))
    print("median: " + str(statistics.median(ingred_counts)))

    print("in instructions: ")
    instruct_counts = list(ingred_instruct.values())
    # NOTE: despite the label "nones" below, this counts ingredients that
    # appear FEWER THAN 5 times in instructions, not exactly zero times.
    none_counts = sum(1 for count in instruct_counts if count < 5)
    print("average: " + str(sum(instruct_counts) / len(ingred_instruct.keys())))
    print("median: " + str(statistics.median(instruct_counts)))
    print("nones: " + str(none_counts))

    # dump the full table sorted by instruction-occurrence count, ascending
    sorted_instruct = dict(sorted(ingred_instruct.items(), key=lambda item: item[1]))
    print(sorted_instruct)
def main():
    """Entry point: run the statistics report selected below.

    The other reports are kept (commented out) so they can be re-enabled by
    uncommenting the relevant call.
    """
    before_dataset_path = "data/dataset_fin.json"  # raw dataset, pre-cleaning
    full_dataset_path = "Versions/vers3/full_dataset.json"  # final dataset
    # dataset(full_dataset_path)
    # ingredients_before(before_dataset_path)
    ingredient_stats()


if __name__ == "__main__":  # guard so the module can be imported without side effects
    main()

View File

@@ -0,0 +1,216 @@
{
"carrot": [
"parsnip",
"daikon",
"turnip",
"celery",
"squash",
"celery root",
"sweet potato",
"yam",
"radish",
"potato",
"pumpkin",
"green papaya",
"swede",
"beet",
"rutabaga",
"red bell pepper",
"yellow squash",
"butternut squash",
"root vegetable",
"parsley root"
],
"cherry": [
"acerola",
"apricot",
"plum",
"nectarine",
"raspberry",
"grape",
"strawberry",
"currant",
"blackberry",
"frozen mixed berry",
"peach",
"cranberry",
"dried cranberry",
"blueberry",
"maraschino",
"berry",
"prune"
],
"chicken": [
"turkey",
"rabbit",
"oyster mushroom",
"squab",
"veal",
"fish",
"tofu",
"beef",
"extra firm tofu",
"pork",
"seitan",
"duck",
"capon",
"lamb",
"venison",
"mushroom",
"shrimp",
"quail",
"goose"
],
"parsley": [
"chervil",
"cilantro",
"tarragon",
"basil",
"oregano",
"chopped cilantro",
"lovage",
"dill",
"fresh coriander",
"coriander",
"rosemary",
"caper",
"fresh cilantro",
"fresh dill",
"thyme",
"fresh oregano",
"chive",
"mint",
"fresh basil",
"fresh thyme",
"dried basil",
"dried oregano",
"fresh chive",
"dried thyme"
],
"chocolate": [
"truffle",
"nutella",
"ganache",
"cocoa powder",
"sugar",
"jam",
"marshmallow",
"cocoa",
"candy",
"caramel",
"peanut butter"
],
"bacon": [
"pancetta",
"prosciutto",
"speck",
"smoked sausage",
"smoked ham",
"parma ham",
"ham",
"salami",
"pepperoni",
"guanciale",
"chorizo",
"salt pork",
"kielbasa",
"pork rind",
"cubed ham",
"italian sausage",
"crouton",
"capicola",
"hard salami",
"lardon",
"cooked ham",
"corned beef",
"bologna"
],
"kale": [
"collard green",
"turnip green",
"spinach",
"chinese cabbage",
"leek",
"escarole",
"spring green",
"chard",
"green cabbage",
"savoy cabbage",
"cabbage",
"cauliflower",
"collard",
"watercres",
"arugula",
"broccoli rabe",
"spinach leaves",
"lettuce",
"romaine lettuce",
"baby spinach",
"mizuna"
],
"sugar": [
"splenda",
"honey",
"stevia",
"sweetener",
"liquid stevia",
"corn syrup",
"splenda granular",
"liquid sweetener",
"brown rice syrup",
"turbinado",
"maple syrup",
"pure maple syrup",
"jaggery",
"sweetened condensed milk",
"artificial sweetener",
"agave nectar",
"sweet chocolate",
"chocolate",
"caramel",
"vanilla",
"molasse",
"golden syrup",
"syrup"
],
"brie": [
"camembert",
"reblochon",
"gorgonzola",
"cheese spread",
"cheddar",
"goat cheese",
"havarti",
"boursin",
"blue cheese",
"roquefort",
"monterey jack",
"gouda",
"fontina",
"provolone cheese",
"stilton",
"feta",
"processed cheese"
],
"turkey": [
"chicken",
"rabbit",
"duck",
"ham",
"pheasant",
"goose",
"capon",
"beef",
"venison",
"lamb",
"pork",
"hen",
"roast beef",
"veal",
"poultry",
"chicken breast",
"chicken thigh",
"quail",
"pork chop"
]
}

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,14 @@
{
"squash": ["pumpkin"],
"sweet potato": ["yam"],
"cilantro": ["coriander"],
"fresh cilantro": ["chopped cilantro", "fresh coriander"],
"dill": ["fresh dill"],
"oregano": ["fresh oregano"],
"basil": ["fresh basil"],
"thyme": ["fresh thyme"],
"chive": ["fresh chive"],
"cabbage":["collard"],
"maple syrup": ["pure maple syrup"],
"sweetener": ["artificial sweetener"]
}

View File

@@ -0,0 +1,203 @@
{
"carrot": [
"parsnip",
"daikon",
"turnip",
"celery",
"squash",
"celery root",
"sweet potato",
"radish",
"potato",
"green papaya",
"swede",
"beet",
"rutabaga",
"red bell pepper",
"yellow squash",
"butternut squash",
"root vegetable",
"parsley root"
],
"cherry": [
"acerola",
"apricot",
"plum",
"nectarine",
"raspberry",
"grape",
"strawberry",
"currant",
"blackberry",
"frozen mixed berry",
"peach",
"cranberry",
"dried cranberry",
"blueberry",
"maraschino",
"berry",
"prune"
],
"chicken": [
"turkey",
"rabbit",
"oyster mushroom",
"squab",
"veal",
"fish",
"tofu",
"beef",
"extra firm tofu",
"pork",
"seitan",
"duck",
"capon",
"lamb",
"venison",
"mushroom",
"shrimp",
"quail",
"goose"
],
"parsley": [
"chervil",
"cilantro",
"tarragon",
"basil",
"oregano",
"lovage",
"dill",
"rosemary",
"caper",
"fresh cilantro",
"thyme",
"chive",
"mint",
"dried basil",
"dried oregano",
"dried thyme"
],
"chocolate": [
"truffle",
"nutella",
"ganache",
"cocoa powder",
"sugar",
"jam",
"marshmallow",
"cocoa",
"candy",
"caramel",
"peanut butter"
],
"bacon": [
"pancetta",
"prosciutto",
"speck",
"smoked sausage",
"smoked ham",
"parma ham",
"ham",
"salami",
"pepperoni",
"guanciale",
"chorizo",
"salt pork",
"kielbasa",
"pork rind",
"cubed ham",
"italian sausage",
"crouton",
"capicola",
"hard salami",
"lardon",
"cooked ham",
"corned beef",
"bologna"
],
"kale": [
"collard green",
"turnip green",
"spinach",
"chinese cabbage",
"leek",
"escarole",
"spring green",
"chard",
"green cabbage",
"savoy cabbage",
"cabbage",
"cauliflower",
"watercres",
"arugula",
"broccoli rabe",
"spinach leaves",
"lettuce",
"romaine lettuce",
"baby spinach",
"mizuna"
],
"sugar": [
"splenda",
"honey",
"stevia",
"sweetener",
"liquid stevia",
"corn syrup",
"splenda granular",
"liquid sweetener",
"brown rice syrup",
"turbinado",
"maple syrup",
"jaggery",
"sweetened condensed milk",
"agave nectar",
"sweet chocolate",
"chocolate",
"caramel",
"vanilla",
"molasse",
"golden syrup",
"syrup"
],
"brie": [
"camembert",
"reblochon",
"gorgonzola",
"cheese spread",
"cheddar",
"goat cheese",
"havarti",
"boursin",
"blue cheese",
"roquefort",
"monterey jack",
"gouda",
"fontina",
"provolone cheese",
"stilton",
"feta",
"processed cheese"
],
"turkey": [
"chicken",
"rabbit",
"duck",
"ham",
"pheasant",
"goose",
"capon",
"beef",
"venison",
"lamb",
"pork",
"hen",
"roast beef",
"veal",
"poultry",
"chicken breast",
"chicken thigh",
"quail",
"pork chop"
]
}

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,168 @@
{
"carrot": "Karotte",
"parsnip": "Pastinake",
"daikon": "Rettich",
"turnip": "Steckrübe",
"celery": "Staudensellerie",
"squash": "Kürbis",
"sweet potato": "Süßkartoffel",
"yam": "Süßkartoffel",
"radish": "Radieschen",
"potato": "Kartoffel",
"pumpkin": "Kürbis",
"beet": "Rübe",
"red bell pepper": "Paprika_rot",
"butternut squash": "Butternusskürbis",
"parsley root": "Petersilienwurzel",
"cherry": "Kirsche",
"apricot": "Aprikose",
"plum": "Pflaume",
"nectarine": "Nektarine",
"raspberry": "Himbeeren",
"grape": "Weintrauben",
"strawberry": "Erdbeeren",
"currant": "Johannisbeeren",
"blackberry": "Brombeeren",
"frozen mixed berry": "Beeren_gemischte",
"peach": "Pfirsich",
"cranberry": "Cranberries",
"dried cranberry": "Cranberries_getrocknet",
"blueberry": "Blaubeeren",
"maraschino": "Maraschino",
"berry": "Beeren",
"prune": "Trockenpflaumen",
"chicken": "Huhn",
"turkey": "Truthahn",
"rabbit": "Kaninchen",
"oyster mushroom": "Austernpilze",
"veal": "Kalbfleisch",
"fish": "Fisch",
"tofu": "Tofu",
"beef": "Rindfleisch",
"extra firm tofu": "Tofu_fester",
"pork": "Schweinefleisch",
"seitan": "Seitan",
"duck": "Ente",
"lamb": "Lamm",
"venison": "Wildfleisch",
"mushroom": "Pilze",
"shrimp": "Shrimps",
"quail": "Wachtel",
"goose": "Gans",
"parsley": "Petersilie",
"chervil": "Kerbel",
"cilantro": "Koriander",
"tarragon": "Estragon",
"basil": "Basilikum",
"oregano": "Oregano",
"chopped cilantro": "Koriandergrün",
"lovage": "Liebstöckel",
"dill": "Dill",
"fresh coriander": "Koriandergrün",
"coriander": "Koriander",
"rosemary": "Rosmarin",
"caper": "Kapern",
"fresh cilantro": "Koriandergrün",
"fresh dill": "Dill",
"thyme": "Thymian",
"fresh oregano": "Oregano",
"chive": "Schnittlauch",
"mint": "Minze",
"fresh basil": "Basilikum",
"fresh thyme": "Thymian",
"dried basil": "Basilikum_getrockneter",
"dried oregano": "Oregano_getrocknet",
"fresh chive": "Schnittlauch",
"dried thyme": "Thymian_getrocknet",
"chocolate": "Schokolade",
"nutella": "Nutella",
"cocoa powder": "Kakaopulver_Instant",
"sugar": "Zucker",
"jam": "Marmelade",
"marshmallow": "Marshmallow",
"cocoa": "Kakao",
"candy": "Süßigkeiten",
"peanut butter": "Erdnussbutter",
"bacon": "Frühstücksspeck",
"pancetta": "Pancetta",
"prosciutto": "Schinken_Prosciutto",
"speck": "Speck",
"smoked ham": "Schinken_rohen",
"parma ham": "Parmaschinken",
"ham": "Kochschinken",
"salami": "Salami",
"chorizo": "Chorizo",
"kielbasa": "Wurst_Krakauer",
"pork rind": "Schweineschwarte",
"cubed ham": "Schinkenwürfel",
"crouton": "Croûtons",
"lardon": "Speckwürfel",
"cooked ham": "Kochschinken",
"corned beef": "Corned_Beef",
"bologna": "Wurst_Mortadella",
"kale": "Grünkohl",
"spinach": "Spinat",
"chinese cabbage": "Chinakohl",
"leek": "Lauch",
"escarole": "Endiviensalat",
"chard": "Mangold",
"savoy cabbage": "Wirsing",
"cabbage": "Kohl",
"cauliflower": "Blumenkohl",
"collard": "Kohl",
"watercres": "Brunnenkresse",
"arugula": "Rucola",
"spinach leaves": "Blattspinat",
"lettuce": "Kopfsalat",
"romaine lettuce": "Römersalat",
"baby spinach": "Babyspinat",
"sugar": "Zucker",
"honey": "Honig",
"stevia": "Stevia",
"sweetener": "Süßstoff",
"liquid stevia": "Stevia_flüssig",
"liquid sweetener": "Süßstoff_flüssigen",
"brown rice syrup": "Reissirup",
"maple syrup": "Ahornsirup",
"pure maple syrup": "Ahornsirup",
"sweetened condensed milk": "Kondensmilch_gezuckerte",
"artificial sweetener": "Süßstoff",
"agave nectar": "Agavendicksaft",
"chocolate": "Schokolade",
"vanilla": "Vanille",
"molasse": "Melasse",
"golden syrup": "Zuckerrübensirup",
"syrup": "Sirup",
"brie": "Brie",
"camembert": "Camembert",
"gorgonzola": "Gorgonzola",
"cheese spread": "Schmelzkäse",
"cheddar": "Cheddarkäse",
"goat cheese": "Ziegenkäse",
"boursin": "Doppelrahmfrischkäse",
"blue cheese": "Blauschimmelkäse",
"roquefort": "Roquefort",
"gouda": "Gouda",
"fontina": "Käse_Fontina",
"provolone cheese": "Käse_Provolone",
"feta": "Feta_Käse",
"processed cheese": "Scheiblettenkäse",
"turkey": "Truthahn",
"chicken": "Huhn",
"rabbit": "Kaninchen",
"duck": "Ente",
"ham": "Schinken",
"pheasant": "Fasan",
"goose": "Gans",
"beef": "Rindfleisch",
"venison": "Wildfleisch",
"lamb": "Lammfleisch",
"pork": "Schweinefleisch",
"roast beef": "Roastbeef",
"veal": "Kalbfleisch",
"poultry": "Geflügelfleisch",
"chicken breast": "Hähnchenfilet",
"chicken thigh": "Hühnerkeule",
"quail": "Wachtel",
"pork chop": "schweinekotelett"
}

523
evaluation/evaluate.py Normal file
View File

@@ -0,0 +1,523 @@
import json
import statistics
# ---------------------------------------------------------------------------
# Shared file locations for all evaluation routines in this module.
# NOTE(review): "occurances" is a misspelling of "occurrences" but is kept,
# since these names are used throughout the module.
# ---------------------------------------------------------------------------
data_path = "data/"
occurances_path = "mult_ingredients_nice.json"  # ingredient -> occurrence count
ground_truth_path = "ground_truth.json"  # ingredient -> accepted substitutes
engl_data_path = "evaluation/engl_data/"  # English comparison data
evaluation_path = "evaluation/"
synonyms_path = "synonyms.json"  # ingredient -> list of synonyms
# Substitute lists produced by the model variant currently under evaluation.
found_substitutes_path = "final_Versions/models/vers2/eval/complete_substitute_pairs_50.json"
# model_name = "Versions/vers3/"
# Hand-built German ground truth for ten base ingredients, mirroring the
# English ground truth used by engl_compare(); multi-word ingredient tokens
# use "_" as the word separator (e.g. "Paprika_rot").
german_ground_truth = {
    "Karotte": ["Pastinake", "Steckrübe", "Staudensellerie", "Kürbis", "Süßkartoffel", "Rettich", "Radieschen", "Kartoffel", "Paprika_rot", "Butternusskürbis", "Petersilienwurzel"],
    "Kirsche": ["Aprikose", "Pflaume", "Nektarine", "Himbeeren", "Weintrauben", "Erdbeere", "Johannisbeeren", "Brombeeren", "Beeren_gemischte", "Pfirsich", "Cranberries", "Cranberries_getrocknet", "Blaubeeren", "Maraschino", "Beeren", "Trockenpflaumen"],
    "Huhn": ["Truthahn", "Kaninchen", "Austernpilze", "Kalbfleisch", "Fisch", "Tofu", "Rindfleisch", "Tofu_fester", "Schweinefleisch", "Seitan", "Ente", "Lamm", "Pilze", "Shrimps", "Wachtel", "Gans", "Wildfleisch"],
    "Petersilie": ["Kerbel", "Koriander", "Estragon", "Basilikum", "Oregano", "Liebstöckel", "Dill", "Koriandergrün", "Rosmarin", "Kapern", "Thymian", "Schnittlauch", "Minze", "Basilikum_getrockneter", "Oregano_getrocknet", "Thymian_getrocknet"],
    "Schokolade": ["Nutella", "Kakaopulver_Instant", "Zucker", "Marmelade", "Marshmallow", "Kakao", "Süßigkeiten", "Erdnussbutter"],
    "Frühstücksspeck": ["Pancetta", "Schinken_Prosciutto", "Speck", "Schinken_rohen", "Parmaschinken", "Schinken", "Salami", "Chorizo", "Wurst_Krakauer", "Schweineschwarte", "Schinkenwürfel", "Croûtons", "Speckwürfel", "Kochschinken", "Corned_Beef", "Wurst_Mortadella"],
    "Grünkohl": ["Spinat", "Chinakohl", "Lauch", "Endiviensalat", "Mangold", "Wirsing", "Kohl", "Blumenkohl", "Brunnenkresse", "Rucola", "Blattspinat", "Kopfsalat", "Römersalat", "Babyspinat"],
    "Zucker": ["Honig", "Stevia", "Süßstoff", "Stevia_flüssig", "Süßstoff_flüssigen", "Reissirup", "Ahornsirup", "Kondensmilch_gezuckerte", "Agavendicksaft", "Schokolade", "Vanille", "Melasse", "Zuckerrübensirup", "Sirup"],
    "Brie": ["Camembert", "Gorgonzola", "Schmelzkäse", "Cheddarkäse", "Ziegenkäse", "Doppelrahmfrischkäse", "Blauschimmelkäse", "Roquefort", "Gouda", "Käse_Fontina", "Käse_Provolone", "Feta_Käse", "Scheiblettenkäse"],
    "Truthahn": ["Huhn", "Kaninchen", "Ente", "Kochschinken", "Fasan", "Gans", "Rindfleisch", "Lammfleisch", "Schweinefleisch", "Roastbeef", "Kalbfleisch", "Geflügelfleisch", "Hähnchenfilet", "Hühnerkeule", "Wachtel", "schweinekotelett", "Wildfleisch"]
}
def no_synonyms(ground_truth_dict=None, found_substitutes_dict=None, get_occurrences=True, synonyms=True):
    """Score the model's substitutes, matching synonyms of the substitutes
    but NOT expanding the base ingredient to its synonyms.

    For every base ingredient, the model's suggestion list is compared with
    the ground-truth substitutes; a ground-truth substitute counts as found
    if either it or one of its synonyms appears in the model output.
    Synonyms of the base ingredient itself are first removed from the model
    output so they cannot count as hits or misses.

    Args:
        ground_truth_dict: base ingredient -> accepted substitutes; loaded
            from ``data_path + ground_truth_path`` when falsy.
        found_substitutes_dict: base ingredient -> model substitute list;
            loaded from ``found_substitutes_path`` when falsy.
        get_occurrences: also load the occurrence table (only used by the
            commented-out per-ingredient report below).
        synonyms: load the synonym table; when False no synonym matching
            happens at all.

    Returns:
        dict: base ingredient -> list of correctly found substitutes.
    """
    if get_occurrences:
        with open(data_path + occurances_path, "r") as whole_json_file:
            occurrences_dict = json.load(whole_json_file)
    if not ground_truth_dict:
        with open(data_path+ground_truth_path, "r") as whole_json_file:
            ground_truth_dict = json.load(whole_json_file)
    if synonyms:
        with open(data_path + synonyms_path, "r") as whole_json_file:
            synonyms_dict = json.load(whole_json_file)
    else:
        synonyms_dict = {}
    if not found_substitutes_dict:
        with open(found_substitutes_path, "r") as whole_json_file:
            model_substitutes_dict = json.load(whole_json_file)
    else:
        model_substitutes_dict = found_substitutes_dict
    found_ground_ingr = {}
    correctly_found = 0
    incorrectly_found = 0
    average_precision = 0.0
    average_recall = 0.0
    number_correct_subs_found_overall = []
    total_number_subs_found_overall = []
    # base ingredient without synonyms, substitutes with synonyms
    for base_ingred in ground_truth_dict.keys():
        if get_occurrences:
            occurrences = occurrences_dict[base_ingred]
        # copy so the removals below do not mutate the caller's dict
        found_substitutes = model_substitutes_dict[base_ingred].copy()
        # if len(found_substitutes) > 30:
        # found_substitutes = found_substitutes[:30]
        found = []
        # remove synonyms of base ingredient
        new_found_substitutes = []
        for subst in found_substitutes:
            if base_ingred in synonyms_dict.keys():
                if subst not in synonyms_dict[base_ingred]:
                    new_found_substitutes.append(subst)
            else:
                new_found_substitutes.append(subst)
        found_substitutes = new_found_substitutes
        # check which substitutes were found; found_substitutes shrinks to
        # the incorrectly suggested remainder as hits are removed from it
        for subst in ground_truth_dict[base_ingred]:
            # only add substitute if not already added
            if subst in found_substitutes and subst not in found:
                found.append(subst)
                found_substitutes.remove(subst)
            # check if synonyms of substitute were found
            # check if ingredient has synonyms
            if subst in synonyms_dict.keys():
                for synon in synonyms_dict[subst]:
                    if synon in found_substitutes:
                        if synon not in found and subst not in found:
                            found.append(subst)
                            found_substitutes.remove(synon)
        # if base_ingred == "Erdbeere":
        # NOTE(review): this print runs unconditionally; the commented guard
        # above suggests it was meant as debug output for one ingredient.
        print(base_ingred + ": " + str(found_substitutes))
        found_ground_ingr[base_ingred] = found
        # print(base_ingred + ": ")
        # if get_occurrences:
        #     print("occurrences in dataset: " + str(occurrences))
        # print("number of found substitutes: " + str(len(found)) + "/" + str(len(ground_truth_dict[base_ingred])))
        # print("correctly found substitutes: " + str(len(found)) + "/" + str(len(found) + len(found_substitutes)))
        # print("correctly found substitutes: " + str(found))
        # print("incorrectly found substitutes: " + str(found_substitutes))
        # print("-----------------------------\n")
        if len(found) > 0:
            average_precision += len(found)/(len(found) + len(found_substitutes))
            # print(len(found))
            average_recall += len(found)/len(ground_truth_dict[base_ingred])
        correctly_found += len(found)
        incorrectly_found += len(found_substitutes)
        number_correct_subs_found_overall.append(len(found))
        total_number_subs_found_overall.append(len(found) + len(found_substitutes))
    # NOTE(review): the divisor 40 is hard-coded -- presumably the size of
    # the full ground truth file; confirm it matches len(ground_truth_dict).
    print("average precision: " + str(average_precision/40))
    print("average recall: " + str(average_recall/40))
    print("median number of correctly found subs: " + str(statistics.median(number_correct_subs_found_overall)))
    print("median number of found subs overall: " + str(statistics.median(total_number_subs_found_overall)))
    return found_ground_ingr
def merge_lists(all_lists):
    """Round-robin merge of several ranked lists into one deduplicated list.

    Takes the first element of every list, then the second of every list,
    and so on, skipping entries already added — so the highest-ranked items
    from every input list end up near the front of the result.

    Args:
        all_lists: a list of lists of hashable items (here: ranked
            substitute-name lists, one per synonym).

    Returns:
        list: interleaved union of the inputs, duplicates removed,
        first-seen order preserved.
    """
    # (the original also tracked the minimum length, but never used it)
    longest = max((len(lst) for lst in all_lists), default=0)
    merged = []
    seen = set()  # O(1) duplicate check instead of scanning the output list
    for position in range(longest):
        for ranked in all_lists:
            if position < len(ranked):
                candidate = ranked[position]
                if candidate not in seen:
                    seen.add(candidate)
                    merged.append(candidate)
    return merged
def with_synonyms(ground_truth_dict=None, found_substitutes_dict=None, get_occurrences=True, synonyms=True):
    """Score the model's substitutes, treating synonyms of BOTH the base
    ingredient and of the substitutes as matches.

    When the base ingredient has synonyms, the model outputs for all of its
    synonyms are interleaved via ``merge_lists`` before scoring; otherwise
    the model output is truncated to the top 30 suggestions.  A ground-truth
    substitute counts as found if it or one of its synonyms appears in the
    (merged) model output.

    Args:
        ground_truth_dict: base ingredient -> accepted substitutes; loaded
            from ``data_path + ground_truth_path`` when falsy.
        found_substitutes_dict: base ingredient -> model substitute list;
            loaded from ``found_substitutes_path`` when falsy.
        get_occurrences: also sum the occurrence counts over all base
            synonyms (only used by the commented-out report below).
        synonyms: load the synonym table; when False no synonym handling
            happens at all.

    Returns:
        dict: base ingredient -> list of correctly found substitutes.
    """
    if get_occurrences:
        with open(data_path + occurances_path, "r") as whole_json_file:
            occurrences_dict = json.load(whole_json_file)
    if not ground_truth_dict:
        with open(data_path+ground_truth_path, "r") as whole_json_file:
            ground_truth_dict = json.load(whole_json_file)
    if synonyms:
        with open(data_path + synonyms_path, "r") as whole_json_file:
            synonyms_dict = json.load(whole_json_file)
    else:
        synonyms_dict = {}
    if not found_substitutes_dict:
        with open(found_substitutes_path, "r") as whole_json_file:
            model_substitutes_dict = json.load(whole_json_file)
    else:
        model_substitutes_dict = found_substitutes_dict
    correctly_found = 0
    incorrectly_found = 0
    average_precision = 0.0
    average_recall = 0.0
    number_correct_subs_found_overall = []
    total_number_subs_found_overall = []
    found_ground_ingr = {}
    # base ingredient with synonyms, substitutes with synonyms
    for base_ingred in ground_truth_dict.keys():
        base_synonyms = [base_ingred]
        if get_occurrences:
            occurrences = 0
        # get list of all synonyms of base ingredient
        if base_ingred in synonyms_dict.keys():
            # NOTE(review): this rebinds the `synonyms` parameter
            synonyms = synonyms_dict[base_ingred]
            base_synonyms = base_synonyms + synonyms
            found_substitutes = []
            all_substitutes = []
            # get top 30 substitutes of each base synonym
            for synon in base_synonyms:
                if get_occurrences:
                    occurrences += occurrences_dict[synon]
                all_substitutes.append(model_substitutes_dict[synon].copy())
                # synon_subs = model_substitutes_dict[synon].copy()
                # if len(synon_subs) > 30:
                #     synon_subs = synon_subs[:30]
                # for sub in synon_subs:
                #     if sub not in found_substitutes:
                #         found_substitutes.append(sub)
            found_substitutes = merge_lists(all_substitutes)
        else:
            found_substitutes = model_substitutes_dict[base_ingred].copy()
            if len(found_substitutes) > 30:
                found_substitutes = found_substitutes[:30]
        found = []
        # remove all base synonyms from found substitutes
        new_found_substitutes = []
        for subst in found_substitutes:
            if subst not in base_synonyms:
                new_found_substitutes.append(subst)
        found_substitutes = new_found_substitutes
        # check which substitutes were found; found_substitutes shrinks to
        # the incorrectly suggested remainder as hits are removed from it
        for subst in ground_truth_dict[base_ingred]:
            # only add substitute if not already added
            if subst in found_substitutes and subst not in found:
                found.append(subst)
                found_substitutes.remove(subst)
            # check if synonyms of substitute were found
            # check if ingredient has synonyms
            if subst in synonyms_dict.keys():
                for synon in synonyms_dict[subst]:
                    if synon in found_substitutes:
                        if synon not in found and subst not in found:
                            found.append(subst)
                            found_substitutes.remove(synon)
        found_ground_ingr[base_ingred] = found
        # print(base_ingred + ": ")
        # if get_occurrences:
        #     print("occurrences in dataset: " + str(occurrences))
        # print("number of synonyms incl. original word: " + str(len(base_synonyms)))
        # print("number of found substitutes: " + str(len(found)) + "/" + str(len(ground_truth_dict[base_ingred])))
        # print("correctly found substitutes: " + str(len(found)) + "/" + str(len(found) + len(found_substitutes)))
        # print("correctly found substitutes: " + str(found))
        # print("incorrectly found substitutes: " + str(found_substitutes))
        # print("-----------------------------\n")
        if len(found) > 0:
            average_precision += len(found) / (len(found) + len(found_substitutes))
            average_recall += len(found) / len(ground_truth_dict[base_ingred])
        correctly_found += len(found)
        incorrectly_found += len(found_substitutes)
        number_correct_subs_found_overall.append(len(found))
        total_number_subs_found_overall.append(len(found) + len(found_substitutes))
    # NOTE(review): the divisor 40 is hard-coded -- presumably the size of
    # the full ground truth file; confirm it matches len(ground_truth_dict).
    print("average precision: " + str(average_precision / 40))
    print("average recall: " + str(average_recall / 40))
    print("median number of correctly found subs: " + str(statistics.median(number_correct_subs_found_overall)))
    print("median number of found subs overall: " + str(statistics.median(total_number_subs_found_overall)))
    return found_ground_ingr
def translate_engl_ground_truth(ground_truth, ger_transl):
    """Translate an English ground-truth mapping into German.

    Each base ingredient key is replaced by its German translation, and each
    substitute is translated too; substitutes missing from the translation
    table are silently dropped.

    Args:
        ground_truth: English base ingredient -> list of English substitutes.
        ger_transl: English ingredient name -> German ingredient name.

    Returns:
        dict: German base ingredient -> list of German substitutes.
    """
    translated = {}
    for base_ingr in ground_truth.keys():
        # keep only substitutes that have a known German translation
        translated[ger_transl[base_ingr]] = [
            ger_transl[subst]
            for subst in ground_truth[base_ingr]
            if subst in ger_transl
        ]
    return translated
def with_base_synonyms(ground_truth_dict=None, found_substitutes_dict=None, get_occurrences=True, synonyms=True):
    """Score the model's substitutes with synonym expansion for the BASE
    ingredient only, and print a per-ingredient report.

    Like ``with_synonyms`` the model outputs of all base-ingredient synonyms
    are merged via ``merge_lists``, but synonyms of the *substitutes* are not
    matched (that code is deliberately commented out below).  Unlike the
    other scorers this one prints its report per ingredient and computes no
    aggregate precision/recall.

    Args:
        ground_truth_dict: base ingredient -> accepted substitutes; loaded
            from ``data_path + ground_truth_path`` when falsy.
        found_substitutes_dict: base ingredient -> model substitute list;
            loaded from ``found_substitutes_path`` when falsy.
        get_occurrences: also sum and report occurrence counts over all base
            synonyms.
        synonyms: load the synonym table; when False no synonym handling
            happens at all.

    Returns:
        dict: base ingredient -> list of correctly found substitutes.
    """
    if get_occurrences:
        with open(data_path + occurances_path, "r") as whole_json_file:
            occurrences_dict = json.load(whole_json_file)
    if not ground_truth_dict:
        with open(data_path+ground_truth_path, "r") as whole_json_file:
            ground_truth_dict = json.load(whole_json_file)
    if synonyms:
        with open(data_path + synonyms_path, "r") as whole_json_file:
            synonyms_dict = json.load(whole_json_file)
    else:
        synonyms_dict = {}
    if not found_substitutes_dict:
        with open(found_substitutes_path, "r") as whole_json_file:
            model_substitutes_dict = json.load(whole_json_file)
    else:
        model_substitutes_dict = found_substitutes_dict
    found_ground_ingr = {}
    # base ingredient with synonyms, substitutes with synonyms
    for base_ingred in ground_truth_dict.keys():
        base_synonyms = [base_ingred]
        if get_occurrences:
            occurrences = 0
        # get list of all synonyms of base ingredient
        if base_ingred in synonyms_dict.keys():
            # NOTE(review): this rebinds the `synonyms` parameter
            synonyms = synonyms_dict[base_ingred]
            base_synonyms = base_synonyms + synonyms
            found_substitutes = []
            all_substitutes = []
            # get top 30 substitutes of each base synonym
            for synon in base_synonyms:
                if get_occurrences:
                    occurrences += occurrences_dict[synon]
                all_substitutes.append(model_substitutes_dict[synon].copy())
            found_substitutes = merge_lists(all_substitutes)
        else:
            found_substitutes = model_substitutes_dict[base_ingred].copy()
            if len(found_substitutes) > 30:
                found_substitutes = found_substitutes[:30]
        found = []
        # remove all base synonyms from found substitutes
        new_found_substitutes = []
        for subst in found_substitutes:
            if subst not in base_synonyms:
                new_found_substitutes.append(subst)
        found_substitutes = new_found_substitutes
        # check which substitutes were found; found_substitutes shrinks to
        # the incorrectly suggested remainder as hits are removed from it
        for subst in ground_truth_dict[base_ingred]:
            # only add substitute if not already added
            if subst in found_substitutes and subst not in found:
                found.append(subst)
                found_substitutes.remove(subst)
            # check if synonyms of substitute were found
            # check if ingredient has synonyms
            # if subst in synonyms_dict.keys():
            #     for synon in synonyms_dict[subst]:
            #         if synon in found_substitutes:
            #             if synon not in found and subst not in found:
            #                 found.append(subst)
            #                 found_substitutes.remove(synon)
        found_ground_ingr[base_ingred] = found
        print(base_ingred + ": ")
        if get_occurrences:
            print("occurrences in dataset: " + str(occurrences))
        print("number of synonyms incl. original word: " + str(len(base_synonyms)))
        print("number of found substitutes: " + str(len(found)) + "/" + str(len(ground_truth_dict[base_ingred])))
        print("correctly found substitutes: " + str(len(found)) + "/" + str(len(found) + len(found_substitutes)))
        print("correctly found substitutes: " + str(found))
        print("incorrectly found substitutes: " + str(found_substitutes))
        print("-----------------------------\n")
    return found_ground_ingr
def engl_compare():
# with open(data_path + occurances_path, "r") as whole_json_file:
# occurrences_dict = json.load(whole_json_file)
with open(engl_data_path + "translation.json", "r") as whole_json_file:
ger_transl = json.load(whole_json_file)
# with open(data_path + synonyms_path, "r") as whole_json_file:
# synonyms_dict = json.load(whole_json_file)
with open(found_substitutes_path, "r") as whole_json_file:
model_substitutes_dict = json.load(whole_json_file)
with open(engl_data_path + "substitute_pairs_foodbert_text.json", "r") as whole_json_file:
engl_list = json.load(whole_json_file)
with open(engl_data_path + "engl_ground_truth.json", "r") as whole_json_file:
engl_ground_truth = json.load(whole_json_file)
engl_dict = {}
for foo in engl_list:
if foo[0] in engl_dict.keys():
engl_dict[foo[0]].append(foo[1])
else:
engl_dict[foo[0]] = [foo[1]]
translated_ground_truth = translate_engl_ground_truth(engl_ground_truth, ger_transl)
# without any synonyms
print("Engl compare without any synonyms:")
engl_replacements = {}
# ger_replacements = {}
for ingred in engl_ground_truth.keys():
found = []
incorr = []
found_ger = []
incorr_ger = []
engl_replacements[ingred] = {}
engl_replacements[ingred]["engl"] = 0
engl_replacements[ingred]["ger"] = 0
# ger_replacements[ingred] = 0
if ingred in engl_dict.keys():
for sub in engl_ground_truth[ingred]:
if sub in engl_dict[ingred]:
engl_replacements[ingred]["engl"] += 1
found.append(sub)
if ger_transl[ingred] in model_substitutes_dict.keys():
for sub in german_ground_truth[ger_transl[ingred]]:
if sub in model_substitutes_dict[ger_transl[ingred]]:
engl_replacements[ingred]["ger"] += 1
found_ger.append(sub)
# ger_replacements[ingred] += 1
for found_sub in engl_dict[ingred]:
if found_sub not in engl_ground_truth[ingred]:
incorr.append(found_sub)
for found_sub in model_substitutes_dict[ger_transl[ingred]]:
if found_sub not in translated_ground_truth[ger_transl[ingred]]:
incorr_ger.append(found_sub)
print(ger_transl[ingred] + ": ")
print("number of found substitutes: " + str(len(found_ger)) + "/" + str(len(translated_ground_truth[ger_transl[ingred]])))
print("correctly found substitutes: " + str(len(found_ger)) + "/" + str(len(found_ger) + len(incorr_ger)))
print("correctly found substitutes: " + str(found_ger))
print("incorrectly found substitutes: " + str(incorr_ger))
print("-----------------------------\n")
print(ingred + ": ")
print("number of found substitutes: " + str(len(found)) + "/" + str(len(engl_ground_truth[ingred])))
print("correctly found substitutes: " + str(len(found)) + "/" + str(len(found) + len(incorr)))
print("correctly found substitutes: " + str(found))
print("incorrectly found substitutes: " + str(incorr))
print("-----------------------------\n")
with open(evaluation_path + "engl_comparison_results/engl_no_syn.json", 'w') as f:
json.dump(engl_replacements, f, ensure_ascii=False, indent=4)
# with synonyms of substitutes
print("Engl compare with synonyms of substitutes only:")
# german
new_german_result = no_synonyms(ground_truth_dict=translated_ground_truth, get_occurrences=False)
#engl
new_engl_result = no_synonyms(ground_truth_dict=engl_ground_truth, found_substitutes_dict=engl_dict, get_occurrences=False, synonyms=False)
engl_replacements = {}
for ingred in engl_ground_truth.keys():
engl_replacements[ingred] = {}
engl_replacements[ingred]["engl"] = 0
engl_replacements[ingred]["ger"] = 0
if ingred in new_engl_result.keys():
for sub in engl_ground_truth[ingred]:
if sub in new_engl_result[ingred]:
engl_replacements[ingred]["engl"] += 1
if ger_transl[ingred] in new_german_result.keys():
for sub in german_ground_truth[ger_transl[ingred]]:
if sub in new_german_result[ger_transl[ingred]]:
engl_replacements[ingred]["ger"] += 1
with open(evaluation_path + "engl_comparison_results/engl_sub_syn.json", 'w') as f:
json.dump(engl_replacements, f, ensure_ascii=False, indent=4)
# with synonyms for substitutes and base words
print("Engl compare with synonyms of both:")
# german
new_german_result = with_synonyms(ground_truth_dict=translated_ground_truth, get_occurrences=False)
# engl
new_engl_result = with_synonyms(ground_truth_dict=engl_ground_truth, found_substitutes_dict=engl_dict, get_occurrences=False, synonyms=False)
engl_replacements = {}
for ingred in engl_ground_truth.keys():
engl_replacements[ingred] = {}
engl_replacements[ingred]["engl"] = 0
engl_replacements[ingred]["ger"] = 0
if ingred in new_engl_result.keys():
for sub in engl_ground_truth[ingred]:
if sub in new_engl_result[ingred]:
engl_replacements[ingred]["engl"] += 1
if ger_transl[ingred] in new_german_result.keys():
for sub in german_ground_truth[ger_transl[ingred]]:
if sub in new_german_result[ger_transl[ingred]]:
engl_replacements[ingred]["ger"] += 1
with open(evaluation_path + "engl_comparison_results/engl_all_syn.json", 'w') as f:
json.dump(engl_replacements, f, ensure_ascii=False, indent=4)
# with synonyms for base words
print("Engl compare with synonyms of base words only:")
# german
new_german_result = with_base_synonyms(ground_truth_dict=translated_ground_truth, get_occurrences=False)
# engl
new_engl_result = with_base_synonyms(ground_truth_dict=engl_ground_truth, found_substitutes_dict=engl_dict,
get_occurrences=False, synonyms=False)
engl_replacements = {}
for ingred in engl_ground_truth.keys():
engl_replacements[ingred] = {}
engl_replacements[ingred]["engl"] = 0
engl_replacements[ingred]["ger"] = 0
if ingred in new_engl_result.keys():
for sub in engl_ground_truth[ingred]:
if sub in new_engl_result[ingred]:
engl_replacements[ingred]["engl"] += 1
if ger_transl[ingred] in new_german_result.keys():
for sub in german_ground_truth[ger_transl[ingred]]:
if sub in new_german_result[ger_transl[ingred]]:
engl_replacements[ingred]["ger"] += 1
with open(evaluation_path + "engl_comparison_results/engl_base_syn.json", 'w') as f:
json.dump(engl_replacements, f, ensure_ascii=False, indent=4)
print("test")
def main():
    """Run the comparison once with substitute-only synonyms and once with full synonyms."""
    separator = "--------------------------------------------------------"
    # compare english and german results
    # engl_compare()
    print(separator)
    print(separator)
    print(separator + "\n")
    # evaluation where synonyms are only applied to the substitutes
    no_synonyms()
    print(separator)
    print(separator)
    print(separator + "\n")
    # evaluation where synonyms are applied to substitutes and base ingredients
    with_synonyms()


main()

288
evaluation/final_eval.py Normal file
View File

@@ -0,0 +1,288 @@
import json
import statistics
import helpers.revise_substitutes as revise_subs
def eval_dataset(substitutes_dict):
    """Print summary statistics for a {ingredient: [substitutes]} mapping.

    Reports the ingredient count, how many ingredients have no substitutes at
    all, and the mean/median/max/min number of substitutes per ingredient.
    """
    lengths = [len(subs) for subs in substitutes_dict.values()]
    # ingredients for which no substitute was found at all
    empty_count = sum(1 for count in lengths if count == 0)
    print("number of ingredients: " + str(len(substitutes_dict.keys())))
    print("number of nones: " + str(empty_count))
    print("average number of subs: " + str(sum(lengths) / len(substitutes_dict.keys())))
    print("median number of subs: " + str(statistics.median(lengths)))
    print("largest number of subs: " + str(max(lengths)))
    print("smallest number of subs: " + str(min(lengths)))
def translate_engl_ground_truth(ground_truth, ger_transl):
    """Translate an English ground-truth dict into German using *ger_transl*.

    Base ingredients are translated via the mapping; substitutes that have no
    entry in the mapping are silently dropped from the translated lists.
    """
    translated = {}
    for base_ingr, substitutes in ground_truth.items():
        translated[ger_transl[base_ingr]] = [
            ger_transl[subst] for subst in substitutes if subst in ger_transl.keys()
        ]
    return translated
def eval_ground_truth(substitutes_dict, ground_truth_dict):
    """Compare found substitutes against the ground truth and print metrics.

    For every ground-truth ingredient the found substitutes are split into
    correct (present in the ground truth) and incorrect ones; micro precision,
    recall, per-ingredient precision/recall, and the best-scoring ingredients
    are printed.  Nothing is returned — all output goes to stdout.

    NOTE(review): several divisors are hard-coded to the sizes of the thesis'
    evaluation sets (10 German-specific words, 30 others, 40 ingredients in
    total) — this function will print wrong averages for other ground truths.
    """
    total_corr_int = 0
    total_corr_list = []
    total_incorr_int = 0
    total_incorr_list = []
    total_subs_ground_truth = 0
    test_prec = 0
    # [best value, list of ingredients that reach it]
    highest_prec = [0,[]]
    highest_recall = [0,[]]
    other_corr = 0
    other_incorr = 0
    ger_corr = 0
    ger_incorr = 0
    ger_total = 0
    other_total = 0
    # ingredients considered specific to German cuisine, evaluated separately
    german_words = ["Spätzle", "Schwarzbrot", "Schupfnudeln", "Bratwürste_Nürnberger", "Vanillinzucker", "Bier", "Semmelknödel", "Rote_Bete", "Eisbeine", "Spargel_weiß"]
    for ingredient in ground_truth_dict:
        correct = 0
        incorrect = 0
        correct_list = []
        incorrect_list = []
        # print("\n" + ingredient + ": " + str(len(substitutes_dict[ingredient])))
        # classify each found substitute against the ground truth
        for sub in substitutes_dict[ingredient]:
            if sub in ground_truth_dict[ingredient]:
                # print(sub)
                correct += 1
                correct_list.append(sub)
            else:
                incorrect += 1
                incorrect_list.append(sub)
        total_corr_int += correct
        total_incorr_int += incorrect
        total_corr_list.append(correct)
        total_incorr_list.append(incorrect)
        total_subs_ground_truth += len(ground_truth_dict[ingredient])
        # per-ingredient precision/recall only defined when something was found
        if correct > 0:
            curr_recall = correct/len(ground_truth_dict[ingredient])
            curr_prec = correct/(correct+incorrect)
            test_prec += curr_prec
            # track all ingredients tied for the best precision/recall
            if curr_prec == highest_prec[0]:
                highest_prec[1].append(ingredient)
            if curr_prec > highest_prec[0]:
                highest_prec[0] = curr_prec
                highest_prec[1] = [ingredient]
            if curr_recall == highest_recall[0]:
                highest_recall[1].append(ingredient)
            if curr_recall > highest_recall[0]:
                highest_recall[0] = curr_recall
                highest_recall[1] = [ingredient]
            print(ingredient + ": " + str(curr_prec) + " ..... " + str(curr_recall))
        if ingredient in german_words:
            ger_corr += correct
            ger_incorr += incorrect
        else:
            other_corr += correct
            other_incorr += incorrect
        if ingredient == "Zucker":
            print("correct: " + str(correct_list) + ", incorrect: " + str(incorrect_list))
    ger_total = ger_corr + ger_incorr
    other_total = other_corr + other_incorr
    # NOTE(review): /10 and /30 assume the fixed german/other subset sizes above
    print("ger_total: " + str(ger_total/10))
    print("other_total: " + str(other_total/30))
    # print(correct)
    # NOTE(review): `ingredient` etc. here refer to the LAST loop iteration only
    print(ingredient + ": " + str(correct_list) + " / " + str(incorrect_list))
    # micro-averaged precision over all ingredients
    print("precision: " + str(total_corr_int / (total_corr_int + total_incorr_int)))
    # NOTE(review): /40 assumes exactly 40 ground-truth ingredients
    print("(average precision:) " + str(test_prec/40))
    print("recall: " + str(total_corr_int / total_subs_ground_truth))
    print("median number of correct subs (ground truth): " + str(statistics.median(total_corr_list)))
    print("average number of correct subs (ground truth): " + str(statistics.mean(total_corr_list)))
    at_least_3 = 0
    no_corr = 0
    for nr in total_corr_list:
        if nr >= 3:
            at_least_3 += 1
        if nr < 1:
            no_corr += 1
    print("ingredients with at least 3 correct substitutes: " + str(at_least_3))
    print("ingredients with no correct substitutes: " + str(no_corr))
    print("highest precision: " + str(highest_prec[1]) + ": " + str(highest_prec[0]))
    print("highest recall: " + str(highest_recall[1]) + ": " + str(highest_recall[0]))
    # print("german precision: " + str(ger_corr/(ger_corr + ger_incorr)))
    # print("german correct:" + str(ger_corr))
    # print("precision rest: " + str(other_corr/(other_corr + other_incorr)))
    # print("other correct: " + str(other_corr))
def get_ground_truth_substitutes(substitutes_dict, ground_truth_dict):
    """Restrict *substitutes_dict* to the ingredients present in the ground truth.

    Raises KeyError if a ground-truth ingredient is missing from the
    substitutes dict, matching the original behaviour.
    """
    return {ingredient: substitutes_dict[ingredient] for ingredient in ground_truth_dict}
def main():
    """Run the full German and English substitute evaluation.

    Loads the generated substitutes and the ground truth, evaluates them
    without synonyms, with substitute-only synonyms, and with full synonym
    merging, then repeats the exercise for the English FoodBERT results and
    finally re-evaluates the German data against a hard-coded ground truth.
    All results are printed to stdout.
    """
    substitutes_path = "final_Versions/models/vers3/eval/complete_substitute_pairs_50.json"
    with open(substitutes_path, "r") as whole_json_file:
        substitutes_dict = json.load(whole_json_file)
    ground_truth_path = "data/ground_truth.json"
    with open(ground_truth_path, "r") as whole_json_file:
        ground_truth_dict = json.load(whole_json_file)
    # --- evaluation without any synonym merging ---
    print("no synonyms at all:")
    print("entire dataset")
    eval_dataset(substitutes_dict)
    print("\nonly ground truth:")
    ground_truth_substitutes0 = get_ground_truth_substitutes(substitutes_dict, ground_truth_dict)
    # print(ground_truth_substitutes["Truthahn"])
    eval_dataset(ground_truth_substitutes0)
    eval_ground_truth(substitutes_dict, ground_truth_dict)
    print("======================================")
    # --- evaluation with synonyms merged on the substitute side only ---
    print("\nsynonyms of substitutes only: ")
    new_substitutes_dict1 = substitutes_dict.copy()
    new_substitutes_dict1 = revise_subs.combined_substitutes_dict(new_substitutes_dict1)
    print("entire dataset")
    eval_dataset(new_substitutes_dict1)
    print("\nonly ground truth:")
    ground_truth_substitutes1 = get_ground_truth_substitutes(new_substitutes_dict1, ground_truth_dict)
    # print(ground_truth_substitutes["Truthahn"])
    eval_dataset(ground_truth_substitutes1)
    eval_ground_truth(new_substitutes_dict1, ground_truth_dict)
    print("======================================")
    # --- evaluation with synonyms merged everywhere ---
    print("\nsynonyms of everything: ")
    new_substitutes_dict2 = substitutes_dict.copy()
    new_substitutes_dict2 = revise_subs.combine_all_synonyms(new_substitutes_dict2)
    print("entire dataset")
    eval_dataset(new_substitutes_dict2)
    print("\nonly ground truth:")
    ground_truth_substitutes2 = get_ground_truth_substitutes(new_substitutes_dict2, ground_truth_dict)
    # print(ground_truth_substitutes["Truthahn"])
    eval_dataset(ground_truth_substitutes2)
    eval_ground_truth(new_substitutes_dict2, ground_truth_dict)
    print("======================================")
    print("======================================")
    print("English Evaluation")
    # NOTE(review): several of the following path variables are never used below
    data_path = "data/"
    occurances_path = "mult_ingredients_nice.json"
    ground_truth_path = "ground_truth.json"
    engl_data_path = "evaluation/engl_data/"
    evaluation_path = "evaluation/"
    synonyms_path = "synonyms.json"
    # manually curated German ground truth used to re-evaluate the German model
    german_ground_truth = {
        "Karotte": ["Pastinake", "Steckrübe", "Staudensellerie", "Kürbis", "Süßkartoffel", "Rettich", "Radieschen",
                    "Kartoffel", "Paprika_rot", "Butternusskürbis", "Petersilienwurzel", "Rübe"],
        "Kirsche": ["Aprikose", "Pflaume", "Nektarine", "Himbeeren", "Weintrauben", "Erdbeere", "Johannisbeeren",
                    "Brombeeren", "Beeren_gemischte", "Pfirsich", "Cranberries", "Cranberries_getrocknet", "Blaubeeren",
                    "Maraschino", "Beeren", "Trockenpflaumen"],
        "Huhn": ["Truthahn", "Kaninchen", "Austernpilze", "Kalbfleisch", "Fisch", "Tofu", "Rindfleisch", "Tofu_fester",
                 "Schweinefleisch", "Seitan", "Ente", "Lamm", "Pilze", "Shrimps", "Wachtel", "Gans", "Wildfleisch"],
        "Petersilie": ["Kerbel", "Koriander", "Estragon", "Basilikum", "Oregano", "Liebstöckel", "Dill",
                       "Koriandergrün", "Rosmarin", "Kapern", "Thymian", "Schnittlauch", "Minze",
                       "Basilikum_getrockneter", "Oregano_getrocknet", "Thymian_getrocknet"],
        "Schokolade": ["Nutella", "Kakaopulver_Instant", "Zucker", "Marmelade", "Marshmallow", "Kakao", "Süßigkeiten",
                       "Erdnussbutter"],
        "Frühstücksspeck": ["Pancetta", "Schinken_Prosciutto", "Speck", "Schinken_rohen", "Parmaschinken", "Schinken",
                            "Salami", "Chorizo", "Wurst_Krakauer", "Schweineschwarte", "Schinkenwürfel", "Croûtons",
                            "Speckwürfel", "Kochschinken", "Corned_Beef", "Wurst_Mortadella"],
        "Grünkohl": ["Spinat", "Chinakohl", "Lauch", "Endiviensalat", "Mangold", "Wirsing", "Kohl", "Blumenkohl",
                     "Brunnenkresse", "Rucola", "Blattspinat", "Kopfsalat", "Römersalat", "Babyspinat"],
        "Zucker": ["Honig", "Stevia", "Süßstoff", "Stevia_flüssig", "Süßstoff_flüssigen", "Reissirup", "Ahornsirup",
                   "Kondensmilch_gezuckerte", "Agavendicksaft", "Schokolade", "Vanille", "Melasse", "Zuckerrübensirup",
                   "Sirup"],
        "Brie": ["Camembert", "Gorgonzola", "Schmelzkäse", "Cheddarkäse", "Ziegenkäse", "Doppelrahmfrischkäse",
                 "Blauschimmelkäse", "Roquefort", "Gouda", "Käse_Fontina", "Käse_Provolone", "Feta_Käse",
                 "Scheiblettenkäse"],
        "Truthahn": ["Huhn", "Kaninchen", "Ente", "Kochschinken", "Fasan", "Gans", "Rindfleisch", "Lammfleisch",
                     "Schweinefleisch", "Roastbeef", "Kalbfleisch", "Geflügelfleisch", "Hähnchenfilet", "Hühnerkeule",
                     "Wachtel", "schweinekotelett", "Wildfleisch"]
    }
    with open(engl_data_path + "translation.json", "r") as whole_json_file:
        ger_transl = json.load(whole_json_file)
    with open(engl_data_path + "substitute_pairs_foodbert_text.json", "r") as whole_json_file:
        engl_list = json.load(whole_json_file)
    with open(engl_data_path + "revised_engl_ground_truth.json", "r") as whole_json_file:
        engl_ground_truth = json.load(whole_json_file)
    # the English results come as (ingredient, substitute) pairs — group by ingredient
    engl_dict = {}
    for foo in engl_list:
        if foo[0] in engl_dict.keys():
            engl_dict[foo[0]].append(foo[1])
        else:
            engl_dict[foo[0]] = [foo[1]]
    # translate english ground truth to german for comparison
    # any ingredients that aren't in the german dataset are removed
    # translated_ground_truth = translate_engl_ground_truth(engl_ground_truth, ger_transl)
    print("Eval English results")
    print("entire dataset")
    eval_dataset(engl_dict)
    orig_engl_dict = engl_dict.copy()
    # print("turkey results: " + str(orig_engl_dict["turkey"]))
    print("\nonly ground truth:")
    ground_truth_substitutes_engl = get_ground_truth_substitutes(orig_engl_dict, engl_ground_truth)
    # print(ground_truth_substitutes)
    eval_dataset(ground_truth_substitutes_engl)
    eval_ground_truth(ground_truth_substitutes_engl, engl_ground_truth)
    print("\n\nEval method 1:")
    engl_dict1 = engl_dict.copy()
    engl_dict1 = revise_subs.engl_combined_substitutes_dict(engl_dict1)
    print("entire dataset")
    eval_dataset(engl_dict1)
    print("\nonly ground truth:")
    ground_truth_substitutes_engl = get_ground_truth_substitutes(engl_dict1, engl_ground_truth)
    # print(ground_truth_substitutes["Truthahn"])
    eval_dataset(ground_truth_substitutes_engl)
    eval_ground_truth(ground_truth_substitutes_engl, engl_ground_truth)
    # re-run the German evaluation against the hard-coded ground truth above
    print("\nReevaluate German Data: ")
    eval_ground_truth(substitutes_dict, german_ground_truth)
    print("\nwith syn1")
    eval_ground_truth(new_substitutes_dict1, german_ground_truth)
    # print("Truthahn results 1: " + str(new_substitutes_dict1["Truthahn"]))
    print("\nwith syn2")
    eval_ground_truth(new_substitutes_dict2, german_ground_truth)
    # print("Truthahn results 2: " + str(new_substitutes_dict2["Truthahn"]))
    #
    # engl_substitutes_dict = get_ground_truth_substitutes(engl_dict1, german_ground_truth)
    #
    # engl_new_substitutes_dict1 = new_substitutes_dict1.copy()
    # engl_new_substitutes_dict2 = new_substitutes_dict2.copy()


main()

View File

@@ -0,0 +1,28 @@
import json
import random
def main():
    """Randomly pick 10 rare and 10 frequent ingredients and print them."""
    data_path = "data/"
    ingredients_path = "mult_ingredients_nice.json"
    with open(data_path + ingredients_path, "r") as whole_json_file:
        all_ingredients = json.load(whole_json_file)
    # frequent: at least 1000 occurrences; rare: between 100 and 200 occurrences
    frequent = [ing for ing, count in all_ingredients.items() if count >= 1000]
    rare = [ing for ing, count in all_ingredients.items() if 100 <= count <= 200]
    picked_rare = random.sample(rare, 10)
    picked_frequent = random.sample(frequent, 10)
    print("rare: ")
    print(picked_rare)
    print("\nfrequent: ")
    print(picked_frequent)


main()

View File

@@ -0,0 +1,197 @@
# adapted from:
# Pellegrini., C., E. Özsoy., M. Wintergerst., and G. Groh. (2021).
# “Exploiting Food Embeddings for Ingredient Substitution.”
# In: Proceedings of the 14th International Joint Conference on Biomedical
# Engineering Systems and Technologies - Volume 5: HEALTHINF, INSTICC.
# SciTePress, pp. 6777. isbn: 978-989-758-490-9. doi: 10.5220/0010202000670077.
import json
import os
from collections import defaultdict
from pathlib import Path
from typing import Union
import numpy as np
import torch
from sklearn.decomposition import PCA
from tqdm import tqdm
from evaluation.helpers.approx_knn_classifier import ApproxKNNClassifier
from evaluation.helpers.generate_ingredient_embeddings import generate_food_embedding_dict
from evaluation.helpers.knn_classifier import KNNClassifier
def avg(values):
    """Return the arithmetic mean of a non-empty sequence of numbers."""
    return sum(values) / len(values)
def custom_potential_neighbors_sort(potential_neighbors):
    """Sort neighbors by descending frequency; ties broken by smaller mean distance."""
    def sort_key(item):
        distances = item[1]
        # more frequent first; among equals, the smaller average distance wins
        return (len(distances), -(sum(distances) / len(distances)))

    return sorted(potential_neighbors.items(), key=sort_key, reverse=True)
def filter_out_forbidden_neigbours(ingredient_name, potential_neighbors):
    '''
    Neigbors that are the same as the ingredient are to be removed, additional rules such as mozeralla & mozeralla_cheese, penne & penne_pasta can be added here
    '''
    banned_keys = {ingredient_name}
    # if ingredient_name in ingredient.split('_'):  # stricter rule, currently disabled
    #     banned_keys.add(ingredient)
    return {
        neighbor: distances
        for neighbor, distances in potential_neighbors.items()
        if neighbor not in banned_keys
    }
def get_nearest_N_neigbours(ingredient_name, ingredients_to_embeddings, all_ingredient_labels,
                            knn_classifier: Union[KNNClassifier, ApproxKNNClassifier], thresh = 50):
    """Return candidate substitute names for one ingredient, or None if none qualify.

    For every embedding of the ingredient the k nearest neighbors are queried;
    a neighbor label collects one distance per hit.  Labels hit fewer than
    *thresh* times are discarded, the rest are ordered by hit count (ties by
    smaller mean distance) and returned as a tuple of names.
    """
    ingredient_embeddings = ingredients_to_embeddings[ingredient_name]
    all_distances, all_indices = knn_classifier.k_nearest_neighbors(ingredient_embeddings)
    # label -> list of distances, one entry per time the label appeared as a neighbor
    potential_neighbors = defaultdict(list)
    for i in range(len(ingredient_embeddings)):
        labels = all_ingredient_labels[all_indices[i]]
        distances = all_distances[i]
        for label, distance in zip(labels, distances):
            potential_neighbors[label].append(distance)
    # drop the ingredient itself before ranking
    potential_neighbors = filter_out_forbidden_neigbours(ingredient_name, potential_neighbors)
    sorted_neighbors = custom_potential_neighbors_sort(potential_neighbors)
    # keep only labels that appeared at least `thresh` times
    sorted_neighbors2 = []
    for key, value in sorted_neighbors:
        if len(value) >= thresh:
            sorted_neighbors2.append((key, value))
    # sorted_neighbors = [(key, value) for key, value in sorted_neighbors if len(value) >= len(ingredient_embeddings)] # remove too rare ones
    # further removal
    # NOTE(review): raises IndexError here when sorted_neighbors2 is empty;
    # that is caught by the broad except below and turned into a None return
    relative_lengths = [len(elem[1]) / (len(sorted_neighbors2[0][1])) for elem in sorted_neighbors2]
    final_neighbors = []
    for idx in range(len(relative_lengths)):
        if relative_lengths[idx] >= 0.0: # Currently doesn't sort anything out
            final_neighbors.append(sorted_neighbors2[idx])
    try:
        # zip(*...) transposes [(name, dists), ...] into (names, dists_lists)
        return list(zip(*final_neighbors))[0]
    except Exception as e:
        # no qualifying neighbors — caller treats None as "no substitutes found"
        return None
def clean_ingredient_name(ingredient_name, normalization_fixes):
    """Split an underscore-joined name, apply normalization fixes per word, rejoin with spaces."""
    fixed_words = [
        normalization_fixes.get(word, word)
        for word in ingredient_name.split('_')
    ]
    return ' '.join(fixed_words)
def clean_substitutes(subtitutes, normalization_fixes):
    """Apply clean_ingredient_name to every substitute and return the cleaned list."""
    return [clean_ingredient_name(substitute, normalization_fixes) for substitute in subtitutes]
# def test_eval():
# return ["Zucker", "Eier", "Reis", "Spaghetti", "Wein", "Gouda_junger"]
def main():
    """Generate substitute candidates for every ingredient of each listed model.

    For each model directory: compute (or load cached) BERT embeddings for all
    ingredients, fit a (approximate) kNN classifier over them, query the
    nearest neighbors of every ingredient, and dump the resulting
    {ingredient: [substitutes]} dict to <model>/eval/substitute_pairs_<thresh>.json.
    """
    # models = ["Versions/vers1/", "Versions/vers2/"]
    # models = ["final_Versions/models/vers1/", "final_Versions/models/vers2/", "final_Versions/models/vers3/"]
    models = ["final_Versions/models/vers2/"]
    # minimum number of times a label must appear as a neighbor to count
    thresh = 100
    # models = ["test/"]
    # os.makedirs('data/eval')
    # test_substitute_pairs_path = 'Versions/test_substitute_pairs.json'
    # normalization_fixes_path = Path('data/eval/normalization_correction.json')
    # at most this many embeddings (sentences) are kept per ingredient
    max_embedding_count = 100
    # image_embedding_dim = 768
    # use the Annoy-based approximate classifier instead of exact sklearn kNN
    approx_knn = True
    # compare models
    for curr_model in models:
        # os.makedirs(curr_model + "eval/")
        substitute_pairs_path = curr_model + "eval/substitute_pairs_" + str(thresh) + ".json"
        # get embeddings for all ingredients
        ingredients_to_embeddings = generate_food_embedding_dict(max_sentence_count=max_embedding_count, model_path=curr_model+"output/", eval_path=curr_model + "eval/", dataset_path=curr_model+"dataset/")
        all_ingredient_embeddings = []
        all_ingredient_labels = []
        # make list of all ingredients and all embeddings
        for key, value in ingredients_to_embeddings.items():
            all_ingredient_embeddings.append(value)
            # one label per embedding row so indices line up after concatenation
            all_ingredient_labels.extend([key] * len(value))
        all_ingredient_embeddings = np.concatenate(all_ingredient_embeddings)
        all_ingredient_labels = np.stack(all_ingredient_labels)
        # get knn classifier
        if approx_knn:
            knn_classifier: Union[KNNClassifier, ApproxKNNClassifier] = ApproxKNNClassifier(
                all_ingredient_embeddings=all_ingredient_embeddings,
                max_embedding_count=max_embedding_count, save_path=Path(curr_model + "eval/" + 'approx_knn_classifier.ann'))
        else:
            knn_classifier: Union[KNNClassifier, ApproxKNNClassifier] = KNNClassifier(
                all_ingredient_embeddings=all_ingredient_embeddings,
                max_embedding_count=max_embedding_count, save_path=Path(curr_model + "eval/" + 'knn_classifier.joblib'))
        # get substitutes via knn classifier
        substitute_pairs = set()
        none_counter = 0
        subst_dict = {}
        for ingredient_name in tqdm(ingredients_to_embeddings.keys(), total=len(ingredients_to_embeddings)):
            substitutes = get_nearest_N_neigbours(ingredient_name=ingredient_name,
                                                  ingredients_to_embeddings=ingredients_to_embeddings,
                                                  all_ingredient_labels=all_ingredient_labels,
                                                  knn_classifier=knn_classifier, thresh=thresh)
            # None means no neighbor cleared the threshold for this ingredient
            if substitutes is None:
                none_counter += 1
                subst_dict[ingredient_name] = []
            else:
                subst_dict[ingredient_name] = list(substitutes)
            #
            # cleaned_substitutes = clean_substitutes(substitutes, normalization_fixes)
            # for cleaned_substitute in cleaned_substitutes:
            #     substitute_pairs.add((clean_ingredient_name(ingredient_name, normalization_fixes), cleaned_substitute))
        with open(substitute_pairs_path, 'w') as f:
            json.dump(subst_dict, f, ensure_ascii=False, indent=4)
        print(f'Nones: {none_counter}')
    # output = {}
    # for ing in ingredients:
    #     output[ing] = []
    # for model in all_subs.keys():
    #     for ing in ingredients:
    #         output[ing].append(all_subs[model][ing])
    #
    # with open(test_substitute_pairs_path, 'w') as f:
    #     json.dump(output, f, ensure_ascii=False, indent=4)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,48 @@
import json
import statistics
def main():
    """Print substitute-count and occurrence statistics for the ground truth."""
    ground_truth_path = "data/ground_truth.json"
    # ground_truth_path = "evaluation/engl_data/engl_ground_truth.json"
    with open(ground_truth_path, "r") as whole_json_file:
        ground_truth_dict = json.load(whole_json_file)
    ingredients_path = "data/mult_ingredients_nice.json"
    # ingredients_path = "data/cleaned_steps_occurrance.json"
    with open(ingredients_path, "r") as whole_json_file:
        ingredients_occurrences = json.load(whole_json_file)
    synonyms_path = "data/synonyms.json"
    with open(synonyms_path, "r") as whole_json_file:
        synonyms_dict = json.load(whole_json_file)
    # ingredients considered specific to German cuisine, tallied separately
    german_words = ["Spätzle", "Schwarzbrot", "Schupfnudeln", "Bratwürste_Nürnberger", "Vanillinzucker", "Bier", "Semmelknödel", "Rote_Bete", "Eisbeine", "Spargel_weiß"]
    german_total = 0
    other_total = 0
    all_counts = []
    occurrence_count = []
    for base, substitutes in ground_truth_dict.items():
        # print(base + " substitutes: " + str(len(substitutes)))
        all_counts.append(len(substitutes))
        # fold the synonyms' occurrences into the base ingredient's count
        curr_occurrences = ingredients_occurrences[base]
        for syn in synonyms_dict.get(base, []):
            curr_occurrences += ingredients_occurrences[syn]
        occurrence_count.append(curr_occurrences)
        print(base + " occurrences: " + str(curr_occurrences))
        if base in german_words:
            german_total += len(substitutes)
        else:
            other_total += len(substitutes)
    print("Average: " + str(statistics.mean(all_counts)))
    print("Median: " + str(statistics.median(all_counts)))
    print("Standard deviation: " + str(statistics.stdev(all_counts)))
    print("Min: " + str(min(all_counts)))
    print("Max: " + str(max(all_counts)))
    # print("german total: " + str(german_total))
    # print("other total: " + str(other_total))


main()

View File

@@ -0,0 +1,45 @@
# adapted from:
# Pellegrini., C., E. Özsoy., M. Wintergerst., and G. Groh. (2021).
# “Exploiting Food Embeddings for Ingredient Substitution.”
# In: Proceedings of the 14th International Joint Conference on Biomedical
# Engineering Systems and Technologies - Volume 5: HEALTHINF, INSTICC.
# SciTePress, pp. 6777. isbn: 978-989-758-490-9. doi: 10.5220/0010202000670077.
from pathlib import Path
import numpy as np
from annoy import AnnoyIndex
from tqdm import tqdm
# Full guide https://github.com/spotify/annoy
class ApproxKNNClassifier:
    """Approximate nearest-neighbor index over ingredient embeddings, backed by Annoy.

    The index uses angular distance and is persisted to *save_path*; when the
    file already exists it is memory-mapped instead of rebuilt.
    """

    def __init__(self, all_ingredient_embeddings, max_embedding_count,
                 save_path=Path('data/eval/approx_knn_classifier.ann'), n_trees=10):
        # embedding dimensionality, taken from the last axis of the input array
        vector_length = all_ingredient_embeddings.shape[-1]
        self.max_embedding_count = max_embedding_count
        if save_path.exists():
            print('Loading Existing Approx Classifier')
            self.approx_knn_classifier = AnnoyIndex(vector_length, 'angular')
            self.approx_knn_classifier.load(str(save_path)) # super fast, will just mmap the file
        else:
            # To make sure we don't just get ourselves: add max_embedding_count
            self.approx_knn_classifier = AnnoyIndex(vector_length, 'angular') # Length of item vector that will be indexed
            for i in tqdm(range(len(all_ingredient_embeddings)), total=len(all_ingredient_embeddings), desc='Creating Approx Classifier'):
                self.approx_knn_classifier.add_item(i, all_ingredient_embeddings[i])
            self.approx_knn_classifier.build(n_trees)
            print('Saving Approx Classifier')
            self.approx_knn_classifier.save(str(save_path))

    def k_nearest_neighbors(self, ingredient_embeddings):
        """Query the index for each embedding; returns (distances, indices) arrays.

        Both returned arrays have shape (len(ingredient_embeddings),
        max_embedding_count + 200), mirroring the exact KNNClassifier interface.
        """
        all_indices, all_distances = [], []
        for idx, ingredient_embedding in enumerate(
                ingredient_embeddings): # search_k gives you a run-time tradeoff between better accuracy and speed currently defaults
            indices, distances = self.approx_knn_classifier.get_nns_by_vector(ingredient_embedding, self.max_embedding_count + 200, include_distances=True)
            all_indices.append(indices)
            all_distances.append(distances)
        return np.stack(all_distances), np.stack(all_indices)

View File

@@ -0,0 +1,152 @@
# adapted from:
# Pellegrini., C., E. Özsoy., M. Wintergerst., and G. Groh. (2021).
# “Exploiting Food Embeddings for Ingredient Substitution.”
# In: Proceedings of the 14th International Joint Conference on Biomedical
# Engineering Systems and Technologies - Volume 5: HEALTHINF, INSTICC.
# SciTePress, pp. 6777. isbn: 978-989-758-490-9. doi: 10.5220/0010202000670077.
import json
import pickle
import random
import re
from collections import defaultdict
from pathlib import Path
import numpy as np
import torch
from tqdm import tqdm
from evaluation.helpers.prediction_model import PredictionModel
def _generate_food_sentence_dict(model_path):
    """Map every known food item to the instruction sentences that mention it.

    Reads the known ingredients from data/mult_ingredients_nice.json and the
    train/test instruction sentences from *model_path*; sentences longer than
    100 whitespace-separated tokens are discarded.
    """
    with open('data/mult_ingredients_nice.json', "r") as f:
        food_items = json.load(f)
    food_items_set = set(food_items.keys())
    with open(model_path + 'training_data.txt', "r") as f:
        train_instruction_sentences = f.read().splitlines()
    # remove overlong sentences
    train_instruction_sentences = [s for s in train_instruction_sentences if len(s.split()) <= 100]
    with open(model_path + 'testing_data.txt', "r") as f:
        test_instruction_sentences = f.read().splitlines()
    # remove overlong sentences
    test_instruction_sentences = [s for s in test_instruction_sentences if len(s.split()) <= 100]
    instruction_sentences = train_instruction_sentences + test_instruction_sentences
    food_to_sentences_dict = defaultdict(list)
    for sentence in instruction_sentences:
        # NOTE(review): the pattern "[^\w]-'" matches a non-word char followed by
        # the literal "-'" — presumably "[^\w\-']" (split on anything but word
        # chars, hyphen, apostrophe) was intended; confirm before changing.
        words = re.sub("[^\w]-'", " ", sentence).split()
        for word in words:
            if word in food_items_set:
                food_to_sentences_dict[word].append(sentence)
    return food_to_sentences_dict
def _random_sample_with_min_count(population, k):
if len(population) <= k:
return population
else:
return random.sample(population, k)
def sample_random_sentence_dict(model_path, max_sentence_count):
    """Build the food→sentences dict and cap every list at max_sentence_count random picks."""
    full_sentence_dict = _generate_food_sentence_dict(model_path=model_path)
    # only keep up to max_sentence_count randomly selected sentences per food item
    sampled = {}
    for food, sentences in full_sentence_dict.items():
        sampled[food] = _random_sample_with_min_count(sentences, max_sentence_count)
    return sampled
def _map_ingredients_to_input_ids(model_path):
    """Map every known ingredient name to its tokenizer input id.

    Loads the ingredient vocabulary from data/mult_ingredients_nice.json and
    converts each name to an id with the prediction model's tokenizer.
    Returns a dict {ingredient_name: token_id}.
    """
    with open('data/mult_ingredients_nice.json', "r") as f:
        ingredients = json.load(f).keys()
    # the tokenizer is taken from the fine-tuned model at model_path
    model = PredictionModel(model_path)
    ingredient_ids = model.tokenizer.convert_tokens_to_ids(ingredients)
    ingredient_ids_dict = dict(zip(ingredients, ingredient_ids))
    return ingredient_ids_dict
def _merge_synonmys(food_to_embeddings_dict, max_sentence_count):
synonmy_replacements_path = Path('foodbert_embeddings/data/synonmy_replacements.json')
synonmy_replacements = {}
merged_dict = defaultdict(list)
# Merge ingredients
for key, value in food_to_embeddings_dict.items():
if key in synonmy_replacements:
key_to_use = synonmy_replacements[key]
else:
key_to_use = key
merged_dict[key_to_use].append(value)
merged_dict = {k: np.concatenate(v) for k, v in merged_dict.items()}
# When embedding count exceeds maximum allowed, reduce back to requested count
for key, value in merged_dict.items():
if len(value) > max_sentence_count:
index = np.random.choice(value.shape[0], max_sentence_count, replace=False)
new_value = value[index]
merged_dict[key] = new_value
return merged_dict
def generate_food_embedding_dict(max_sentence_count, model_path, eval_path='data/eval/', dataset_path="output"):
    '''
    Creates a dict where the keys are the ingredients and the values are a list of embeddings with length max_sentence_count or less if there are less occurences
    These embeddings are used in generate_substitutes.py to predict substitutes

    The result is cached as <eval_path>/food_embeddings_dict.pkl: if that file
    exists it is loaded and returned directly; otherwise the embeddings are
    computed with the model at model_path and the cache is written.
    '''
    food_to_embeddings_dict_path = Path(eval_path + 'food_embeddings_dict.pkl')
    # fast path: return the cached embeddings if they were computed before
    if food_to_embeddings_dict_path.exists():
        with food_to_embeddings_dict_path.open('rb') as f:
            food_to_embeddings_dict = pickle.load(f)
        # # delete keys if we deleted ingredients
        # old_ingredients = set(food_to_embeddings_dict.keys())
        # with open('train_model/vocab/used_ingredients.json', "r") as f:
        #     new_ingredients = set(json.load(f))
        #
        # keys_to_delete = old_ingredients.difference(new_ingredients)
        # for key in keys_to_delete:
        #     food_to_embeddings_dict.pop(key, None) # delete key if it exists
        #
        # # merge new synonyms
        # food_to_embeddings_dict = _merge_synonmys(food_to_embeddings_dict, max_sentence_count)
        #
        # with food_to_embeddings_dict_path.open('wb') as f:
        #     pickle.dump(food_to_embeddings_dict, f) # Overwrite dict with cleaned version
        return food_to_embeddings_dict
    print('Sampling Random Sentences')
    # up to max_sentence_count sentences per food item, sampled at random
    food_to_sentences_dict_random_samples = sample_random_sentence_dict(model_path=dataset_path, max_sentence_count=max_sentence_count)
    food_to_embeddings_dict = defaultdict(list)
    print('Mapping Ingredients to Input Ids')
    all_ingredient_ids = _map_ingredients_to_input_ids(model_path=model_path)
    prediction_model = PredictionModel(model_path=model_path)
    for food, sentences in tqdm(food_to_sentences_dict_random_samples.items(), total=len(food_to_sentences_dict_random_samples),
                                desc='Calculating Embeddings for Food items'):
        embeddings, ingredient_ids = prediction_model.predict_embeddings(sentences)
        # get embedding of food word
        embeddings_flat = embeddings.view((-1, 768))
        ingredient_ids_flat = torch.stack(ingredient_ids).flatten()
        food_id = all_ingredient_ids[food]
        # select only the token positions that belong to this food word
        food_embeddings = embeddings_flat[ingredient_ids_flat == food_id].cpu().numpy()
        food_to_embeddings_dict[food].extend(food_embeddings)
    food_to_embeddings_dict = {k: np.stack(v) for k, v in food_to_embeddings_dict.items()}
    # Clean synonmy
    food_to_embeddings_dict = _merge_synonmys(food_to_embeddings_dict, max_sentence_count)
    # persist the dict so subsequent runs hit the cache above
    with food_to_embeddings_dict_path.open('wb') as f:
        pickle.dump(food_to_embeddings_dict, f)
    return food_to_embeddings_dict

View File

@@ -0,0 +1,38 @@
# adapted from:
# Pellegrini., C., E. Özsoy., M. Wintergerst., and G. Groh. (2021).
# “Exploiting Food Embeddings for Ingredient Substitution.”
# In: Proceedings of the 14th International Joint Conference on Biomedical
# Engineering Systems and Technologies - Volume 5: HEALTHINF, INSTICC.
# SciTePress, pp. 6777. isbn: 978-989-758-490-9. doi: 10.5220/0010202000670077.
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset
class InstructionsDataset(Dataset):
    """Torch dataset of tokenized instruction sentences, padded to equal length."""

    def __init__(self, tokenizer, sentences):
        # sentences are tokenized once up front; anything past 512 tokens is truncated
        self.tokenizer = tokenizer
        batch_encoding = tokenizer.batch_encode_plus(sentences, add_special_tokens=True, max_length=512, truncation=True)
        self.examples = batch_encoding["input_ids"]
        self.examples = self._tensorize_batch([torch.tensor(elem) for elem in self.examples])

    def _tensorize_batch(self, examples) -> torch.Tensor:
        """Stack the example tensors, padding to the longest one when lengths differ."""
        length_of_first = examples[0].size(0)
        are_tensors_same_length = all(x.size(0) == length_of_first for x in examples)
        if are_tensors_same_length:
            # no padding needed — a plain stack yields a (batch, seq_len) tensor
            return torch.stack(examples, dim=0)
        else:
            # padding requires the tokenizer to define a pad token
            if self.tokenizer._pad_token is None:
                raise ValueError(
                    "You are attempting to pad samples but the tokenizer you are using"
                    f" ({self.tokenizer.__class__.__name__}) does not have one."
                )
            return pad_sequence(examples, batch_first=True, padding_value=self.tokenizer.pad_token_id)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        return self.examples[i]

View File

@@ -0,0 +1,36 @@
# adapted from:
# Pellegrini., C., E. Özsoy., M. Wintergerst., and G. Groh. (2021).
# “Exploiting Food Embeddings for Ingredient Substitution.”
# In: Proceedings of the 14th International Joint Conference on Biomedical
# Engineering Systems and Technologies - Volume 5: HEALTHINF, INSTICC.
# SciTePress, pp. 6777. isbn: 978-989-758-490-9. doi: 10.5220/0010202000670077.
from pathlib import Path
import joblib
from sklearn.neighbors import NearestNeighbors
class KNNClassifier:
    """Thin wrapper around sklearn ``NearestNeighbors`` over ingredient embeddings.

    The fitted classifier is persisted to *save_path* and reloaded on the
    next run, so the expensive fit happens only once per embedding set.
    """

    def __init__(self, all_ingredient_embeddings, max_embedding_count,
                 save_path=Path('data/eval/knn_classifier.joblib')):
        """Load a cached classifier from *save_path*, or fit and cache a new one.

        :param all_ingredient_embeddings: array-like of embeddings to index
        :param max_embedding_count: max embeddings per ingredient; neighbor
            count is padded by this so queries do not only return themselves
        :param save_path: cache location for the fitted classifier
        """
        if save_path.exists():
            print('Loading Existing Classifier')
            self.knn_classifier: NearestNeighbors = joblib.load(save_path)
        else:
            print('Training New Classifier')
            # To make sure we don't just get ourselves: add max_embedding_count
            self.knn_classifier: NearestNeighbors = NearestNeighbors(n_neighbors=max_embedding_count + 200, n_jobs=12,
                                                                     algorithm='brute')  # kd_tree, ball_tree or brute
            self.knn_classifier.fit(all_ingredient_embeddings)
            print('Saving Classifier')
            # joblib.dump fails if the target directory is missing, so create it first.
            save_path.parent.mkdir(parents=True, exist_ok=True)
            joblib.dump(self.knn_classifier, save_path)
        # NOTE(review): _fit_method is a private sklearn attribute, used here for logging only.
        print(f'\nKNN with: {self.knn_classifier._fit_method} and leaf size: {self.knn_classifier.leaf_size}\n')

    def k_nearest_neighbors(self, ingredient_embeddings):
        """Return ``(distances, indices)`` of the pre-configured neighbor count."""
        distances, indices = self.knn_classifier.kneighbors(ingredient_embeddings, return_distance=True)
        return distances, indices

View File

@@ -0,0 +1,53 @@
# adapted from:
# Pellegrini., C., E. Özsoy., M. Wintergerst., and G. Groh. (2021).
# “Exploiting Food Embeddings for Ingredient Substitution.”
# In: Proceedings of the 14th International Joint Conference on Biomedical
# Engineering Systems and Technologies - Volume 5: HEALTHINF, INSTICC.
# SciTePress, pp. 6777. isbn: 978-989-758-490-9. doi: 10.5220/0010202000670077.
import json
import torch
from torch.utils.data import DataLoader
from transformers import BertModel, BertTokenizer
from evaluation.helpers.instructions_dataset import InstructionsDataset
class PredictionModel:
    """Runs a fine-tuned BERT to produce contextual token embeddings for recipe text."""

    def __init__(self, model_path=''):
        """Load the BERT checkpoint and the custom ingredient-aware tokenizer.

        :param model_path: directory containing the fine-tuned BERT weights
        """
        self.model: BertModel = BertModel.from_pretrained(
            pretrained_model_name_or_path=model_path)
        with open('train_model/vocab/used_ingredients.json', 'r') as f:
            used_ingredients = json.load(f)
        # never_split keeps each ingredient as one token, so every ingredient
        # maps to exactly one input id (and therefore one embedding position).
        self.tokenizer = BertTokenizer(vocab_file='train_model/vocab/bert_vocab.txt', do_lower_case=False,
                                       max_len=512, never_split=used_ingredients, truncation=True)
        # Run on GPU when available, otherwise CPU.
        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        self.model.to(self.device)

    def predict_embeddings(self, sentences):
        """Embed *sentences* and return their token embeddings plus input ids.

        :param sentences: iterable of instruction sentences
        :return: tuple of (stacked per-token embeddings, list of input-id
            tensors aligned one-to-one with the embedding rows)
        """
        dataset = InstructionsDataset(tokenizer=self.tokenizer, sentences=sentences)
        dataloader = DataLoader(dataset, batch_size=100, pin_memory=True)
        embeddings = []
        ingredient_ids = []
        for batch in dataloader:
            batch = batch.to(self.device)
            with torch.no_grad():  # inference only — no gradients needed
                embeddings_batch = self.model(batch)
            # embeddings_batch[0] is the first element of the model output
            # (the per-token hidden states for the batch).
            embeddings.extend(embeddings_batch[0])
            ingredient_ids.extend(batch)
        # NOTE(review): torch.stack assumes every batch was padded to the same
        # sequence length across dataloader batches — confirm for multi-batch input.
        return torch.stack(embeddings), ingredient_ids

    def compute_embedding_for_ingredient(self, sentence, ingredient_name):
        """Return the embedding of *ingredient_name*'s first occurrence in *sentence*.

        NOTE(review): assumes the ingredient token actually occurs in the
        sentence; otherwise ``food_embedding`` is empty and ``[0]`` raises.
        """
        embeddings, ingredient_ids = self.predict_embeddings([sentence])
        embeddings_flat = embeddings.view((-1, 768))  # flatten (batch, seq) positions
        ingredient_ids_flat = torch.stack(ingredient_ids).flatten()
        food_id = self.tokenizer.convert_tokens_to_ids(ingredient_name)
        # Select every position whose input id equals the ingredient's id.
        food_embedding = embeddings_flat[ingredient_ids_flat == food_id].cpu().numpy()
        return food_embedding[0]

View File

@@ -0,0 +1,166 @@
import json
# Paths to the German synonym map and the ground-truth substitute pairs.
synonyms_path = "data/synonyms.json"
ground_truth_path = "data/ground_truth.json"
# Ingredients that act as whole categories: synonyms of these are only
# collapsed onto the category name under the special rules below.
category_subs = ["Paprika", "Apfel", "Gouda", "Huhn", "Gans", "Kaninchen", "Kalbfleisch", "Schweinefleisch", "Ente", "Lamm",
                 "Pute", "Wildfleisch", "Rindfleisch", "Speck", "Fisch", "Kohl", "Blattsalat", "Schokolade", "Kuvertüre", "Kuchenglasur",
                 "Honig", "Sirup", "Joghurt", "Essig", "Traubensaft", "Truthahn", "Wein"]
# NOTE(review): loaded at import time — this module has side effects on import.
with open("final_Versions/models/vers3/eval/complete_substitute_pairs_50.json") as f:
    sub_dict = json.load(f)
def engl_combined_substitutes_dict(found_substitutes_dict):
    """Clean the English substitute lists.

    Drops substitutes identical to the ingredient, drops substitutes that are
    synonyms of the ingredient, and replaces any substitute that is itself a
    synonym by its (unique) base ingredient name. Ambiguous synonyms (more
    than one base) are printed and skipped.

    :param found_substitutes_dict: ingredient -> list of found substitutes
    :return: new dict with the same keys and deduplicated, cleaned lists
    """
    with open("evaluation/engl_data/engl_synonyms.json", "r") as whole_json_file:
        synonyms_dict = json.load(whole_json_file)
    # The English ground-truth file was previously loaded here but never used;
    # the dead read has been removed.
    reversed_synonyms_dict = get_reversed_syn_dict(is_engl=True)
    new_found_sub_dict = {}
    for ingredient in found_substitutes_dict.keys():
        new_found_sub_dict[ingredient] = []
        current_subs = set()
        for sub in found_substitutes_dict[ingredient]:
            # skip substitute if it is the same as the ingredient
            if sub == ingredient:
                continue
            # skip substitute if it is a synonym of the ingredient
            if ingredient in synonyms_dict.keys() and sub in synonyms_dict[ingredient]:
                continue
            # if the substitute is a synonym of something, map it to its base
            if sub in reversed_synonyms_dict.keys():
                if len(reversed_synonyms_dict[sub]) == 1:
                    current_subs.add(reversed_synonyms_dict[sub][0])
                else:
                    # ambiguous synonym: report it and skip
                    print(sub + " is in " + str(reversed_synonyms_dict[sub]))
            else:
                current_subs.add(sub)
        new_found_sub_dict[ingredient] += list(current_subs)
    return new_found_sub_dict
# merges substitutes with their synonyms, replaces synonyms with base synonym
def combined_substitutes_dict(found_substitutes_dict):
    """Clean the German substitute lists, respecting category ingredients.

    Drops substitutes identical to the ingredient and substitutes that are
    synonyms of the ingredient. A substitute that is a synonym of exactly one
    base is replaced by that base, unless the base is a category ingredient
    (``category_subs``) and the ingredient itself is not — then the original
    substitute is kept. With two candidate bases, the category/non-category
    one is chosen depending on whether the ingredient is a category; other
    ambiguities are printed and skipped.

    :param found_substitutes_dict: ingredient -> list of found substitutes
    :return: new dict with the same keys and deduplicated, cleaned lists
    """
    with open(synonyms_path, "r") as whole_json_file:
        synonyms_dict = json.load(whole_json_file)
    # The ground-truth file was previously loaded here but never used;
    # the dead read has been removed.
    reversed_synonyms_dict = get_reversed_syn_dict()
    new_found_sub_dict = {}
    for ingredient in found_substitutes_dict.keys():
        new_found_sub_dict[ingredient] = []
        current_subs = set()
        for sub in found_substitutes_dict[ingredient]:
            # skip substitute if it is the same as the ingredient
            if sub == ingredient:
                continue
            # skip substitute if it is a synonym of the ingredient
            if ingredient in synonyms_dict.keys() and sub in synonyms_dict[ingredient]:
                continue
            # if the substitute is a synonym of something, map it to a base
            if sub in reversed_synonyms_dict.keys():
                if len(reversed_synonyms_dict[sub]) == 1:
                    if reversed_synonyms_dict[sub][0] not in category_subs:
                        current_subs.add(reversed_synonyms_dict[sub][0])
                    else:
                        # only collapse onto a category base if the ingredient
                        # itself is a category; otherwise keep the substitute
                        if ingredient in category_subs:
                            current_subs.add(reversed_synonyms_dict[sub][0])
                        else:
                            current_subs.add(sub)
                elif len(reversed_synonyms_dict[sub]) == 2:
                    if ingredient in category_subs:
                        # pick the category base when exactly one of the two is one
                        if reversed_synonyms_dict[sub][0] in category_subs and reversed_synonyms_dict[sub][1] not in category_subs:
                            current_subs.add(reversed_synonyms_dict[sub][0])
                        elif reversed_synonyms_dict[sub][1] in category_subs and reversed_synonyms_dict[sub][0] not in category_subs:
                            current_subs.add(reversed_synonyms_dict[sub][1])
                        else:
                            print(reversed_synonyms_dict[sub])
                    else:
                        # pick the non-category base when exactly one of the two is one
                        if reversed_synonyms_dict[sub][0] in category_subs and reversed_synonyms_dict[sub][1] not in category_subs:
                            current_subs.add(reversed_synonyms_dict[sub][1])
                        elif reversed_synonyms_dict[sub][1] in category_subs and reversed_synonyms_dict[sub][0] not in category_subs:
                            current_subs.add(reversed_synonyms_dict[sub][0])
                        else:
                            print(reversed_synonyms_dict[sub])
                else:
                    # three or more bases: report and skip
                    print(sub + " is in " + str(reversed_synonyms_dict[sub]))
            else:
                current_subs.add(sub)
        new_found_sub_dict[ingredient] += list(current_subs)
    return new_found_sub_dict
# combine substitutes found for an ingredient and its synonyms
# also combine synonyms in substitutes
def combine_all_synonyms(found_substitutes_dict):
    """Merge the substitute lists of each ingredient with those of its synonyms.

    Every synonym's substitutes are folded into its base ingredient's entry
    (category synonyms excluded via ``get_reversed_syn_dict_no_cat``), then
    the combined dict is cleaned via ``combined_substitutes_dict``.

    :param found_substitutes_dict: ingredient -> list of found substitutes
    :return: cleaned dict keyed by base ingredient names
    """
    reversed_synonyms_dict = get_reversed_syn_dict_no_cat()
    merged = {}
    for ingredient, subs in found_substitutes_dict.items():
        if ingredient in reversed_synonyms_dict:
            # NOTE(review): only the first base is used when a synonym maps
            # to several bases — confirm this is intended.
            target = reversed_synonyms_dict[ingredient][0]
        else:
            target = ingredient
        # setdefault avoids the KeyError the original raised when a synonym's
        # base ingredient was not itself a key of found_substitutes_dict.
        merged.setdefault(target, set()).update(subs)
    new_found_sub_dict_list = {ingredient: list(subs) for ingredient, subs in merged.items()}
    return combined_substitutes_dict(new_found_sub_dict_list)
def get_reversed_syn_dict(is_engl=False):
    """Invert the synonym map: synonym -> list of base ingredients.

    :param is_engl: load the English synonym file instead of the German one
    :return: dict mapping each synonym to all bases that list it
    """
    path = "evaluation/engl_data/engl_synonyms.json" if is_engl else synonyms_path
    with open(path, "r") as whole_json_file:
        synonyms_dict = json.load(whole_json_file)
    reversed_synonyms_dict = {}
    for base, synonyms in synonyms_dict.items():
        for synonym in synonyms:
            reversed_synonyms_dict.setdefault(synonym, []).append(base)
    return reversed_synonyms_dict
def get_reversed_syn_dict_no_cat():
    """Invert the German synonym map, skipping category ingredients.

    Like ``get_reversed_syn_dict`` but bases listed in ``category_subs`` are
    left out, so their synonyms are not collapsed.

    :return: dict mapping each synonym to its non-category base ingredients
    """
    with open(synonyms_path, "r") as whole_json_file:
        synonyms_dict = json.load(whole_json_file)
    reversed_synonyms_dict = {}
    for base, synonyms in synonyms_dict.items():
        if base in category_subs:
            continue
        for synonym in synonyms:
            reversed_synonyms_dict.setdefault(synonym, []).append(base)
    return reversed_synonyms_dict
# Run the synonym cleanup on the loaded substitute pairs; the return value is
# discarded, so this call is only useful for its printed diagnostics.
combined_substitutes_dict(sub_dict)

View File

@@ -0,0 +1,42 @@
{
"carrot": {
"engl": 0,
"ger": 0
},
"cherry": {
"engl": 3,
"ger": 0
},
"chicken": {
"engl": 1,
"ger": 4
},
"parsley": {
"engl": 3,
"ger": 0
},
"chocolate": {
"engl": 3,
"ger": 0
},
"bacon": {
"engl": 2,
"ger": 0
},
"kale": {
"engl": 2,
"ger": 0
},
"sugar": {
"engl": 2,
"ger": 0
},
"brie": {
"engl": 3,
"ger": 0
},
"turkey": {
"engl": 3,
"ger": 0
}
}

View File

@@ -0,0 +1,42 @@
{
"carrot": {
"engl": 0,
"ger": 0
},
"cherry": {
"engl": 3,
"ger": 0
},
"chicken": {
"engl": 1,
"ger": 0
},
"parsley": {
"engl": 3,
"ger": 0
},
"chocolate": {
"engl": 3,
"ger": 0
},
"bacon": {
"engl": 2,
"ger": 0
},
"kale": {
"engl": 2,
"ger": 0
},
"sugar": {
"engl": 2,
"ger": 0
},
"brie": {
"engl": 3,
"ger": 0
},
"turkey": {
"engl": 3,
"ger": 0
}
}

View File

@@ -0,0 +1,42 @@
{
"carrot": {
"engl": 0,
"ger": 0
},
"cherry": {
"engl": 3,
"ger": 0
},
"chicken": {
"engl": 1,
"ger": 0
},
"parsley": {
"engl": 3,
"ger": 0
},
"chocolate": {
"engl": 3,
"ger": 0
},
"bacon": {
"engl": 2,
"ger": 0
},
"kale": {
"engl": 2,
"ger": 0
},
"sugar": {
"engl": 2,
"ger": 0
},
"brie": {
"engl": 3,
"ger": 0
},
"turkey": {
"engl": 3,
"ger": 0
}
}

View File

@@ -0,0 +1,42 @@
{
"carrot": {
"engl": 0,
"ger": 0
},
"cherry": {
"engl": 3,
"ger": 0
},
"chicken": {
"engl": 1,
"ger": 0
},
"parsley": {
"engl": 3,
"ger": 0
},
"chocolate": {
"engl": 3,
"ger": 0
},
"bacon": {
"engl": 2,
"ger": 0
},
"kale": {
"engl": 2,
"ger": 0
},
"sugar": {
"engl": 2,
"ger": 0
},
"brie": {
"engl": 3,
"ger": 0
},
"turkey": {
"engl": 3,
"ger": 0
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,94 @@
cap at 30 set to True
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.417429094236048
number of ingredients in dataset: 4313
number of nones: 3035
ingredients with over 30 substitutes: 71
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 2.044284720612103
cap at 30 set to False
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.440988106129917
number of ingredients in dataset: 4313
number of nones: 3035
ingredients with over 30 substitutes: 71
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 2.3382796197542315
--------------------------------------------
ground truth only:
cap at 30 set to True
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.417429094236048
number of ingredients in dataset: 40
number of nones: 29
ingredients with over 30 substitutes: 0
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 2.75
cap at 30 set to False
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.440988106129917
number of ingredients in dataset: 40
number of nones: 29
ingredients with over 30 substitutes: 0
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 2.75
================================
englisch:
cap at 30 set to True
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.417429094236048
number of ingredients in dataset: 4361
number of nones: 0
ingredients with over 30 substitutes: 22
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 6.433616143086448
cap at 30 set to False
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.440988106129917
number of ingredients in dataset: 4361
number of nones: 0
ingredients with over 30 substitutes: 22
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 6.457234579224949
--------------------------------------------
ground truth only:
cap at 30 set to True
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.417429094236048
number of ingredients in dataset: 10
number of nones: 0
ingredients with over 30 substitutes: 0
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 2.7
cap at 30 set to False
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.440988106129917
number of ingredients in dataset: 10
number of nones: 0
ingredients with over 30 substitutes: 0
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 2.7

View File

@@ -0,0 +1,42 @@
{
"carrot": {
"engl": 0,
"ger": 0
},
"cherry": {
"engl": 3,
"ger": 2
},
"chicken": {
"engl": 1,
"ger": 6
},
"parsley": {
"engl": 3,
"ger": 0
},
"chocolate": {
"engl": 3,
"ger": 0
},
"bacon": {
"engl": 2,
"ger": 0
},
"kale": {
"engl": 2,
"ger": 0
},
"sugar": {
"engl": 2,
"ger": 1
},
"brie": {
"engl": 3,
"ger": 0
},
"turkey": {
"engl": 3,
"ger": 0
}
}

View File

@@ -0,0 +1,42 @@
{
"carrot": {
"engl": 0,
"ger": 0
},
"cherry": {
"engl": 3,
"ger": 1
},
"chicken": {
"engl": 1,
"ger": 0
},
"parsley": {
"engl": 3,
"ger": 0
},
"chocolate": {
"engl": 3,
"ger": 0
},
"bacon": {
"engl": 2,
"ger": 0
},
"kale": {
"engl": 2,
"ger": 0
},
"sugar": {
"engl": 2,
"ger": 0
},
"brie": {
"engl": 3,
"ger": 0
},
"turkey": {
"engl": 3,
"ger": 0
}
}

View File

@@ -0,0 +1,42 @@
{
"carrot": {
"engl": 0,
"ger": 0
},
"cherry": {
"engl": 3,
"ger": 0
},
"chicken": {
"engl": 1,
"ger": 0
},
"parsley": {
"engl": 3,
"ger": 0
},
"chocolate": {
"engl": 3,
"ger": 0
},
"bacon": {
"engl": 2,
"ger": 0
},
"kale": {
"engl": 2,
"ger": 0
},
"sugar": {
"engl": 2,
"ger": 0
},
"brie": {
"engl": 3,
"ger": 0
},
"turkey": {
"engl": 3,
"ger": 0
}
}

View File

@@ -0,0 +1,42 @@
{
"carrot": {
"engl": 0,
"ger": 0
},
"cherry": {
"engl": 3,
"ger": 0
},
"chicken": {
"engl": 1,
"ger": 3
},
"parsley": {
"engl": 3,
"ger": 0
},
"chocolate": {
"engl": 3,
"ger": 0
},
"bacon": {
"engl": 2,
"ger": 0
},
"kale": {
"engl": 2,
"ger": 0
},
"sugar": {
"engl": 2,
"ger": 0
},
"brie": {
"engl": 3,
"ger": 0
},
"turkey": {
"engl": 3,
"ger": 0
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,94 @@
cap at 30 set to True
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.417429094236048
number of ingredients in dataset: 4313
number of nones: 2294
ingredients with over 30 substitutes: 272
ingredients with over 100 substitutes: 10
ingredients with over 1000 substitutes: 0
average number of substitutes: 4.630883375840482
cap at 30 set to False
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.440988106129917
number of ingredients in dataset: 4313
number of nones: 2294
ingredients with over 30 substitutes: 272
ingredients with over 100 substitutes: 10
ingredients with over 1000 substitutes: 0
average number of substitutes: 6.102480871782982
--------------------------------------------
ground truth only:
cap at 30 set to True
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.417429094236048
number of ingredients in dataset: 40
number of nones: 21
ingredients with over 30 substitutes: 1
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 5.375
cap at 30 set to False
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.440988106129917
number of ingredients in dataset: 40
number of nones: 21
ingredients with over 30 substitutes: 1
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 5.6
================================
englisch:
cap at 30 set to True
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.417429094236048
number of ingredients in dataset: 4361
number of nones: 0
ingredients with over 30 substitutes: 22
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 6.433616143086448
cap at 30 set to False
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.440988106129917
number of ingredients in dataset: 4361
number of nones: 0
ingredients with over 30 substitutes: 22
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 6.457234579224949
--------------------------------------------
ground truth only:
cap at 30 set to True
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.417429094236048
number of ingredients in dataset: 10
number of nones: 0
ingredients with over 30 substitutes: 0
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 2.7
cap at 30 set to False
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.440988106129917
number of ingredients in dataset: 10
number of nones: 0
ingredients with over 30 substitutes: 0
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 2.7

View File

@@ -0,0 +1,42 @@
{
"carrot": {
"engl": 0,
"ger": 3
},
"cherry": {
"engl": 3,
"ger": 4
},
"chicken": {
"engl": 1,
"ger": 5
},
"parsley": {
"engl": 3,
"ger": 8
},
"chocolate": {
"engl": 3,
"ger": 0
},
"bacon": {
"engl": 2,
"ger": 4
},
"kale": {
"engl": 2,
"ger": 7
},
"sugar": {
"engl": 2,
"ger": 0
},
"brie": {
"engl": 3,
"ger": 0
},
"turkey": {
"engl": 3,
"ger": 4
}
}

View File

@@ -0,0 +1,42 @@
{
"carrot": {
"engl": 0,
"ger": 3
},
"cherry": {
"engl": 3,
"ger": 3
},
"chicken": {
"engl": 1,
"ger": 0
},
"parsley": {
"engl": 3,
"ger": 0
},
"chocolate": {
"engl": 3,
"ger": 0
},
"bacon": {
"engl": 2,
"ger": 3
},
"kale": {
"engl": 2,
"ger": 6
},
"sugar": {
"engl": 2,
"ger": 0
},
"brie": {
"engl": 3,
"ger": 0
},
"turkey": {
"engl": 3,
"ger": 1
}
}

View File

@@ -0,0 +1,42 @@
{
"carrot": {
"engl": 0,
"ger": 0
},
"cherry": {
"engl": 3,
"ger": 1
},
"chicken": {
"engl": 1,
"ger": 2
},
"parsley": {
"engl": 3,
"ger": 0
},
"chocolate": {
"engl": 3,
"ger": 0
},
"bacon": {
"engl": 2,
"ger": 1
},
"kale": {
"engl": 2,
"ger": 6
},
"sugar": {
"engl": 2,
"ger": 0
},
"brie": {
"engl": 3,
"ger": 0
},
"turkey": {
"engl": 3,
"ger": 1
}
}

View File

@@ -0,0 +1,42 @@
{
"carrot": {
"engl": 0,
"ger": 0
},
"cherry": {
"engl": 3,
"ger": 2
},
"chicken": {
"engl": 1,
"ger": 5
},
"parsley": {
"engl": 3,
"ger": 2
},
"chocolate": {
"engl": 3,
"ger": 0
},
"bacon": {
"engl": 2,
"ger": 2
},
"kale": {
"engl": 2,
"ger": 7
},
"sugar": {
"engl": 2,
"ger": 0
},
"brie": {
"engl": 3,
"ger": 0
},
"turkey": {
"engl": 3,
"ger": 4
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,93 @@
cap at 30 set to True
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.417429094236048
number of ingredients in dataset: 4313
number of nones: 1604
ingredients with over 30 substitutes: 573
ingredients with over 100 substitutes: 4
ingredients with over 1000 substitutes: 0
average number of substitutes: 8.59169951309993
cap at 30 set to False
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.440988106129917
number of ingredients in dataset: 4313
number of nones: 1604
ingredients with over 30 substitutes: 573
ingredients with over 100 substitutes: 4
ingredients with over 1000 substitutes: 0
average number of substitutes: 11.539067934152563
--------------------------------------------
ground truth only:
cap at 30 set to True
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.417429094236048
number of ingredients in dataset: 40
number of nones: 3
ingredients with over 30 substitutes: 14
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 16.8
cap at 30 set to False
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.440988106129917
number of ingredients in dataset: 40
number of nones: 3
ingredients with over 30 substitutes: 14
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 23.25
================================
englisch:
cap at 30 set to True
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.417429094236048
number of ingredients in dataset: 4361
number of nones: 0
ingredients with over 30 substitutes: 22
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 6.433616143086448
cap at 30 set to False
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.440988106129917
number of ingredients in dataset: 4361
number of nones: 0
ingredients with over 30 substitutes: 22
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 6.457234579224949
--------------------------------------------
ground truth only:
cap at 30 set to True
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.417429094236048
number of ingredients in dataset: 10
number of nones: 0
ingredients with over 30 substitutes: 0
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 2.7
cap at 30 set to False
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.440988106129917
number of ingredients in dataset: 10
number of nones: 0
ingredients with over 30 substitutes: 0
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 2.7

View File

@@ -0,0 +1,42 @@
{
"carrot": {
"engl": 0,
"ger": 3
},
"cherry": {
"engl": 3,
"ger": 4
},
"chicken": {
"engl": 1,
"ger": 5
},
"parsley": {
"engl": 3,
"ger": 5
},
"chocolate": {
"engl": 3,
"ger": 0
},
"bacon": {
"engl": 2,
"ger": 1
},
"kale": {
"engl": 2,
"ger": 2
},
"sugar": {
"engl": 2,
"ger": 0
},
"brie": {
"engl": 3,
"ger": 0
},
"turkey": {
"engl": 3,
"ger": 3
}
}

View File

@@ -0,0 +1,42 @@
{
"carrot": {
"engl": 0,
"ger": 3
},
"cherry": {
"engl": 3,
"ger": 3
},
"chicken": {
"engl": 1,
"ger": 0
},
"parsley": {
"engl": 3,
"ger": 0
},
"chocolate": {
"engl": 3,
"ger": 0
},
"bacon": {
"engl": 2,
"ger": 0
},
"kale": {
"engl": 2,
"ger": 1
},
"sugar": {
"engl": 2,
"ger": 0
},
"brie": {
"engl": 3,
"ger": 0
},
"turkey": {
"engl": 3,
"ger": 1
}
}

View File

@@ -0,0 +1,42 @@
{
"carrot": {
"engl": 0,
"ger": 0
},
"cherry": {
"engl": 3,
"ger": 0
},
"chicken": {
"engl": 1,
"ger": 0
},
"parsley": {
"engl": 3,
"ger": 0
},
"chocolate": {
"engl": 3,
"ger": 0
},
"bacon": {
"engl": 2,
"ger": 0
},
"kale": {
"engl": 2,
"ger": 1
},
"sugar": {
"engl": 2,
"ger": 0
},
"brie": {
"engl": 3,
"ger": 0
},
"turkey": {
"engl": 3,
"ger": 1
}
}

View File

@@ -0,0 +1,42 @@
{
"carrot": {
"engl": 0,
"ger": 0
},
"cherry": {
"engl": 3,
"ger": 0
},
"chicken": {
"engl": 1,
"ger": 5
},
"parsley": {
"engl": 3,
"ger": 0
},
"chocolate": {
"engl": 3,
"ger": 0
},
"bacon": {
"engl": 2,
"ger": 0
},
"kale": {
"engl": 2,
"ger": 2
},
"sugar": {
"engl": 2,
"ger": 0
},
"brie": {
"engl": 3,
"ger": 0
},
"turkey": {
"engl": 3,
"ger": 3
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,94 @@
cap at 30 set to True
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.417429094236048
number of ingredients in dataset: 4313
number of nones: 2353
ingredients with over 30 substitutes: 231
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 4.939485277069325
cap at 30 set to False
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.440988106129917
number of ingredients in dataset: 4313
number of nones: 2353
ingredients with over 30 substitutes: 231
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 5.935311847901692
--------------------------------------------
ground truth only:
cap at 30 set to True
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.417429094236048
number of ingredients in dataset: 40
number of nones: 8
ingredients with over 30 substitutes: 5
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 11.95
cap at 30 set to False
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.440988106129917
number of ingredients in dataset: 40
number of nones: 8
ingredients with over 30 substitutes: 5
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 13.4
================================
englisch:
cap at 30 set to True
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.417429094236048
number of ingredients in dataset: 4361
number of nones: 0
ingredients with over 30 substitutes: 22
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 6.433616143086448
cap at 30 set to False
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.440988106129917
number of ingredients in dataset: 4361
number of nones: 0
ingredients with over 30 substitutes: 22
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 6.457234579224949
--------------------------------------------
ground truth only:
cap at 30 set to True
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.417429094236048
number of ingredients in dataset: 10
number of nones: 0
ingredients with over 30 substitutes: 0
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 2.7
cap at 30 set to False
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.440988106129917
number of ingredients in dataset: 10
number of nones: 0
ingredients with over 30 substitutes: 0
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 2.7

View File

@@ -0,0 +1,42 @@
{
"carrot": {
"engl": 0,
"ger": 1
},
"cherry": {
"engl": 3,
"ger": 4
},
"chicken": {
"engl": 1,
"ger": 5
},
"parsley": {
"engl": 3,
"ger": 3
},
"chocolate": {
"engl": 3,
"ger": 0
},
"bacon": {
"engl": 2,
"ger": 0
},
"kale": {
"engl": 2,
"ger": 0
},
"sugar": {
"engl": 2,
"ger": 0
},
"brie": {
"engl": 3,
"ger": 0
},
"turkey": {
"engl": 3,
"ger": 1
}
}

View File

@@ -0,0 +1,42 @@
{
"carrot": {
"engl": 0,
"ger": 0
},
"cherry": {
"engl": 3,
"ger": 2
},
"chicken": {
"engl": 1,
"ger": 0
},
"parsley": {
"engl": 3,
"ger": 0
},
"chocolate": {
"engl": 3,
"ger": 0
},
"bacon": {
"engl": 2,
"ger": 0
},
"kale": {
"engl": 2,
"ger": 0
},
"sugar": {
"engl": 2,
"ger": 0
},
"brie": {
"engl": 3,
"ger": 0
},
"turkey": {
"engl": 3,
"ger": 1
}
}

View File

@@ -0,0 +1,42 @@
{
"carrot": {
"engl": 0,
"ger": 0
},
"cherry": {
"engl": 3,
"ger": 0
},
"chicken": {
"engl": 1,
"ger": 0
},
"parsley": {
"engl": 3,
"ger": 0
},
"chocolate": {
"engl": 3,
"ger": 0
},
"bacon": {
"engl": 2,
"ger": 0
},
"kale": {
"engl": 2,
"ger": 0
},
"sugar": {
"engl": 2,
"ger": 0
},
"brie": {
"engl": 3,
"ger": 0
},
"turkey": {
"engl": 3,
"ger": 1
}
}

View File

@@ -0,0 +1,42 @@
{
"carrot": {
"engl": 0,
"ger": 0
},
"cherry": {
"engl": 3,
"ger": 0
},
"chicken": {
"engl": 1,
"ger": 0
},
"parsley": {
"engl": 3,
"ger": 0
},
"chocolate": {
"engl": 3,
"ger": 0
},
"bacon": {
"engl": 2,
"ger": 0
},
"kale": {
"engl": 2,
"ger": 0
},
"sugar": {
"engl": 2,
"ger": 0
},
"brie": {
"engl": 3,
"ger": 0
},
"turkey": {
"engl": 3,
"ger": 1
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,94 @@
cap at 30 set to True
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.417429094236048
number of ingredients in dataset: 4313
number of nones: 2996
ingredients with over 30 substitutes: 100
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 2.6978900996985855
cap at 30 set to False
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.440988106129917
number of ingredients in dataset: 4313
number of nones: 2996
ingredients with over 30 substitutes: 100
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 2.9476002782286113
--------------------------------------------
ground truth only:
cap at 30 set to True
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.417429094236048
number of ingredients in dataset: 40
number of nones: 14
ingredients with over 30 substitutes: 2
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 7.45
cap at 30 set to False
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.440988106129917
number of ingredients in dataset: 40
number of nones: 14
ingredients with over 30 substitutes: 2
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 7.65
================================
englisch:
cap at 30 set to True
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.417429094236048
number of ingredients in dataset: 4361
number of nones: 0
ingredients with over 30 substitutes: 22
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 6.433616143086448
cap at 30 set to False
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.440988106129917
number of ingredients in dataset: 4361
number of nones: 0
ingredients with over 30 substitutes: 22
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 6.457234579224949
--------------------------------------------
ground truth only:
cap at 30 set to True
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.417429094236048
number of ingredients in dataset: 10
number of nones: 0
ingredients with over 30 substitutes: 0
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 2.7
cap at 30 set to False
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.440988106129917
number of ingredients in dataset: 10
number of nones: 0
ingredients with over 30 substitutes: 0
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 2.7

View File

@@ -0,0 +1,207 @@
from transformers import BertTokenizer
import json
def print_stats(model_substitutes_dict, cap_at_30, english_total=4372):
    """Print substitute-count statistics for an ingredient->substitutes dict.

    First prints reference statistics for the English FoodBERT substitute
    pairs, then the statistics for ``model_substitutes_dict``.

    Args:
        model_substitutes_dict: mapping ingredient -> list of substitutes.
        cap_at_30: if True, an ingredient with more than 30 substitutes only
            contributes 30 to the average (outliers are capped).
        english_total: denominator for the English averages — the total number
            of English ingredients in the dataset, including those for which
            no substitute was found (defaults to the original hard-coded 4372).
    """
    print("\ncap at 30 set to " + str(cap_at_30))
    evaluation_path = "evaluation/"
    # Reference numbers: group the English (ingredient, substitute) pairs
    # into ingredient -> [substitutes].
    with open(evaluation_path + "engl_data/substitute_pairs_foodbert_text.json", "r") as whole_json_file:
        engl_list = json.load(whole_json_file)
    engl_dict = {}
    for pair in engl_list:
        engl_dict.setdefault(pair[0], []).append(pair[1])
    substitute_sum = 0
    over30 = 0
    for ingred in engl_dict:
        curr_nr = len(engl_dict[ingred])
        if curr_nr > 30:
            over30 += 1
        substitute_sum += min(curr_nr, 30) if cap_at_30 else curr_nr
    print("english ingredients with over 30 substitutes: " + str(over30))
    # Ingredients without any substitute do not appear in engl_dict at all.
    print("english nones: " + str(english_total - len(engl_dict.keys())))
    print("average amount of substitutes found for english ingredients: " + str(substitute_sum / english_total))
    # Statistics for the dict passed in by the caller.
    substitute_sum = 0
    over100 = 0
    over1000 = 0
    over30 = 0
    nones = 0
    for ingred in model_substitutes_dict:
        curr_nr = len(model_substitutes_dict[ingred])
        if curr_nr == 0:
            nones += 1
        if curr_nr > 100:
            over100 += 1
        if curr_nr > 1000:
            over1000 += 1
        if curr_nr > 30:
            over30 += 1
        substitute_sum += min(curr_nr, 30) if cap_at_30 else curr_nr
    print("number of ingredients in dataset: " + str(len(model_substitutes_dict.keys())))
    print("number of nones: " + str(nones))
    print("ingredients with over 30 substitutes: " + str(over30))
    print("ingredients with over 100 substitutes: " + str(over100))
    print("ingredients with over 1000 substitutes: " + str(over1000))
    print("average number of substitutes: " + str(substitute_sum / len(model_substitutes_dict.keys())))
def main():
    """Collapse synonym ingredients in the model's substitute predictions onto
    their base words, then print substitute statistics for the German model
    (all ingredients and ground-truth-only) and for the English baseline.
    """
    evaluation_path = "evaluation/"
    synonyms_path = "synonyms.json"
    data_path = "data/"
    engl_data_path = evaluation_path + "engl_data/"
    found_substitutes_path = "final_Versions/models/vers2/eval/complete_substitute_pairs_50.json"
    with open(found_substitutes_path, "r") as whole_json_file:
        model_substitutes_dict = json.load(whole_json_file)
    with open(data_path + synonyms_path, "r") as whole_json_file:
        synonyms_dict = json.load(whole_json_file)
    # Category entries whose "synonyms" are really sub-types (e.g. kinds of
    # meat), not interchangeable spellings — these must not be collapsed.
    category_subs = ["Paprika", "Apfel", "Gouda", "Huhn", "Gans", "Kaninchen", "Kalbfleisch", "Schweinefleisch", "Ente", "Lamm",
                     "Pute", "Wildfleisch", "Rindfleisch", "Speck", "Fisch", "Kohl", "Blattsalat", "Schokolade", "Kuvertüre", "Kuchenglasur",
                     "Honig", "Sirup", "Joghurt", "Essig", "Traubensaft", "Geflügelfleisch", "Wein", "Suppenfleisch"]
    # Invert the synonym dict: map every synonym to its base word.
    new_syn_dict = {}
    for ingred in synonyms_dict.keys():
        if ingred not in category_subs:
            for syn in synonyms_dict[ingred]:
                new_syn_dict[syn] = ingred
    # Initialise a result set for every ingredient that is itself a base word.
    final_dict = {}
    for ingred in model_substitutes_dict.keys():
        if ingred not in new_syn_dict.keys():
            final_dict[ingred] = set()
    # Collapse both the substitutes and the ingredient keys onto base words.
    for ingred in model_substitutes_dict.keys():
        curr_set = {new_syn_dict.get(sub, sub) for sub in model_substitutes_dict[ingred]}
        if ingred not in new_syn_dict:
            final_dict[ingred] |= curr_set
        else:
            base_word = new_syn_dict[ingred]
            # setdefault guards against a base word that never occurs as a key
            # of model_substitutes_dict (plain indexing would raise KeyError).
            final_dict.setdefault(base_word, set()).update(curr_set)
    # An ingredient must not list itself as its own substitute.
    for ingred in final_dict.keys():
        final_dict[ingred].discard(ingred)
    new_final_dict = {ingred: list(subs) for ingred, subs in final_dict.items()}
    # NOTE(review): the reload below discards the synonym-merged dict computed
    # above and evaluates the raw file contents instead — confirm this
    # override is intentional and not leftover debugging.
    with open(found_substitutes_path, "r") as whole_json_file:
        new_final_dict = json.load(whole_json_file)
    print_stats(new_final_dict, cap_at_30=True)
    print_stats(new_final_dict, cap_at_30=False)
    print("--------------------------------------------\nground truth only: ")
    with open("data/ground_truth.json", "r") as whole_json_file:
        ground_truth = json.load(whole_json_file)
    ground_truth_only = {}
    for ingred in new_final_dict.keys():
        if ingred in ground_truth.keys():
            ground_truth_only[ingred] = new_final_dict[ingred]
    print_stats(ground_truth_only, cap_at_30=True)
    print_stats(ground_truth_only, cap_at_30=False)
    print("================================\nenglisch:")
    with open(engl_data_path + "substitute_pairs_foodbert_text.json", "r") as whole_json_file:
        engl_list = json.load(whole_json_file)
    with open(engl_data_path + "engl_ground_truth.json", "r") as whole_json_file:
        engl_ground_truth = json.load(whole_json_file)
    # Group the English (ingredient, substitute) pairs by ingredient.
    engl_dict = {}
    for pair in engl_list:
        engl_dict.setdefault(pair[0], []).append(pair[1])
    print_stats(engl_dict, cap_at_30=True)
    print_stats(engl_dict, cap_at_30=False)
    print("--------------------------------------------\nground truth only: ")
    ground_truth_only = {}
    for ingred in engl_dict.keys():
        if ingred in engl_ground_truth.keys():
            ground_truth_only[ingred] = engl_dict[ingred]
    print_stats(ground_truth_only, cap_at_30=True)
    print_stats(ground_truth_only, cap_at_30=False)


if __name__ == "__main__":
    main()