initial commit of project

This commit is contained in:
2021-04-11 19:51:12 +02:00
commit a21a8186d9
110 changed files with 16326178 additions and 0 deletions

0
evaluation/README.md Normal file
View File

View File

@@ -0,0 +1,29 @@
import json


def main(eval_path="final_Versions/models/vers2/eval/",
         file_name="substitute_pairs_65.json",
         data_path="data/",
         occurances_path="mult_ingredients_nice.json"):
    """Complete the model's substitute file so it covers every known ingredient.

    Reads the model-produced substitute pairs (``eval_path + file_name``) and
    the ingredient-occurrence table (``data_path + occurances_path``), then
    writes ``eval_path + "complete_" + file_name`` containing one entry per
    ingredient in the occurrence table: the model's substitute list where one
    exists, otherwise an empty list.

    All parameters default to the original hard-coded locations, so existing
    callers (``main()``) behave exactly as before.
    """
    with open(eval_path + file_name, "r") as whole_json_file:
        model_substitutes_dict = json.load(whole_json_file)
    with open(data_path + occurances_path, "r") as whole_json_file:
        occurrences_dict = json.load(whole_json_file)

    all_substitutes = {}
    for ingredient in occurrences_dict:
        # default to "no known substitutes" when the model produced none
        all_substitutes[ingredient] = model_substitutes_dict.get(ingredient, [])

    print(str(len(all_substitutes.keys())))  # number of ingredients written
    out_path = eval_path + "complete_" + file_name
    with open(out_path, 'w') as f:
        json.dump(all_substitutes, f, ensure_ascii=False, indent=4)


if __name__ == "__main__":  # guard so the module can be imported without side effects
    main()

110
evaluation/dataset_stats.py Normal file
View File

@@ -0,0 +1,110 @@
import json
import statistics
def dataset(full_dataset_path):
    """Print summary statistics for the scraped recipe dataset.

    Reports, over all recipes in the JSON file at ``full_dataset_path``:
    ingredient count (total/mean/median), how many recipes have a non-default
    picture, comment counts (total/mean/median plus recipes without comments)
    and instruction counts (total/mean/median).

    Args:
        full_dataset_path: path to a JSON object mapping recipe URL -> recipe
            dict with keys 'ingredients', 'image', 'comments', 'instructions'.
    """
    with open(full_dataset_path, "r") as whole_json_file:
        full_dataset = json.load(whole_json_file)

    ingredient_lengths = []
    comment_lengths = []
    instruction_lengths = []
    pic_counter = 0
    no_comments = 0
    counter = 0
    for recipe in full_dataset.values():
        ingredient_lengths.append(len(recipe['ingredients']))
        # the default "nopicture" placeholder counts as "no picture"
        if recipe['image'] != "https://img.chefkoch-cdn.de/img/default/layout/recipe-nopicture.jpg":
            pic_counter += 1
        if recipe['comments']:
            comment_lengths.append(len(recipe['comments']))
        else:
            comment_lengths.append(0)
            no_comments += 1
        instruction_lengths.append(len(recipe['instructions']))
        counter += 1
        print(counter)  # progress output, one line per recipe

    # totals derived from the per-recipe lengths (original kept parallel counters)
    ingredient_counter = sum(ingredient_lengths)
    comment_counter = sum(comment_lengths)
    instruction_counter = sum(instruction_lengths)
    n_recipes = len(full_dataset.keys())

    print("number of recipes: " + str(n_recipes))
    print("\n")
    print("average ingredient count: " + str(ingredient_counter / n_recipes))
    print("median ingredient count: " + str(statistics.median(ingredient_lengths)))
    print("\n")
    print("number of recipes with picture: " + str(pic_counter))
    print("\n")
    print("number of comments: " + str(comment_counter))
    print("number of recipes withOUT comments: " + str(no_comments))
    print("average amount of comments: " + str(comment_counter / n_recipes))
    print("median comment count: " + str(statistics.median(comment_lengths)))
    print("\n")
    print("total instruction count: " + str(instruction_counter))
    print("average instruction count: " + str(instruction_counter / n_recipes))
    print("median instruction count: " + str(statistics.median(instruction_lengths)))
def ingredients_before(full_dataset_path):
    """Print the number of distinct ingredient strings in the raw dataset.

    Args:
        full_dataset_path: path to a JSON object mapping recipe URL -> recipe
            dict with an 'ingredients' list of strings.
    """
    with open(full_dataset_path, "r") as whole_json_file:
        full_dataset = json.load(whole_json_file)
    # a set gives O(1) membership instead of the original O(n) list scan
    all_ingredients = set()
    counter = 0
    for url in full_dataset:
        counter += 1
        print(counter)  # progress output, one line per recipe
        all_ingredients.update(full_dataset[url]['ingredients'])
    print(str(len(all_ingredients)))
def ingredient_stats(ingredients_list_path="data/mult_ingredients_nice.json",
                     ingredients_instructions_path="data/cleaned_steps_occurrance.json"):
    """Print mean/median occurrence statistics for ingredients.

    Compares how often each ingredient appears in recipe ingredient lists
    versus in instruction texts, and finally prints the instruction counts
    sorted ascending.

    Args:
        ingredients_list_path: JSON mapping ingredient -> occurrence count in
            ingredient lists (defaults to the original hard-coded path).
        ingredients_instructions_path: JSON mapping ingredient -> occurrence
            count in instruction texts (defaults to the original path).
    """
    with open(ingredients_list_path, "r") as whole_json_file:
        ingred_lists = json.load(whole_json_file)
    with open(ingredients_instructions_path, "r") as whole_json_file:
        ingred_instruct = json.load(whole_json_file)

    print("in ingredient lists: ")
    ingred_counts = list(ingred_lists.values())
    print("average: " + str(sum(ingred_counts) / len(ingred_lists.keys())))
    print("median: " + str(statistics.median(ingred_counts)))

    print("in instructions: ")
    instruct_counts = list(ingred_instruct.values())
    # NOTE: despite the label "nones" below, this counts ingredients that
    # appear FEWER THAN 5 times in instructions, not exactly zero times.
    none_counts = sum(1 for count in instruct_counts if count < 5)
    print("average: " + str(sum(instruct_counts) / len(ingred_instruct.keys())))
    print("median: " + str(statistics.median(instruct_counts)))
    print("nones: " + str(none_counts))

    # dump the full table sorted by instruction-occurrence count, ascending
    sorted_instruct = dict(sorted(ingred_instruct.items(), key=lambda item: item[1]))
    print(sorted_instruct)
def main():
    """Entry point: run the statistics report selected below.

    The other reports are kept (commented out) so they can be re-enabled by
    uncommenting the relevant call.
    """
    before_dataset_path = "data/dataset_fin.json"  # raw dataset, pre-cleaning
    full_dataset_path = "Versions/vers3/full_dataset.json"  # final dataset
    # dataset(full_dataset_path)
    # ingredients_before(before_dataset_path)
    ingredient_stats()


if __name__ == "__main__":  # guard so the module can be imported without side effects
    main()

View File

@@ -0,0 +1,216 @@
{
"carrot": [
"parsnip",
"daikon",
"turnip",
"celery",
"squash",
"celery root",
"sweet potato",
"yam",
"radish",
"potato",
"pumpkin",
"green papaya",
"swede",
"beet",
"rutabaga",
"red bell pepper",
"yellow squash",
"butternut squash",
"root vegetable",
"parsley root"
],
"cherry": [
"acerola",
"apricot",
"plum",
"nectarine",
"raspberry",
"grape",
"strawberry",
"currant",
"blackberry",
"frozen mixed berry",
"peach",
"cranberry",
"dried cranberry",
"blueberry",
"maraschino",
"berry",
"prune"
],
"chicken": [
"turkey",
"rabbit",
"oyster mushroom",
"squab",
"veal",
"fish",
"tofu",
"beef",
"extra firm tofu",
"pork",
"seitan",
"duck",
"capon",
"lamb",
"venison",
"mushroom",
"shrimp",
"quail",
"goose"
],
"parsley": [
"chervil",
"cilantro",
"tarragon",
"basil",
"oregano",
"chopped cilantro",
"lovage",
"dill",
"fresh coriander",
"coriander",
"rosemary",
"caper",
"fresh cilantro",
"fresh dill",
"thyme",
"fresh oregano",
"chive",
"mint",
"fresh basil",
"fresh thyme",
"dried basil",
"dried oregano",
"fresh chive",
"dried thyme"
],
"chocolate": [
"truffle",
"nutella",
"ganache",
"cocoa powder",
"sugar",
"jam",
"marshmallow",
"cocoa",
"candy",
"caramel",
"peanut butter"
],
"bacon": [
"pancetta",
"prosciutto",
"speck",
"smoked sausage",
"smoked ham",
"parma ham",
"ham",
"salami",
"pepperoni",
"guanciale",
"chorizo",
"salt pork",
"kielbasa",
"pork rind",
"cubed ham",
"italian sausage",
"crouton",
"capicola",
"hard salami",
"lardon",
"cooked ham",
"corned beef",
"bologna"
],
"kale": [
"collard green",
"turnip green",
"spinach",
"chinese cabbage",
"leek",
"escarole",
"spring green",
"chard",
"green cabbage",
"savoy cabbage",
"cabbage",
"cauliflower",
"collard",
"watercres",
"arugula",
"broccoli rabe",
"spinach leaves",
"lettuce",
"romaine lettuce",
"baby spinach",
"mizuna"
],
"sugar": [
"splenda",
"honey",
"stevia",
"sweetener",
"liquid stevia",
"corn syrup",
"splenda granular",
"liquid sweetener",
"brown rice syrup",
"turbinado",
"maple syrup",
"pure maple syrup",
"jaggery",
"sweetened condensed milk",
"artificial sweetener",
"agave nectar",
"sweet chocolate",
"chocolate",
"caramel",
"vanilla",
"molasse",
"golden syrup",
"syrup"
],
"brie": [
"camembert",
"reblochon",
"gorgonzola",
"cheese spread",
"cheddar",
"goat cheese",
"havarti",
"boursin",
"blue cheese",
"roquefort",
"monterey jack",
"gouda",
"fontina",
"provolone cheese",
"stilton",
"feta",
"processed cheese"
],
"turkey": [
"chicken",
"rabbit",
"duck",
"ham",
"pheasant",
"goose",
"capon",
"beef",
"venison",
"lamb",
"pork",
"hen",
"roast beef",
"veal",
"poultry",
"chicken breast",
"chicken thigh",
"quail",
"pork chop"
]
}

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,14 @@
{
"squash": ["pumpkin"],
"sweet potato": ["yam"],
"cilantro": ["coriander"],
"fresh cilantro": ["chopped cilantro", "fresh coriander"],
"dill": ["fresh dill"],
"oregano": ["fresh oregano"],
"basil": ["fresh basil"],
"thyme": ["fresh thyme"],
"chive": ["fresh chive"],
"cabbage":["collard"],
"maple syrup": ["pure maple syrup"],
"sweetener": ["artificial sweetener"]
}

View File

@@ -0,0 +1,203 @@
{
"carrot": [
"parsnip",
"daikon",
"turnip",
"celery",
"squash",
"celery root",
"sweet potato",
"radish",
"potato",
"green papaya",
"swede",
"beet",
"rutabaga",
"red bell pepper",
"yellow squash",
"butternut squash",
"root vegetable",
"parsley root"
],
"cherry": [
"acerola",
"apricot",
"plum",
"nectarine",
"raspberry",
"grape",
"strawberry",
"currant",
"blackberry",
"frozen mixed berry",
"peach",
"cranberry",
"dried cranberry",
"blueberry",
"maraschino",
"berry",
"prune"
],
"chicken": [
"turkey",
"rabbit",
"oyster mushroom",
"squab",
"veal",
"fish",
"tofu",
"beef",
"extra firm tofu",
"pork",
"seitan",
"duck",
"capon",
"lamb",
"venison",
"mushroom",
"shrimp",
"quail",
"goose"
],
"parsley": [
"chervil",
"cilantro",
"tarragon",
"basil",
"oregano",
"lovage",
"dill",
"rosemary",
"caper",
"fresh cilantro",
"thyme",
"chive",
"mint",
"dried basil",
"dried oregano",
"dried thyme"
],
"chocolate": [
"truffle",
"nutella",
"ganache",
"cocoa powder",
"sugar",
"jam",
"marshmallow",
"cocoa",
"candy",
"caramel",
"peanut butter"
],
"bacon": [
"pancetta",
"prosciutto",
"speck",
"smoked sausage",
"smoked ham",
"parma ham",
"ham",
"salami",
"pepperoni",
"guanciale",
"chorizo",
"salt pork",
"kielbasa",
"pork rind",
"cubed ham",
"italian sausage",
"crouton",
"capicola",
"hard salami",
"lardon",
"cooked ham",
"corned beef",
"bologna"
],
"kale": [
"collard green",
"turnip green",
"spinach",
"chinese cabbage",
"leek",
"escarole",
"spring green",
"chard",
"green cabbage",
"savoy cabbage",
"cabbage",
"cauliflower",
"watercres",
"arugula",
"broccoli rabe",
"spinach leaves",
"lettuce",
"romaine lettuce",
"baby spinach",
"mizuna"
],
"sugar": [
"splenda",
"honey",
"stevia",
"sweetener",
"liquid stevia",
"corn syrup",
"splenda granular",
"liquid sweetener",
"brown rice syrup",
"turbinado",
"maple syrup",
"jaggery",
"sweetened condensed milk",
"agave nectar",
"sweet chocolate",
"chocolate",
"caramel",
"vanilla",
"molasse",
"golden syrup",
"syrup"
],
"brie": [
"camembert",
"reblochon",
"gorgonzola",
"cheese spread",
"cheddar",
"goat cheese",
"havarti",
"boursin",
"blue cheese",
"roquefort",
"monterey jack",
"gouda",
"fontina",
"provolone cheese",
"stilton",
"feta",
"processed cheese"
],
"turkey": [
"chicken",
"rabbit",
"duck",
"ham",
"pheasant",
"goose",
"capon",
"beef",
"venison",
"lamb",
"pork",
"hen",
"roast beef",
"veal",
"poultry",
"chicken breast",
"chicken thigh",
"quail",
"pork chop"
]
}

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,168 @@
{
"carrot": "Karotte",
"parsnip": "Pastinake",
"daikon": "Rettich",
"turnip": "Steckrübe",
"celery": "Staudensellerie",
"squash": "Kürbis",
"sweet potato": "Süßkartoffel",
"yam": "Süßkartoffel",
"radish": "Radieschen",
"potato": "Kartoffel",
"pumpkin": "Kürbis",
"beet": "Rübe",
"red bell pepper": "Paprika_rot",
"butternut squash": "Butternusskürbis",
"parsley root": "Petersilienwurzel",
"cherry": "Kirsche",
"apricot": "Aprikose",
"plum": "Pflaume",
"nectarine": "Nektarine",
"raspberry": "Himbeeren",
"grape": "Weintrauben",
"strawberry": "Erdbeeren",
"currant": "Johannisbeeren",
"blackberry": "Brombeeren",
"frozen mixed berry": "Beeren_gemischte",
"peach": "Pfirsich",
"cranberry": "Cranberries",
"dried cranberry": "Cranberries_getrocknet",
"blueberry": "Blaubeeren",
"maraschino": "Maraschino",
"berry": "Beeren",
"prune": "Trockenpflaumen",
"chicken": "Huhn",
"turkey": "Truthahn",
"rabbit": "Kaninchen",
"oyster mushroom": "Austernpilze",
"veal": "Kalbfleisch",
"fish": "Fisch",
"tofu": "Tofu",
"beef": "Rindfleisch",
"extra firm tofu": "Tofu_fester",
"pork": "Schweinefleisch",
"seitan": "Seitan",
"duck": "Ente",
"lamb": "Lamm",
"venison": "Wildfleisch",
"mushroom": "Pilze",
"shrimp": "Shrimps",
"quail": "Wachtel",
"goose": "Gans",
"parsley": "Petersilie",
"chervil": "Kerbel",
"cilantro": "Koriander",
"tarragon": "Estragon",
"basil": "Basilikum",
"oregano": "Oregano",
"chopped cilantro": "Koriandergrün",
"lovage": "Liebstöckel",
"dill": "Dill",
"fresh coriander": "Koriandergrün",
"coriander": "Koriander",
"rosemary": "Rosmarin",
"caper": "Kapern",
"fresh cilantro": "Koriandergrün",
"fresh dill": "Dill",
"thyme": "Thymian",
"fresh oregano": "Oregano",
"chive": "Schnittlauch",
"mint": "Minze",
"fresh basil": "Basilikum",
"fresh thyme": "Thymian",
"dried basil": "Basilikum_getrockneter",
"dried oregano": "Oregano_getrocknet",
"fresh chive": "Schnittlauch",
"dried thyme": "Thymian_getrocknet",
"chocolate": "Schokolade",
"nutella": "Nutella",
"cocoa powder": "Kakaopulver_Instant",
"sugar": "Zucker",
"jam": "Marmelade",
"marshmallow": "Marshmallow",
"cocoa": "Kakao",
"candy": "Süßigkeiten",
"peanut butter": "Erdnussbutter",
"bacon": "Frühstücksspeck",
"pancetta": "Pancetta",
"prosciutto": "Schinken_Prosciutto",
"speck": "Speck",
"smoked ham": "Schinken_rohen",
"parma ham": "Parmaschinken",
"ham": "Kochschinken",
"salami": "Salami",
"chorizo": "Chorizo",
"kielbasa": "Wurst_Krakauer",
"pork rind": "Schweineschwarte",
"cubed ham": "Schinkenwürfel",
"crouton": "Croûtons",
"lardon": "Speckwürfel",
"cooked ham": "Kochschinken",
"corned beef": "Corned_Beef",
"bologna": "Wurst_Mortadella",
"kale": "Grünkohl",
"spinach": "Spinat",
"chinese cabbage": "Chinakohl",
"leek": "Lauch",
"escarole": "Endiviensalat",
"chard": "Mangold",
"savoy cabbage": "Wirsing",
"cabbage": "Kohl",
"cauliflower": "Blumenkohl",
"collard": "Kohl",
"watercres": "Brunnenkresse",
"arugula": "Rucola",
"spinach leaves": "Blattspinat",
"lettuce": "Kopfsalat",
"romaine lettuce": "Römersalat",
"baby spinach": "Babyspinat",
"sugar": "Zucker",
"honey": "Honig",
"stevia": "Stevia",
"sweetener": "Süßstoff",
"liquid stevia": "Stevia_flüssig",
"liquid sweetener": "Süßstoff_flüssigen",
"brown rice syrup": "Reissirup",
"maple syrup": "Ahornsirup",
"pure maple syrup": "Ahornsirup",
"sweetened condensed milk": "Kondensmilch_gezuckerte",
"artificial sweetener": "Süßstoff",
"agave nectar": "Agavendicksaft",
"chocolate": "Schokolade",
"vanilla": "Vanille",
"molasse": "Melasse",
"golden syrup": "Zuckerrübensirup",
"syrup": "Sirup",
"brie": "Brie",
"camembert": "Camembert",
"gorgonzola": "Gorgonzola",
"cheese spread": "Schmelzkäse",
"cheddar": "Cheddarkäse",
"goat cheese": "Ziegenkäse",
"boursin": "Doppelrahmfrischkäse",
"blue cheese": "Blauschimmelkäse",
"roquefort": "Roquefort",
"gouda": "Gouda",
"fontina": "Käse_Fontina",
"provolone cheese": "Käse_Provolone",
"feta": "Feta_Käse",
"processed cheese": "Scheiblettenkäse",
"turkey": "Truthahn",
"chicken": "Huhn",
"rabbit": "Kaninchen",
"duck": "Ente",
"ham": "Schinken",
"pheasant": "Fasan",
"goose": "Gans",
"beef": "Rindfleisch",
"venison": "Wildfleisch",
"lamb": "Lammfleisch",
"pork": "Schweinefleisch",
"roast beef": "Roastbeef",
"veal": "Kalbfleisch",
"poultry": "Geflügelfleisch",
"chicken breast": "Hähnchenfilet",
"chicken thigh": "Hühnerkeule",
"quail": "Wachtel",
"pork chop": "schweinekotelett"
}

523
evaluation/evaluate.py Normal file
View File

@@ -0,0 +1,523 @@
import json
import statistics
# ---------------------------------------------------------------------------
# Shared file locations for all evaluation routines in this module.
# NOTE(review): "occurances" is a misspelling of "occurrences" but is kept,
# since these names are used throughout the module.
# ---------------------------------------------------------------------------
data_path = "data/"
occurances_path = "mult_ingredients_nice.json"  # ingredient -> occurrence count
ground_truth_path = "ground_truth.json"  # ingredient -> accepted substitutes
engl_data_path = "evaluation/engl_data/"  # English comparison data
evaluation_path = "evaluation/"
synonyms_path = "synonyms.json"  # ingredient -> list of synonyms
# Substitute lists produced by the model variant currently under evaluation.
found_substitutes_path = "final_Versions/models/vers2/eval/complete_substitute_pairs_50.json"
# model_name = "Versions/vers3/"
# Hand-built German ground truth for ten base ingredients, mirroring the
# English ground truth used by engl_compare(); multi-word ingredient tokens
# use "_" as the word separator (e.g. "Paprika_rot").
german_ground_truth = {
    "Karotte": ["Pastinake", "Steckrübe", "Staudensellerie", "Kürbis", "Süßkartoffel", "Rettich", "Radieschen", "Kartoffel", "Paprika_rot", "Butternusskürbis", "Petersilienwurzel"],
    "Kirsche": ["Aprikose", "Pflaume", "Nektarine", "Himbeeren", "Weintrauben", "Erdbeere", "Johannisbeeren", "Brombeeren", "Beeren_gemischte", "Pfirsich", "Cranberries", "Cranberries_getrocknet", "Blaubeeren", "Maraschino", "Beeren", "Trockenpflaumen"],
    "Huhn": ["Truthahn", "Kaninchen", "Austernpilze", "Kalbfleisch", "Fisch", "Tofu", "Rindfleisch", "Tofu_fester", "Schweinefleisch", "Seitan", "Ente", "Lamm", "Pilze", "Shrimps", "Wachtel", "Gans", "Wildfleisch"],
    "Petersilie": ["Kerbel", "Koriander", "Estragon", "Basilikum", "Oregano", "Liebstöckel", "Dill", "Koriandergrün", "Rosmarin", "Kapern", "Thymian", "Schnittlauch", "Minze", "Basilikum_getrockneter", "Oregano_getrocknet", "Thymian_getrocknet"],
    "Schokolade": ["Nutella", "Kakaopulver_Instant", "Zucker", "Marmelade", "Marshmallow", "Kakao", "Süßigkeiten", "Erdnussbutter"],
    "Frühstücksspeck": ["Pancetta", "Schinken_Prosciutto", "Speck", "Schinken_rohen", "Parmaschinken", "Schinken", "Salami", "Chorizo", "Wurst_Krakauer", "Schweineschwarte", "Schinkenwürfel", "Croûtons", "Speckwürfel", "Kochschinken", "Corned_Beef", "Wurst_Mortadella"],
    "Grünkohl": ["Spinat", "Chinakohl", "Lauch", "Endiviensalat", "Mangold", "Wirsing", "Kohl", "Blumenkohl", "Brunnenkresse", "Rucola", "Blattspinat", "Kopfsalat", "Römersalat", "Babyspinat"],
    "Zucker": ["Honig", "Stevia", "Süßstoff", "Stevia_flüssig", "Süßstoff_flüssigen", "Reissirup", "Ahornsirup", "Kondensmilch_gezuckerte", "Agavendicksaft", "Schokolade", "Vanille", "Melasse", "Zuckerrübensirup", "Sirup"],
    "Brie": ["Camembert", "Gorgonzola", "Schmelzkäse", "Cheddarkäse", "Ziegenkäse", "Doppelrahmfrischkäse", "Blauschimmelkäse", "Roquefort", "Gouda", "Käse_Fontina", "Käse_Provolone", "Feta_Käse", "Scheiblettenkäse"],
    "Truthahn": ["Huhn", "Kaninchen", "Ente", "Kochschinken", "Fasan", "Gans", "Rindfleisch", "Lammfleisch", "Schweinefleisch", "Roastbeef", "Kalbfleisch", "Geflügelfleisch", "Hähnchenfilet", "Hühnerkeule", "Wachtel", "schweinekotelett", "Wildfleisch"]
}
def no_synonyms(ground_truth_dict=None, found_substitutes_dict=None, get_occurrences=True, synonyms=True):
    """Score the model's substitutes, matching synonyms of the substitutes
    but NOT expanding the base ingredient to its synonyms.

    For every base ingredient, the model's suggestion list is compared with
    the ground-truth substitutes; a ground-truth substitute counts as found
    if either it or one of its synonyms appears in the model output.
    Synonyms of the base ingredient itself are first removed from the model
    output so they cannot count as hits or misses.

    Args:
        ground_truth_dict: base ingredient -> accepted substitutes; loaded
            from ``data_path + ground_truth_path`` when falsy.
        found_substitutes_dict: base ingredient -> model substitute list;
            loaded from ``found_substitutes_path`` when falsy.
        get_occurrences: also load the occurrence table (only used by the
            commented-out per-ingredient report below).
        synonyms: load the synonym table; when False no synonym matching
            happens at all.

    Returns:
        dict: base ingredient -> list of correctly found substitutes.
    """
    if get_occurrences:
        with open(data_path + occurances_path, "r") as whole_json_file:
            occurrences_dict = json.load(whole_json_file)
    if not ground_truth_dict:
        with open(data_path+ground_truth_path, "r") as whole_json_file:
            ground_truth_dict = json.load(whole_json_file)
    if synonyms:
        with open(data_path + synonyms_path, "r") as whole_json_file:
            synonyms_dict = json.load(whole_json_file)
    else:
        synonyms_dict = {}
    if not found_substitutes_dict:
        with open(found_substitutes_path, "r") as whole_json_file:
            model_substitutes_dict = json.load(whole_json_file)
    else:
        model_substitutes_dict = found_substitutes_dict
    found_ground_ingr = {}
    correctly_found = 0
    incorrectly_found = 0
    average_precision = 0.0
    average_recall = 0.0
    number_correct_subs_found_overall = []
    total_number_subs_found_overall = []
    # base ingredient without synonyms, substitutes with synonyms
    for base_ingred in ground_truth_dict.keys():
        if get_occurrences:
            occurrences = occurrences_dict[base_ingred]
        # copy so the removals below do not mutate the caller's dict
        found_substitutes = model_substitutes_dict[base_ingred].copy()
        # if len(found_substitutes) > 30:
        # found_substitutes = found_substitutes[:30]
        found = []
        # remove synonyms of base ingredient
        new_found_substitutes = []
        for subst in found_substitutes:
            if base_ingred in synonyms_dict.keys():
                if subst not in synonyms_dict[base_ingred]:
                    new_found_substitutes.append(subst)
            else:
                new_found_substitutes.append(subst)
        found_substitutes = new_found_substitutes
        # check which substitutes were found; found_substitutes shrinks to
        # the incorrectly suggested remainder as hits are removed from it
        for subst in ground_truth_dict[base_ingred]:
            # only add substitute if not already added
            if subst in found_substitutes and subst not in found:
                found.append(subst)
                found_substitutes.remove(subst)
            # check if synonyms of substitute were found
            # check if ingredient has synonyms
            if subst in synonyms_dict.keys():
                for synon in synonyms_dict[subst]:
                    if synon in found_substitutes:
                        if synon not in found and subst not in found:
                            found.append(subst)
                            found_substitutes.remove(synon)
        # if base_ingred == "Erdbeere":
        # NOTE(review): this print runs unconditionally; the commented guard
        # above suggests it was meant as debug output for one ingredient.
        print(base_ingred + ": " + str(found_substitutes))
        found_ground_ingr[base_ingred] = found
        # print(base_ingred + ": ")
        # if get_occurrences:
        #     print("occurrences in dataset: " + str(occurrences))
        # print("number of found substitutes: " + str(len(found)) + "/" + str(len(ground_truth_dict[base_ingred])))
        # print("correctly found substitutes: " + str(len(found)) + "/" + str(len(found) + len(found_substitutes)))
        # print("correctly found substitutes: " + str(found))
        # print("incorrectly found substitutes: " + str(found_substitutes))
        # print("-----------------------------\n")
        if len(found) > 0:
            average_precision += len(found)/(len(found) + len(found_substitutes))
            # print(len(found))
            average_recall += len(found)/len(ground_truth_dict[base_ingred])
        correctly_found += len(found)
        incorrectly_found += len(found_substitutes)
        number_correct_subs_found_overall.append(len(found))
        total_number_subs_found_overall.append(len(found) + len(found_substitutes))
    # NOTE(review): the divisor 40 is hard-coded -- presumably the size of
    # the full ground truth file; confirm it matches len(ground_truth_dict).
    print("average precision: " + str(average_precision/40))
    print("average recall: " + str(average_recall/40))
    print("median number of correctly found subs: " + str(statistics.median(number_correct_subs_found_overall)))
    print("median number of found subs overall: " + str(statistics.median(total_number_subs_found_overall)))
    return found_ground_ingr
def merge_lists(all_lists):
    """Round-robin merge of several ranked lists into one deduplicated list.

    Takes the first element of every list, then the second of every list,
    and so on, skipping entries already added — so the highest-ranked items
    from every input list end up near the front of the result.

    Args:
        all_lists: a list of lists of hashable items (here: ranked
            substitute-name lists, one per synonym).

    Returns:
        list: interleaved union of the inputs, duplicates removed,
        first-seen order preserved.
    """
    # (the original also tracked the minimum length, but never used it)
    longest = max((len(lst) for lst in all_lists), default=0)
    merged = []
    seen = set()  # O(1) duplicate check instead of scanning the output list
    for position in range(longest):
        for ranked in all_lists:
            if position < len(ranked):
                candidate = ranked[position]
                if candidate not in seen:
                    seen.add(candidate)
                    merged.append(candidate)
    return merged
def with_synonyms(ground_truth_dict=None, found_substitutes_dict=None, get_occurrences=True, synonyms=True):
    """Score the model's substitutes, treating synonyms of BOTH the base
    ingredient and of the substitutes as matches.

    When the base ingredient has synonyms, the model outputs for all of its
    synonyms are interleaved via ``merge_lists`` before scoring; otherwise
    the model output is truncated to the top 30 suggestions.  A ground-truth
    substitute counts as found if it or one of its synonyms appears in the
    (merged) model output.

    Args:
        ground_truth_dict: base ingredient -> accepted substitutes; loaded
            from ``data_path + ground_truth_path`` when falsy.
        found_substitutes_dict: base ingredient -> model substitute list;
            loaded from ``found_substitutes_path`` when falsy.
        get_occurrences: also sum the occurrence counts over all base
            synonyms (only used by the commented-out report below).
        synonyms: load the synonym table; when False no synonym handling
            happens at all.

    Returns:
        dict: base ingredient -> list of correctly found substitutes.
    """
    if get_occurrences:
        with open(data_path + occurances_path, "r") as whole_json_file:
            occurrences_dict = json.load(whole_json_file)
    if not ground_truth_dict:
        with open(data_path+ground_truth_path, "r") as whole_json_file:
            ground_truth_dict = json.load(whole_json_file)
    if synonyms:
        with open(data_path + synonyms_path, "r") as whole_json_file:
            synonyms_dict = json.load(whole_json_file)
    else:
        synonyms_dict = {}
    if not found_substitutes_dict:
        with open(found_substitutes_path, "r") as whole_json_file:
            model_substitutes_dict = json.load(whole_json_file)
    else:
        model_substitutes_dict = found_substitutes_dict
    correctly_found = 0
    incorrectly_found = 0
    average_precision = 0.0
    average_recall = 0.0
    number_correct_subs_found_overall = []
    total_number_subs_found_overall = []
    found_ground_ingr = {}
    # base ingredient with synonyms, substitutes with synonyms
    for base_ingred in ground_truth_dict.keys():
        base_synonyms = [base_ingred]
        if get_occurrences:
            occurrences = 0
        # get list of all synonyms of base ingredient
        if base_ingred in synonyms_dict.keys():
            # NOTE(review): this rebinds the `synonyms` parameter
            synonyms = synonyms_dict[base_ingred]
            base_synonyms = base_synonyms + synonyms
            found_substitutes = []
            all_substitutes = []
            # get top 30 substitutes of each base synonym
            for synon in base_synonyms:
                if get_occurrences:
                    occurrences += occurrences_dict[synon]
                all_substitutes.append(model_substitutes_dict[synon].copy())
                # synon_subs = model_substitutes_dict[synon].copy()
                # if len(synon_subs) > 30:
                #     synon_subs = synon_subs[:30]
                # for sub in synon_subs:
                #     if sub not in found_substitutes:
                #         found_substitutes.append(sub)
            found_substitutes = merge_lists(all_substitutes)
        else:
            found_substitutes = model_substitutes_dict[base_ingred].copy()
            if len(found_substitutes) > 30:
                found_substitutes = found_substitutes[:30]
        found = []
        # remove all base synonyms from found substitutes
        new_found_substitutes = []
        for subst in found_substitutes:
            if subst not in base_synonyms:
                new_found_substitutes.append(subst)
        found_substitutes = new_found_substitutes
        # check which substitutes were found; found_substitutes shrinks to
        # the incorrectly suggested remainder as hits are removed from it
        for subst in ground_truth_dict[base_ingred]:
            # only add substitute if not already added
            if subst in found_substitutes and subst not in found:
                found.append(subst)
                found_substitutes.remove(subst)
            # check if synonyms of substitute were found
            # check if ingredient has synonyms
            if subst in synonyms_dict.keys():
                for synon in synonyms_dict[subst]:
                    if synon in found_substitutes:
                        if synon not in found and subst not in found:
                            found.append(subst)
                            found_substitutes.remove(synon)
        found_ground_ingr[base_ingred] = found
        # print(base_ingred + ": ")
        # if get_occurrences:
        #     print("occurrences in dataset: " + str(occurrences))
        # print("number of synonyms incl. original word: " + str(len(base_synonyms)))
        # print("number of found substitutes: " + str(len(found)) + "/" + str(len(ground_truth_dict[base_ingred])))
        # print("correctly found substitutes: " + str(len(found)) + "/" + str(len(found) + len(found_substitutes)))
        # print("correctly found substitutes: " + str(found))
        # print("incorrectly found substitutes: " + str(found_substitutes))
        # print("-----------------------------\n")
        if len(found) > 0:
            average_precision += len(found) / (len(found) + len(found_substitutes))
            average_recall += len(found) / len(ground_truth_dict[base_ingred])
        correctly_found += len(found)
        incorrectly_found += len(found_substitutes)
        number_correct_subs_found_overall.append(len(found))
        total_number_subs_found_overall.append(len(found) + len(found_substitutes))
    # NOTE(review): the divisor 40 is hard-coded -- presumably the size of
    # the full ground truth file; confirm it matches len(ground_truth_dict).
    print("average precision: " + str(average_precision / 40))
    print("average recall: " + str(average_recall / 40))
    print("median number of correctly found subs: " + str(statistics.median(number_correct_subs_found_overall)))
    print("median number of found subs overall: " + str(statistics.median(total_number_subs_found_overall)))
    return found_ground_ingr
def translate_engl_ground_truth(ground_truth, ger_transl):
    """Translate an English ground-truth mapping into German.

    Each base ingredient key is replaced by its German translation, and each
    substitute is translated too; substitutes missing from the translation
    table are silently dropped.

    Args:
        ground_truth: English base ingredient -> list of English substitutes.
        ger_transl: English ingredient name -> German ingredient name.

    Returns:
        dict: German base ingredient -> list of German substitutes.
    """
    translated = {}
    for base_ingr in ground_truth.keys():
        # keep only substitutes that have a known German translation
        translated[ger_transl[base_ingr]] = [
            ger_transl[subst]
            for subst in ground_truth[base_ingr]
            if subst in ger_transl
        ]
    return translated
def with_base_synonyms(ground_truth_dict=None, found_substitutes_dict=None, get_occurrences=True, synonyms=True):
    """Score the model's substitutes with synonym expansion for the BASE
    ingredient only, and print a per-ingredient report.

    Like ``with_synonyms`` the model outputs of all base-ingredient synonyms
    are merged via ``merge_lists``, but synonyms of the *substitutes* are not
    matched (that code is deliberately commented out below).  Unlike the
    other scorers this one prints its report per ingredient and computes no
    aggregate precision/recall.

    Args:
        ground_truth_dict: base ingredient -> accepted substitutes; loaded
            from ``data_path + ground_truth_path`` when falsy.
        found_substitutes_dict: base ingredient -> model substitute list;
            loaded from ``found_substitutes_path`` when falsy.
        get_occurrences: also sum and report occurrence counts over all base
            synonyms.
        synonyms: load the synonym table; when False no synonym handling
            happens at all.

    Returns:
        dict: base ingredient -> list of correctly found substitutes.
    """
    if get_occurrences:
        with open(data_path + occurances_path, "r") as whole_json_file:
            occurrences_dict = json.load(whole_json_file)
    if not ground_truth_dict:
        with open(data_path+ground_truth_path, "r") as whole_json_file:
            ground_truth_dict = json.load(whole_json_file)
    if synonyms:
        with open(data_path + synonyms_path, "r") as whole_json_file:
            synonyms_dict = json.load(whole_json_file)
    else:
        synonyms_dict = {}
    if not found_substitutes_dict:
        with open(found_substitutes_path, "r") as whole_json_file:
            model_substitutes_dict = json.load(whole_json_file)
    else:
        model_substitutes_dict = found_substitutes_dict
    found_ground_ingr = {}
    # base ingredient with synonyms, substitutes with synonyms
    for base_ingred in ground_truth_dict.keys():
        base_synonyms = [base_ingred]
        if get_occurrences:
            occurrences = 0
        # get list of all synonyms of base ingredient
        if base_ingred in synonyms_dict.keys():
            # NOTE(review): this rebinds the `synonyms` parameter
            synonyms = synonyms_dict[base_ingred]
            base_synonyms = base_synonyms + synonyms
            found_substitutes = []
            all_substitutes = []
            # get top 30 substitutes of each base synonym
            for synon in base_synonyms:
                if get_occurrences:
                    occurrences += occurrences_dict[synon]
                all_substitutes.append(model_substitutes_dict[synon].copy())
            found_substitutes = merge_lists(all_substitutes)
        else:
            found_substitutes = model_substitutes_dict[base_ingred].copy()
            if len(found_substitutes) > 30:
                found_substitutes = found_substitutes[:30]
        found = []
        # remove all base synonyms from found substitutes
        new_found_substitutes = []
        for subst in found_substitutes:
            if subst not in base_synonyms:
                new_found_substitutes.append(subst)
        found_substitutes = new_found_substitutes
        # check which substitutes were found; found_substitutes shrinks to
        # the incorrectly suggested remainder as hits are removed from it
        for subst in ground_truth_dict[base_ingred]:
            # only add substitute if not already added
            if subst in found_substitutes and subst not in found:
                found.append(subst)
                found_substitutes.remove(subst)
            # check if synonyms of substitute were found
            # check if ingredient has synonyms
            # if subst in synonyms_dict.keys():
            #     for synon in synonyms_dict[subst]:
            #         if synon in found_substitutes:
            #             if synon not in found and subst not in found:
            #                 found.append(subst)
            #                 found_substitutes.remove(synon)
        found_ground_ingr[base_ingred] = found
        print(base_ingred + ": ")
        if get_occurrences:
            print("occurrences in dataset: " + str(occurrences))
        print("number of synonyms incl. original word: " + str(len(base_synonyms)))
        print("number of found substitutes: " + str(len(found)) + "/" + str(len(ground_truth_dict[base_ingred])))
        print("correctly found substitutes: " + str(len(found)) + "/" + str(len(found) + len(found_substitutes)))
        print("correctly found substitutes: " + str(found))
        print("incorrectly found substitutes: " + str(found_substitutes))
        print("-----------------------------\n")
    return found_ground_ingr
def engl_compare():
# with open(data_path + occurances_path, "r") as whole_json_file:
# occurrences_dict = json.load(whole_json_file)
with open(engl_data_path + "translation.json", "r") as whole_json_file:
ger_transl = json.load(whole_json_file)
# with open(data_path + synonyms_path, "r") as whole_json_file:
# synonyms_dict = json.load(whole_json_file)
with open(found_substitutes_path, "r") as whole_json_file:
model_substitutes_dict = json.load(whole_json_file)
with open(engl_data_path + "substitute_pairs_foodbert_text.json", "r") as whole_json_file:
engl_list = json.load(whole_json_file)
with open(engl_data_path + "engl_ground_truth.json", "r") as whole_json_file:
engl_ground_truth = json.load(whole_json_file)
engl_dict = {}
for foo in engl_list:
if foo[0] in engl_dict.keys():
engl_dict[foo[0]].append(foo[1])
else:
engl_dict[foo[0]] = [foo[1]]
translated_ground_truth = translate_engl_ground_truth(engl_ground_truth, ger_transl)
# without any synonyms
print("Engl compare without any synonyms:")
engl_replacements = {}
# ger_replacements = {}
for ingred in engl_ground_truth.keys():
found = []
incorr = []
found_ger = []
incorr_ger = []
engl_replacements[ingred] = {}
engl_replacements[ingred]["engl"] = 0
engl_replacements[ingred]["ger"] = 0
# ger_replacements[ingred] = 0
if ingred in engl_dict.keys():
for sub in engl_ground_truth[ingred]:
if sub in engl_dict[ingred]:
engl_replacements[ingred]["engl"] += 1
found.append(sub)
if ger_transl[ingred] in model_substitutes_dict.keys():
for sub in german_ground_truth[ger_transl[ingred]]:
if sub in model_substitutes_dict[ger_transl[ingred]]:
engl_replacements[ingred]["ger"] += 1
found_ger.append(sub)
# ger_replacements[ingred] += 1
for found_sub in engl_dict[ingred]:
if found_sub not in engl_ground_truth[ingred]:
incorr.append(found_sub)
for found_sub in model_substitutes_dict[ger_transl[ingred]]:
if found_sub not in translated_ground_truth[ger_transl[ingred]]:
incorr_ger.append(found_sub)
print(ger_transl[ingred] + ": ")
print("number of found substitutes: " + str(len(found_ger)) + "/" + str(len(translated_ground_truth[ger_transl[ingred]])))
print("correctly found substitutes: " + str(len(found_ger)) + "/" + str(len(found_ger) + len(incorr_ger)))
print("correctly found substitutes: " + str(found_ger))
print("incorrectly found substitutes: " + str(incorr_ger))
print("-----------------------------\n")
print(ingred + ": ")
print("number of found substitutes: " + str(len(found)) + "/" + str(len(engl_ground_truth[ingred])))
print("correctly found substitutes: " + str(len(found)) + "/" + str(len(found) + len(incorr)))
print("correctly found substitutes: " + str(found))
print("incorrectly found substitutes: " + str(incorr))
print("-----------------------------\n")
with open(evaluation_path + "engl_comparison_results/engl_no_syn.json", 'w') as f:
json.dump(engl_replacements, f, ensure_ascii=False, indent=4)
# with synonyms of substitutes
print("Engl compare with synonyms of substitutes only:")
# german
new_german_result = no_synonyms(ground_truth_dict=translated_ground_truth, get_occurrences=False)
#engl
new_engl_result = no_synonyms(ground_truth_dict=engl_ground_truth, found_substitutes_dict=engl_dict, get_occurrences=False, synonyms=False)
engl_replacements = {}
for ingred in engl_ground_truth.keys():
engl_replacements[ingred] = {}
engl_replacements[ingred]["engl"] = 0
engl_replacements[ingred]["ger"] = 0
if ingred in new_engl_result.keys():
for sub in engl_ground_truth[ingred]:
if sub in new_engl_result[ingred]:
engl_replacements[ingred]["engl"] += 1
if ger_transl[ingred] in new_german_result.keys():
for sub in german_ground_truth[ger_transl[ingred]]:
if sub in new_german_result[ger_transl[ingred]]:
engl_replacements[ingred]["ger"] += 1
with open(evaluation_path + "engl_comparison_results/engl_sub_syn.json", 'w') as f:
json.dump(engl_replacements, f, ensure_ascii=False, indent=4)
# with synonyms for substitutes and base words
print("Engl compare with synonyms of both:")
# german
new_german_result = with_synonyms(ground_truth_dict=translated_ground_truth, get_occurrences=False)
# engl
new_engl_result = with_synonyms(ground_truth_dict=engl_ground_truth, found_substitutes_dict=engl_dict, get_occurrences=False, synonyms=False)
engl_replacements = {}
for ingred in engl_ground_truth.keys():
engl_replacements[ingred] = {}
engl_replacements[ingred]["engl"] = 0
engl_replacements[ingred]["ger"] = 0
if ingred in new_engl_result.keys():
for sub in engl_ground_truth[ingred]:
if sub in new_engl_result[ingred]:
engl_replacements[ingred]["engl"] += 1
if ger_transl[ingred] in new_german_result.keys():
for sub in german_ground_truth[ger_transl[ingred]]:
if sub in new_german_result[ger_transl[ingred]]:
engl_replacements[ingred]["ger"] += 1
with open(evaluation_path + "engl_comparison_results/engl_all_syn.json", 'w') as f:
json.dump(engl_replacements, f, ensure_ascii=False, indent=4)
# with synonyms for base words
print("Engl compare with synonyms of base words only:")
# german
new_german_result = with_base_synonyms(ground_truth_dict=translated_ground_truth, get_occurrences=False)
# engl
new_engl_result = with_base_synonyms(ground_truth_dict=engl_ground_truth, found_substitutes_dict=engl_dict,
get_occurrences=False, synonyms=False)
engl_replacements = {}
for ingred in engl_ground_truth.keys():
engl_replacements[ingred] = {}
engl_replacements[ingred]["engl"] = 0
engl_replacements[ingred]["ger"] = 0
if ingred in new_engl_result.keys():
for sub in engl_ground_truth[ingred]:
if sub in new_engl_result[ingred]:
engl_replacements[ingred]["engl"] += 1
if ger_transl[ingred] in new_german_result.keys():
for sub in german_ground_truth[ger_transl[ingred]]:
if sub in new_german_result[ger_transl[ingred]]:
engl_replacements[ingred]["ger"] += 1
with open(evaluation_path + "engl_comparison_results/engl_base_syn.json", 'w') as f:
json.dump(engl_replacements, f, ensure_ascii=False, indent=4)
print("test")
def main():
    """Run the comparison once with substitute-only synonyms and once with full synonyms."""
    separator = "--------------------------------------------------------"
    # compare english and german results
    # engl_compare()
    print(separator)
    print(separator)
    print(separator + "\n")
    # evaluation where synonyms are only applied to the substitutes
    no_synonyms()
    print(separator)
    print(separator)
    print(separator + "\n")
    # evaluation where synonyms are applied to substitutes and base ingredients
    with_synonyms()


main()

288
evaluation/final_eval.py Normal file
View File

@@ -0,0 +1,288 @@
import json
import statistics
import helpers.revise_substitutes as revise_subs
def eval_dataset(substitutes_dict):
    """Print summary statistics for a {ingredient: [substitutes]} mapping.

    Reports the ingredient count, how many ingredients have no substitutes at
    all, and the mean/median/max/min number of substitutes per ingredient.
    """
    lengths = [len(subs) for subs in substitutes_dict.values()]
    # ingredients for which no substitute was found at all
    empty_count = sum(1 for count in lengths if count == 0)
    print("number of ingredients: " + str(len(substitutes_dict.keys())))
    print("number of nones: " + str(empty_count))
    print("average number of subs: " + str(sum(lengths) / len(substitutes_dict.keys())))
    print("median number of subs: " + str(statistics.median(lengths)))
    print("largest number of subs: " + str(max(lengths)))
    print("smallest number of subs: " + str(min(lengths)))
def translate_engl_ground_truth(ground_truth, ger_transl):
    """Translate an English ground-truth dict into German using *ger_transl*.

    Base ingredients are translated via the mapping; substitutes that have no
    entry in the mapping are silently dropped from the translated lists.
    """
    translated = {}
    for base_ingr, substitutes in ground_truth.items():
        translated[ger_transl[base_ingr]] = [
            ger_transl[subst] for subst in substitutes if subst in ger_transl.keys()
        ]
    return translated
def eval_ground_truth(substitutes_dict, ground_truth_dict):
    """Compare found substitutes against the ground truth and print metrics.

    For every ground-truth ingredient the found substitutes are split into
    correct (present in the ground truth) and incorrect ones; micro precision,
    recall, per-ingredient precision/recall, and the best-scoring ingredients
    are printed.  Nothing is returned — all output goes to stdout.

    NOTE(review): several divisors are hard-coded to the sizes of the thesis'
    evaluation sets (10 German-specific words, 30 others, 40 ingredients in
    total) — this function will print wrong averages for other ground truths.
    """
    total_corr_int = 0
    total_corr_list = []
    total_incorr_int = 0
    total_incorr_list = []
    total_subs_ground_truth = 0
    test_prec = 0
    # [best value, list of ingredients that reach it]
    highest_prec = [0,[]]
    highest_recall = [0,[]]
    other_corr = 0
    other_incorr = 0
    ger_corr = 0
    ger_incorr = 0
    ger_total = 0
    other_total = 0
    # ingredients considered specific to German cuisine, evaluated separately
    german_words = ["Spätzle", "Schwarzbrot", "Schupfnudeln", "Bratwürste_Nürnberger", "Vanillinzucker", "Bier", "Semmelknödel", "Rote_Bete", "Eisbeine", "Spargel_weiß"]
    for ingredient in ground_truth_dict:
        correct = 0
        incorrect = 0
        correct_list = []
        incorrect_list = []
        # print("\n" + ingredient + ": " + str(len(substitutes_dict[ingredient])))
        # classify each found substitute against the ground truth
        for sub in substitutes_dict[ingredient]:
            if sub in ground_truth_dict[ingredient]:
                # print(sub)
                correct += 1
                correct_list.append(sub)
            else:
                incorrect += 1
                incorrect_list.append(sub)
        total_corr_int += correct
        total_incorr_int += incorrect
        total_corr_list.append(correct)
        total_incorr_list.append(incorrect)
        total_subs_ground_truth += len(ground_truth_dict[ingredient])
        # per-ingredient precision/recall only defined when something was found
        if correct > 0:
            curr_recall = correct/len(ground_truth_dict[ingredient])
            curr_prec = correct/(correct+incorrect)
            test_prec += curr_prec
            # track all ingredients tied for the best precision/recall
            if curr_prec == highest_prec[0]:
                highest_prec[1].append(ingredient)
            if curr_prec > highest_prec[0]:
                highest_prec[0] = curr_prec
                highest_prec[1] = [ingredient]
            if curr_recall == highest_recall[0]:
                highest_recall[1].append(ingredient)
            if curr_recall > highest_recall[0]:
                highest_recall[0] = curr_recall
                highest_recall[1] = [ingredient]
            print(ingredient + ": " + str(curr_prec) + " ..... " + str(curr_recall))
        if ingredient in german_words:
            ger_corr += correct
            ger_incorr += incorrect
        else:
            other_corr += correct
            other_incorr += incorrect
        if ingredient == "Zucker":
            print("correct: " + str(correct_list) + ", incorrect: " + str(incorrect_list))
    ger_total = ger_corr + ger_incorr
    other_total = other_corr + other_incorr
    # NOTE(review): /10 and /30 assume the fixed german/other subset sizes above
    print("ger_total: " + str(ger_total/10))
    print("other_total: " + str(other_total/30))
    # print(correct)
    # NOTE(review): `ingredient` etc. here refer to the LAST loop iteration only
    print(ingredient + ": " + str(correct_list) + " / " + str(incorrect_list))
    # micro-averaged precision over all ingredients
    print("precision: " + str(total_corr_int / (total_corr_int + total_incorr_int)))
    # NOTE(review): /40 assumes exactly 40 ground-truth ingredients
    print("(average precision:) " + str(test_prec/40))
    print("recall: " + str(total_corr_int / total_subs_ground_truth))
    print("median number of correct subs (ground truth): " + str(statistics.median(total_corr_list)))
    print("average number of correct subs (ground truth): " + str(statistics.mean(total_corr_list)))
    at_least_3 = 0
    no_corr = 0
    for nr in total_corr_list:
        if nr >= 3:
            at_least_3 += 1
        if nr < 1:
            no_corr += 1
    print("ingredients with at least 3 correct substitutes: " + str(at_least_3))
    print("ingredients with no correct substitutes: " + str(no_corr))
    print("highest precision: " + str(highest_prec[1]) + ": " + str(highest_prec[0]))
    print("highest recall: " + str(highest_recall[1]) + ": " + str(highest_recall[0]))
    # print("german precision: " + str(ger_corr/(ger_corr + ger_incorr)))
    # print("german correct:" + str(ger_corr))
    # print("precision rest: " + str(other_corr/(other_corr + other_incorr)))
    # print("other correct: " + str(other_corr))
def get_ground_truth_substitutes(substitutes_dict, ground_truth_dict):
    """Restrict *substitutes_dict* to the ingredients present in the ground truth.

    Raises KeyError if a ground-truth ingredient is missing from the
    substitutes dict, matching the original behaviour.
    """
    return {ingredient: substitutes_dict[ingredient] for ingredient in ground_truth_dict}
def main():
    """Run the full German and English substitute evaluation.

    Loads the generated substitutes and the ground truth, evaluates them
    without synonyms, with substitute-only synonyms, and with full synonym
    merging, then repeats the exercise for the English FoodBERT results and
    finally re-evaluates the German data against a hard-coded ground truth.
    All results are printed to stdout.
    """
    substitutes_path = "final_Versions/models/vers3/eval/complete_substitute_pairs_50.json"
    with open(substitutes_path, "r") as whole_json_file:
        substitutes_dict = json.load(whole_json_file)
    ground_truth_path = "data/ground_truth.json"
    with open(ground_truth_path, "r") as whole_json_file:
        ground_truth_dict = json.load(whole_json_file)
    # --- evaluation without any synonym merging ---
    print("no synonyms at all:")
    print("entire dataset")
    eval_dataset(substitutes_dict)
    print("\nonly ground truth:")
    ground_truth_substitutes0 = get_ground_truth_substitutes(substitutes_dict, ground_truth_dict)
    # print(ground_truth_substitutes["Truthahn"])
    eval_dataset(ground_truth_substitutes0)
    eval_ground_truth(substitutes_dict, ground_truth_dict)
    print("======================================")
    # --- evaluation with synonyms merged on the substitute side only ---
    print("\nsynonyms of substitutes only: ")
    new_substitutes_dict1 = substitutes_dict.copy()
    new_substitutes_dict1 = revise_subs.combined_substitutes_dict(new_substitutes_dict1)
    print("entire dataset")
    eval_dataset(new_substitutes_dict1)
    print("\nonly ground truth:")
    ground_truth_substitutes1 = get_ground_truth_substitutes(new_substitutes_dict1, ground_truth_dict)
    # print(ground_truth_substitutes["Truthahn"])
    eval_dataset(ground_truth_substitutes1)
    eval_ground_truth(new_substitutes_dict1, ground_truth_dict)
    print("======================================")
    # --- evaluation with synonyms merged everywhere ---
    print("\nsynonyms of everything: ")
    new_substitutes_dict2 = substitutes_dict.copy()
    new_substitutes_dict2 = revise_subs.combine_all_synonyms(new_substitutes_dict2)
    print("entire dataset")
    eval_dataset(new_substitutes_dict2)
    print("\nonly ground truth:")
    ground_truth_substitutes2 = get_ground_truth_substitutes(new_substitutes_dict2, ground_truth_dict)
    # print(ground_truth_substitutes["Truthahn"])
    eval_dataset(ground_truth_substitutes2)
    eval_ground_truth(new_substitutes_dict2, ground_truth_dict)
    print("======================================")
    print("======================================")
    print("English Evaluation")
    # NOTE(review): several of the following path variables are never used below
    data_path = "data/"
    occurances_path = "mult_ingredients_nice.json"
    ground_truth_path = "ground_truth.json"
    engl_data_path = "evaluation/engl_data/"
    evaluation_path = "evaluation/"
    synonyms_path = "synonyms.json"
    # manually curated German ground truth used to re-evaluate the German model
    german_ground_truth = {
        "Karotte": ["Pastinake", "Steckrübe", "Staudensellerie", "Kürbis", "Süßkartoffel", "Rettich", "Radieschen",
                    "Kartoffel", "Paprika_rot", "Butternusskürbis", "Petersilienwurzel", "Rübe"],
        "Kirsche": ["Aprikose", "Pflaume", "Nektarine", "Himbeeren", "Weintrauben", "Erdbeere", "Johannisbeeren",
                    "Brombeeren", "Beeren_gemischte", "Pfirsich", "Cranberries", "Cranberries_getrocknet", "Blaubeeren",
                    "Maraschino", "Beeren", "Trockenpflaumen"],
        "Huhn": ["Truthahn", "Kaninchen", "Austernpilze", "Kalbfleisch", "Fisch", "Tofu", "Rindfleisch", "Tofu_fester",
                 "Schweinefleisch", "Seitan", "Ente", "Lamm", "Pilze", "Shrimps", "Wachtel", "Gans", "Wildfleisch"],
        "Petersilie": ["Kerbel", "Koriander", "Estragon", "Basilikum", "Oregano", "Liebstöckel", "Dill",
                       "Koriandergrün", "Rosmarin", "Kapern", "Thymian", "Schnittlauch", "Minze",
                       "Basilikum_getrockneter", "Oregano_getrocknet", "Thymian_getrocknet"],
        "Schokolade": ["Nutella", "Kakaopulver_Instant", "Zucker", "Marmelade", "Marshmallow", "Kakao", "Süßigkeiten",
                       "Erdnussbutter"],
        "Frühstücksspeck": ["Pancetta", "Schinken_Prosciutto", "Speck", "Schinken_rohen", "Parmaschinken", "Schinken",
                            "Salami", "Chorizo", "Wurst_Krakauer", "Schweineschwarte", "Schinkenwürfel", "Croûtons",
                            "Speckwürfel", "Kochschinken", "Corned_Beef", "Wurst_Mortadella"],
        "Grünkohl": ["Spinat", "Chinakohl", "Lauch", "Endiviensalat", "Mangold", "Wirsing", "Kohl", "Blumenkohl",
                     "Brunnenkresse", "Rucola", "Blattspinat", "Kopfsalat", "Römersalat", "Babyspinat"],
        "Zucker": ["Honig", "Stevia", "Süßstoff", "Stevia_flüssig", "Süßstoff_flüssigen", "Reissirup", "Ahornsirup",
                   "Kondensmilch_gezuckerte", "Agavendicksaft", "Schokolade", "Vanille", "Melasse", "Zuckerrübensirup",
                   "Sirup"],
        "Brie": ["Camembert", "Gorgonzola", "Schmelzkäse", "Cheddarkäse", "Ziegenkäse", "Doppelrahmfrischkäse",
                 "Blauschimmelkäse", "Roquefort", "Gouda", "Käse_Fontina", "Käse_Provolone", "Feta_Käse",
                 "Scheiblettenkäse"],
        "Truthahn": ["Huhn", "Kaninchen", "Ente", "Kochschinken", "Fasan", "Gans", "Rindfleisch", "Lammfleisch",
                     "Schweinefleisch", "Roastbeef", "Kalbfleisch", "Geflügelfleisch", "Hähnchenfilet", "Hühnerkeule",
                     "Wachtel", "schweinekotelett", "Wildfleisch"]
    }
    with open(engl_data_path + "translation.json", "r") as whole_json_file:
        ger_transl = json.load(whole_json_file)
    with open(engl_data_path + "substitute_pairs_foodbert_text.json", "r") as whole_json_file:
        engl_list = json.load(whole_json_file)
    with open(engl_data_path + "revised_engl_ground_truth.json", "r") as whole_json_file:
        engl_ground_truth = json.load(whole_json_file)
    # the English results come as (ingredient, substitute) pairs — group by ingredient
    engl_dict = {}
    for foo in engl_list:
        if foo[0] in engl_dict.keys():
            engl_dict[foo[0]].append(foo[1])
        else:
            engl_dict[foo[0]] = [foo[1]]
    # translate english ground truth to german for comparison
    # any ingredients that aren't in the german dataset are removed
    # translated_ground_truth = translate_engl_ground_truth(engl_ground_truth, ger_transl)
    print("Eval English results")
    print("entire dataset")
    eval_dataset(engl_dict)
    orig_engl_dict = engl_dict.copy()
    # print("turkey results: " + str(orig_engl_dict["turkey"]))
    print("\nonly ground truth:")
    ground_truth_substitutes_engl = get_ground_truth_substitutes(orig_engl_dict, engl_ground_truth)
    # print(ground_truth_substitutes)
    eval_dataset(ground_truth_substitutes_engl)
    eval_ground_truth(ground_truth_substitutes_engl, engl_ground_truth)
    print("\n\nEval method 1:")
    engl_dict1 = engl_dict.copy()
    engl_dict1 = revise_subs.engl_combined_substitutes_dict(engl_dict1)
    print("entire dataset")
    eval_dataset(engl_dict1)
    print("\nonly ground truth:")
    ground_truth_substitutes_engl = get_ground_truth_substitutes(engl_dict1, engl_ground_truth)
    # print(ground_truth_substitutes["Truthahn"])
    eval_dataset(ground_truth_substitutes_engl)
    eval_ground_truth(ground_truth_substitutes_engl, engl_ground_truth)
    # re-run the German evaluation against the hard-coded ground truth above
    print("\nReevaluate German Data: ")
    eval_ground_truth(substitutes_dict, german_ground_truth)
    print("\nwith syn1")
    eval_ground_truth(new_substitutes_dict1, german_ground_truth)
    # print("Truthahn results 1: " + str(new_substitutes_dict1["Truthahn"]))
    print("\nwith syn2")
    eval_ground_truth(new_substitutes_dict2, german_ground_truth)
    # print("Truthahn results 2: " + str(new_substitutes_dict2["Truthahn"]))
    #
    # engl_substitutes_dict = get_ground_truth_substitutes(engl_dict1, german_ground_truth)
    #
    # engl_new_substitutes_dict1 = new_substitutes_dict1.copy()
    # engl_new_substitutes_dict2 = new_substitutes_dict2.copy()


main()

View File

@@ -0,0 +1,28 @@
import json
import random
def main():
    """Randomly pick 10 rare and 10 frequent ingredients and print them."""
    data_path = "data/"
    ingredients_path = "mult_ingredients_nice.json"
    with open(data_path + ingredients_path, "r") as whole_json_file:
        all_ingredients = json.load(whole_json_file)
    # frequent: at least 1000 occurrences; rare: between 100 and 200 occurrences
    frequent = [ing for ing, count in all_ingredients.items() if count >= 1000]
    rare = [ing for ing, count in all_ingredients.items() if 100 <= count <= 200]
    picked_rare = random.sample(rare, 10)
    picked_frequent = random.sample(frequent, 10)
    print("rare: ")
    print(picked_rare)
    print("\nfrequent: ")
    print(picked_frequent)


main()

View File

@@ -0,0 +1,197 @@
# adapted from:
# Pellegrini., C., E. Özsoy., M. Wintergerst., and G. Groh. (2021).
# “Exploiting Food Embeddings for Ingredient Substitution.”
# In: Proceedings of the 14th International Joint Conference on Biomedical
# Engineering Systems and Technologies - Volume 5: HEALTHINF, INSTICC.
# SciTePress, pp. 6777. isbn: 978-989-758-490-9. doi: 10.5220/0010202000670077.
import json
import os
from collections import defaultdict
from pathlib import Path
from typing import Union
import numpy as np
import torch
from sklearn.decomposition import PCA
from tqdm import tqdm
from evaluation.helpers.approx_knn_classifier import ApproxKNNClassifier
from evaluation.helpers.generate_ingredient_embeddings import generate_food_embedding_dict
from evaluation.helpers.knn_classifier import KNNClassifier
def avg(values):
    """Return the arithmetic mean of a non-empty sequence of numbers."""
    return sum(values) / len(values)
def custom_potential_neighbors_sort(potential_neighbors):
    """Sort neighbors by descending frequency; ties broken by smaller mean distance."""
    def sort_key(item):
        distances = item[1]
        # more frequent first; among equals, the smaller average distance wins
        return (len(distances), -(sum(distances) / len(distances)))

    return sorted(potential_neighbors.items(), key=sort_key, reverse=True)
def filter_out_forbidden_neigbours(ingredient_name, potential_neighbors):
    '''
    Neigbors that are the same as the ingredient are to be removed, additional rules such as mozeralla & mozeralla_cheese, penne & penne_pasta can be added here
    '''
    banned_keys = {ingredient_name}
    # if ingredient_name in ingredient.split('_'):  # stricter rule, currently disabled
    #     banned_keys.add(ingredient)
    return {
        neighbor: distances
        for neighbor, distances in potential_neighbors.items()
        if neighbor not in banned_keys
    }
def get_nearest_N_neigbours(ingredient_name, ingredients_to_embeddings, all_ingredient_labels,
                            knn_classifier: Union[KNNClassifier, ApproxKNNClassifier], thresh = 50):
    """Return candidate substitute names for one ingredient, or None if none qualify.

    For every embedding of the ingredient the k nearest neighbors are queried;
    a neighbor label collects one distance per hit.  Labels hit fewer than
    *thresh* times are discarded, the rest are ordered by hit count (ties by
    smaller mean distance) and returned as a tuple of names.
    """
    ingredient_embeddings = ingredients_to_embeddings[ingredient_name]
    all_distances, all_indices = knn_classifier.k_nearest_neighbors(ingredient_embeddings)
    # label -> list of distances, one entry per time the label appeared as a neighbor
    potential_neighbors = defaultdict(list)
    for i in range(len(ingredient_embeddings)):
        labels = all_ingredient_labels[all_indices[i]]
        distances = all_distances[i]
        for label, distance in zip(labels, distances):
            potential_neighbors[label].append(distance)
    # drop the ingredient itself before ranking
    potential_neighbors = filter_out_forbidden_neigbours(ingredient_name, potential_neighbors)
    sorted_neighbors = custom_potential_neighbors_sort(potential_neighbors)
    # keep only labels that appeared at least `thresh` times
    sorted_neighbors2 = []
    for key, value in sorted_neighbors:
        if len(value) >= thresh:
            sorted_neighbors2.append((key, value))
    # sorted_neighbors = [(key, value) for key, value in sorted_neighbors if len(value) >= len(ingredient_embeddings)] # remove too rare ones
    # further removal
    # NOTE(review): raises IndexError here when sorted_neighbors2 is empty;
    # that is caught by the broad except below and turned into a None return
    relative_lengths = [len(elem[1]) / (len(sorted_neighbors2[0][1])) for elem in sorted_neighbors2]
    final_neighbors = []
    for idx in range(len(relative_lengths)):
        if relative_lengths[idx] >= 0.0: # Currently doesn't sort anything out
            final_neighbors.append(sorted_neighbors2[idx])
    try:
        # zip(*...) transposes [(name, dists), ...] into (names, dists_lists)
        return list(zip(*final_neighbors))[0]
    except Exception as e:
        # no qualifying neighbors — caller treats None as "no substitutes found"
        return None
def clean_ingredient_name(ingredient_name, normalization_fixes):
    """Split an underscore-joined name, apply normalization fixes per word, rejoin with spaces."""
    fixed_words = [
        normalization_fixes.get(word, word)
        for word in ingredient_name.split('_')
    ]
    return ' '.join(fixed_words)
def clean_substitutes(subtitutes, normalization_fixes):
    """Apply clean_ingredient_name to every substitute and return the cleaned list."""
    return [clean_ingredient_name(substitute, normalization_fixes) for substitute in subtitutes]
# def test_eval():
# return ["Zucker", "Eier", "Reis", "Spaghetti", "Wein", "Gouda_junger"]
def main():
    """Generate substitute candidates for every ingredient of each listed model.

    For each model directory: compute (or load cached) BERT embeddings for all
    ingredients, fit a (approximate) kNN classifier over them, query the
    nearest neighbors of every ingredient, and dump the resulting
    {ingredient: [substitutes]} dict to <model>/eval/substitute_pairs_<thresh>.json.
    """
    # models = ["Versions/vers1/", "Versions/vers2/"]
    # models = ["final_Versions/models/vers1/", "final_Versions/models/vers2/", "final_Versions/models/vers3/"]
    models = ["final_Versions/models/vers2/"]
    # minimum number of times a label must appear as a neighbor to count
    thresh = 100
    # models = ["test/"]
    # os.makedirs('data/eval')
    # test_substitute_pairs_path = 'Versions/test_substitute_pairs.json'
    # normalization_fixes_path = Path('data/eval/normalization_correction.json')
    # at most this many embeddings (sentences) are kept per ingredient
    max_embedding_count = 100
    # image_embedding_dim = 768
    # use the Annoy-based approximate classifier instead of exact sklearn kNN
    approx_knn = True
    # compare models
    for curr_model in models:
        # os.makedirs(curr_model + "eval/")
        substitute_pairs_path = curr_model + "eval/substitute_pairs_" + str(thresh) + ".json"
        # get embeddings for all ingredients
        ingredients_to_embeddings = generate_food_embedding_dict(max_sentence_count=max_embedding_count, model_path=curr_model+"output/", eval_path=curr_model + "eval/", dataset_path=curr_model+"dataset/")
        all_ingredient_embeddings = []
        all_ingredient_labels = []
        # make list of all ingredients and all embeddings
        for key, value in ingredients_to_embeddings.items():
            all_ingredient_embeddings.append(value)
            # one label per embedding row so indices line up after concatenation
            all_ingredient_labels.extend([key] * len(value))
        all_ingredient_embeddings = np.concatenate(all_ingredient_embeddings)
        all_ingredient_labels = np.stack(all_ingredient_labels)
        # get knn classifier
        if approx_knn:
            knn_classifier: Union[KNNClassifier, ApproxKNNClassifier] = ApproxKNNClassifier(
                all_ingredient_embeddings=all_ingredient_embeddings,
                max_embedding_count=max_embedding_count, save_path=Path(curr_model + "eval/" + 'approx_knn_classifier.ann'))
        else:
            knn_classifier: Union[KNNClassifier, ApproxKNNClassifier] = KNNClassifier(
                all_ingredient_embeddings=all_ingredient_embeddings,
                max_embedding_count=max_embedding_count, save_path=Path(curr_model + "eval/" + 'knn_classifier.joblib'))
        # get substitutes via knn classifier
        substitute_pairs = set()
        none_counter = 0
        subst_dict = {}
        for ingredient_name in tqdm(ingredients_to_embeddings.keys(), total=len(ingredients_to_embeddings)):
            substitutes = get_nearest_N_neigbours(ingredient_name=ingredient_name,
                                                  ingredients_to_embeddings=ingredients_to_embeddings,
                                                  all_ingredient_labels=all_ingredient_labels,
                                                  knn_classifier=knn_classifier, thresh=thresh)
            # None means no neighbor cleared the threshold for this ingredient
            if substitutes is None:
                none_counter += 1
                subst_dict[ingredient_name] = []
            else:
                subst_dict[ingredient_name] = list(substitutes)
            #
            # cleaned_substitutes = clean_substitutes(substitutes, normalization_fixes)
            # for cleaned_substitute in cleaned_substitutes:
            #     substitute_pairs.add((clean_ingredient_name(ingredient_name, normalization_fixes), cleaned_substitute))
        with open(substitute_pairs_path, 'w') as f:
            json.dump(subst_dict, f, ensure_ascii=False, indent=4)
        print(f'Nones: {none_counter}')
    # output = {}
    # for ing in ingredients:
    #     output[ing] = []
    # for model in all_subs.keys():
    #     for ing in ingredients:
    #         output[ing].append(all_subs[model][ing])
    #
    # with open(test_substitute_pairs_path, 'w') as f:
    #     json.dump(output, f, ensure_ascii=False, indent=4)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,48 @@
import json
import statistics
def main():
    """Print substitute-count and occurrence statistics for the ground truth."""
    ground_truth_path = "data/ground_truth.json"
    # ground_truth_path = "evaluation/engl_data/engl_ground_truth.json"
    with open(ground_truth_path, "r") as whole_json_file:
        ground_truth_dict = json.load(whole_json_file)
    ingredients_path = "data/mult_ingredients_nice.json"
    # ingredients_path = "data/cleaned_steps_occurrance.json"
    with open(ingredients_path, "r") as whole_json_file:
        ingredients_occurrences = json.load(whole_json_file)
    synonyms_path = "data/synonyms.json"
    with open(synonyms_path, "r") as whole_json_file:
        synonyms_dict = json.load(whole_json_file)
    # ingredients considered specific to German cuisine, tallied separately
    german_words = ["Spätzle", "Schwarzbrot", "Schupfnudeln", "Bratwürste_Nürnberger", "Vanillinzucker", "Bier", "Semmelknödel", "Rote_Bete", "Eisbeine", "Spargel_weiß"]
    german_total = 0
    other_total = 0
    all_counts = []
    occurrence_count = []
    for base, substitutes in ground_truth_dict.items():
        # print(base + " substitutes: " + str(len(substitutes)))
        all_counts.append(len(substitutes))
        # fold the synonyms' occurrences into the base ingredient's count
        curr_occurrences = ingredients_occurrences[base]
        for syn in synonyms_dict.get(base, []):
            curr_occurrences += ingredients_occurrences[syn]
        occurrence_count.append(curr_occurrences)
        print(base + " occurrences: " + str(curr_occurrences))
        if base in german_words:
            german_total += len(substitutes)
        else:
            other_total += len(substitutes)
    print("Average: " + str(statistics.mean(all_counts)))
    print("Median: " + str(statistics.median(all_counts)))
    print("Standard deviation: " + str(statistics.stdev(all_counts)))
    print("Min: " + str(min(all_counts)))
    print("Max: " + str(max(all_counts)))
    # print("german total: " + str(german_total))
    # print("other total: " + str(other_total))


main()

View File

@@ -0,0 +1,45 @@
# adapted from:
# Pellegrini., C., E. Özsoy., M. Wintergerst., and G. Groh. (2021).
# “Exploiting Food Embeddings for Ingredient Substitution.”
# In: Proceedings of the 14th International Joint Conference on Biomedical
# Engineering Systems and Technologies - Volume 5: HEALTHINF, INSTICC.
# SciTePress, pp. 6777. isbn: 978-989-758-490-9. doi: 10.5220/0010202000670077.
from pathlib import Path
import numpy as np
from annoy import AnnoyIndex
from tqdm import tqdm
# Full guide https://github.com/spotify/annoy
class ApproxKNNClassifier:
    """Approximate nearest-neighbor index over ingredient embeddings, backed by Annoy.

    The index uses angular distance and is persisted to *save_path*; when the
    file already exists it is memory-mapped instead of rebuilt.
    """

    def __init__(self, all_ingredient_embeddings, max_embedding_count,
                 save_path=Path('data/eval/approx_knn_classifier.ann'), n_trees=10):
        # embedding dimensionality, taken from the last axis of the input array
        vector_length = all_ingredient_embeddings.shape[-1]
        self.max_embedding_count = max_embedding_count
        if save_path.exists():
            print('Loading Existing Approx Classifier')
            self.approx_knn_classifier = AnnoyIndex(vector_length, 'angular')
            self.approx_knn_classifier.load(str(save_path)) # super fast, will just mmap the file
        else:
            # To make sure we don't just get ourselves: add max_embedding_count
            self.approx_knn_classifier = AnnoyIndex(vector_length, 'angular') # Length of item vector that will be indexed
            for i in tqdm(range(len(all_ingredient_embeddings)), total=len(all_ingredient_embeddings), desc='Creating Approx Classifier'):
                self.approx_knn_classifier.add_item(i, all_ingredient_embeddings[i])
            self.approx_knn_classifier.build(n_trees)
            print('Saving Approx Classifier')
            self.approx_knn_classifier.save(str(save_path))

    def k_nearest_neighbors(self, ingredient_embeddings):
        """Query the index for each embedding; returns (distances, indices) arrays.

        Both returned arrays have shape (len(ingredient_embeddings),
        max_embedding_count + 200), mirroring the exact KNNClassifier interface.
        """
        all_indices, all_distances = [], []
        for idx, ingredient_embedding in enumerate(
                ingredient_embeddings): # search_k gives you a run-time tradeoff between better accuracy and speed currently defaults
            indices, distances = self.approx_knn_classifier.get_nns_by_vector(ingredient_embedding, self.max_embedding_count + 200, include_distances=True)
            all_indices.append(indices)
            all_distances.append(distances)
        return np.stack(all_distances), np.stack(all_indices)

View File

@@ -0,0 +1,152 @@
# adapted from:
# Pellegrini., C., E. Özsoy., M. Wintergerst., and G. Groh. (2021).
# “Exploiting Food Embeddings for Ingredient Substitution.”
# In: Proceedings of the 14th International Joint Conference on Biomedical
# Engineering Systems and Technologies - Volume 5: HEALTHINF, INSTICC.
# SciTePress, pp. 6777. isbn: 978-989-758-490-9. doi: 10.5220/0010202000670077.
import json
import pickle
import random
import re
from collections import defaultdict
from pathlib import Path
import numpy as np
import torch
from tqdm import tqdm
from evaluation.helpers.prediction_model import PredictionModel
def _generate_food_sentence_dict(model_path):
    """Map every known food item to the instruction sentences that mention it.

    Reads the known ingredients from data/mult_ingredients_nice.json and the
    train/test instruction sentences from *model_path*; sentences longer than
    100 whitespace-separated tokens are discarded.
    """
    with open('data/mult_ingredients_nice.json', "r") as f:
        food_items = json.load(f)
    food_items_set = set(food_items.keys())
    with open(model_path + 'training_data.txt', "r") as f:
        train_instruction_sentences = f.read().splitlines()
    # remove overlong sentences
    train_instruction_sentences = [s for s in train_instruction_sentences if len(s.split()) <= 100]
    with open(model_path + 'testing_data.txt', "r") as f:
        test_instruction_sentences = f.read().splitlines()
    # remove overlong sentences
    test_instruction_sentences = [s for s in test_instruction_sentences if len(s.split()) <= 100]
    instruction_sentences = train_instruction_sentences + test_instruction_sentences
    food_to_sentences_dict = defaultdict(list)
    for sentence in instruction_sentences:
        # NOTE(review): the pattern "[^\w]-'" matches a non-word char followed by
        # the literal "-'" — presumably "[^\w\-']" (split on anything but word
        # chars, hyphen, apostrophe) was intended; confirm before changing.
        words = re.sub("[^\w]-'", " ", sentence).split()
        for word in words:
            if word in food_items_set:
                food_to_sentences_dict[word].append(sentence)
    return food_to_sentences_dict
def _random_sample_with_min_count(population, k):
if len(population) <= k:
return population
else:
return random.sample(population, k)
def sample_random_sentence_dict(model_path, max_sentence_count):
    """Build the food→sentences dict and cap every list at max_sentence_count random picks."""
    full_sentence_dict = _generate_food_sentence_dict(model_path=model_path)
    # only keep up to max_sentence_count randomly selected sentences per food item
    sampled = {}
    for food, sentences in full_sentence_dict.items():
        sampled[food] = _random_sample_with_min_count(sentences, max_sentence_count)
    return sampled
def _map_ingredients_to_input_ids(model_path):
    """Map every known ingredient name to its tokenizer input id.

    Loads the ingredient vocabulary from data/mult_ingredients_nice.json and
    converts each name to an id with the prediction model's tokenizer.
    Returns a dict {ingredient_name: token_id}.
    """
    with open('data/mult_ingredients_nice.json', "r") as f:
        ingredients = json.load(f).keys()
    # the tokenizer is taken from the fine-tuned model at model_path
    model = PredictionModel(model_path)
    ingredient_ids = model.tokenizer.convert_tokens_to_ids(ingredients)
    ingredient_ids_dict = dict(zip(ingredients, ingredient_ids))
    return ingredient_ids_dict
def _merge_synonmys(food_to_embeddings_dict, max_sentence_count):
synonmy_replacements_path = Path('foodbert_embeddings/data/synonmy_replacements.json')
synonmy_replacements = {}
merged_dict = defaultdict(list)
# Merge ingredients
for key, value in food_to_embeddings_dict.items():
if key in synonmy_replacements:
key_to_use = synonmy_replacements[key]
else:
key_to_use = key
merged_dict[key_to_use].append(value)
merged_dict = {k: np.concatenate(v) for k, v in merged_dict.items()}
# When embedding count exceeds maximum allowed, reduce back to requested count
for key, value in merged_dict.items():
if len(value) > max_sentence_count:
index = np.random.choice(value.shape[0], max_sentence_count, replace=False)
new_value = value[index]
merged_dict[key] = new_value
return merged_dict
def generate_food_embedding_dict(max_sentence_count, model_path, eval_path='data/eval/', dataset_path="output"):
    '''
    Creates a dict where the keys are the ingredients and the values are a list of embeddings with length max_sentence_count or less if there are less occurences
    These embeddings are used in generate_substitutes.py to predict substitutes

    The result is cached as <eval_path>/food_embeddings_dict.pkl: if that file
    exists it is loaded and returned directly; otherwise the embeddings are
    computed with the model at model_path and the cache is written.
    '''
    food_to_embeddings_dict_path = Path(eval_path + 'food_embeddings_dict.pkl')
    # fast path: return the cached embeddings if they were computed before
    if food_to_embeddings_dict_path.exists():
        with food_to_embeddings_dict_path.open('rb') as f:
            food_to_embeddings_dict = pickle.load(f)
        # # delete keys if we deleted ingredients
        # old_ingredients = set(food_to_embeddings_dict.keys())
        # with open('train_model/vocab/used_ingredients.json', "r") as f:
        #     new_ingredients = set(json.load(f))
        #
        # keys_to_delete = old_ingredients.difference(new_ingredients)
        # for key in keys_to_delete:
        #     food_to_embeddings_dict.pop(key, None) # delete key if it exists
        #
        # # merge new synonyms
        # food_to_embeddings_dict = _merge_synonmys(food_to_embeddings_dict, max_sentence_count)
        #
        # with food_to_embeddings_dict_path.open('wb') as f:
        #     pickle.dump(food_to_embeddings_dict, f) # Overwrite dict with cleaned version
        return food_to_embeddings_dict
    print('Sampling Random Sentences')
    # up to max_sentence_count sentences per food item, sampled at random
    food_to_sentences_dict_random_samples = sample_random_sentence_dict(model_path=dataset_path, max_sentence_count=max_sentence_count)
    food_to_embeddings_dict = defaultdict(list)
    print('Mapping Ingredients to Input Ids')
    all_ingredient_ids = _map_ingredients_to_input_ids(model_path=model_path)
    prediction_model = PredictionModel(model_path=model_path)
    for food, sentences in tqdm(food_to_sentences_dict_random_samples.items(), total=len(food_to_sentences_dict_random_samples),
                                desc='Calculating Embeddings for Food items'):
        embeddings, ingredient_ids = prediction_model.predict_embeddings(sentences)
        # get embedding of food word
        embeddings_flat = embeddings.view((-1, 768))
        ingredient_ids_flat = torch.stack(ingredient_ids).flatten()
        food_id = all_ingredient_ids[food]
        # select only the token positions that belong to this food word
        food_embeddings = embeddings_flat[ingredient_ids_flat == food_id].cpu().numpy()
        food_to_embeddings_dict[food].extend(food_embeddings)
    food_to_embeddings_dict = {k: np.stack(v) for k, v in food_to_embeddings_dict.items()}
    # Clean synonmy
    food_to_embeddings_dict = _merge_synonmys(food_to_embeddings_dict, max_sentence_count)
    # persist the dict so subsequent runs hit the cache above
    with food_to_embeddings_dict_path.open('wb') as f:
        pickle.dump(food_to_embeddings_dict, f)
    return food_to_embeddings_dict

View File

@@ -0,0 +1,38 @@
# adapted from:
# Pellegrini., C., E. Özsoy., M. Wintergerst., and G. Groh. (2021).
# “Exploiting Food Embeddings for Ingredient Substitution.”
# In: Proceedings of the 14th International Joint Conference on Biomedical
# Engineering Systems and Technologies - Volume 5: HEALTHINF, INSTICC.
# SciTePress, pp. 6777. isbn: 978-989-758-490-9. doi: 10.5220/0010202000670077.
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset
class InstructionsDataset(Dataset):
    """Torch dataset of tokenized instruction sentences, padded to equal length."""

    def __init__(self, tokenizer, sentences):
        # sentences are tokenized once up front; anything past 512 tokens is truncated
        self.tokenizer = tokenizer
        batch_encoding = tokenizer.batch_encode_plus(sentences, add_special_tokens=True, max_length=512, truncation=True)
        self.examples = batch_encoding["input_ids"]
        self.examples = self._tensorize_batch([torch.tensor(elem) for elem in self.examples])

    def _tensorize_batch(self, examples) -> torch.Tensor:
        """Stack the example tensors, padding to the longest one when lengths differ."""
        length_of_first = examples[0].size(0)
        are_tensors_same_length = all(x.size(0) == length_of_first for x in examples)
        if are_tensors_same_length:
            # no padding needed — a plain stack yields a (batch, seq_len) tensor
            return torch.stack(examples, dim=0)
        else:
            # padding requires the tokenizer to define a pad token
            if self.tokenizer._pad_token is None:
                raise ValueError(
                    "You are attempting to pad samples but the tokenizer you are using"
                    f" ({self.tokenizer.__class__.__name__}) does not have one."
                )
            return pad_sequence(examples, batch_first=True, padding_value=self.tokenizer.pad_token_id)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        return self.examples[i]

View File

@@ -0,0 +1,36 @@
# adapted from:
# Pellegrini., C., E. Özsoy., M. Wintergerst., and G. Groh. (2021).
# “Exploiting Food Embeddings for Ingredient Substitution.”
# In: Proceedings of the 14th International Joint Conference on Biomedical
# Engineering Systems and Technologies - Volume 5: HEALTHINF, INSTICC.
# SciTePress, pp. 6777. isbn: 978-989-758-490-9. doi: 10.5220/0010202000670077.
from pathlib import Path
import joblib
from sklearn.neighbors import NearestNeighbors
class KNNClassifier:
    """Thin wrapper around sklearn ``NearestNeighbors`` over ingredient embeddings.

    The fitted classifier is persisted to *save_path* and reloaded on the
    next run, so the expensive fit happens only once per embedding set.
    """

    def __init__(self, all_ingredient_embeddings, max_embedding_count,
                 save_path=Path('data/eval/knn_classifier.joblib')):
        """Load a cached classifier from *save_path*, or fit and cache a new one.

        :param all_ingredient_embeddings: array-like of embeddings to index
        :param max_embedding_count: max embeddings per ingredient; neighbor
            count is padded by this so queries do not only return themselves
        :param save_path: cache location for the fitted classifier
        """
        if save_path.exists():
            print('Loading Existing Classifier')
            self.knn_classifier: NearestNeighbors = joblib.load(save_path)
        else:
            print('Training New Classifier')
            # To make sure we don't just get ourselves: add max_embedding_count
            self.knn_classifier: NearestNeighbors = NearestNeighbors(n_neighbors=max_embedding_count + 200, n_jobs=12,
                                                                     algorithm='brute')  # kd_tree, ball_tree or brute
            self.knn_classifier.fit(all_ingredient_embeddings)
            print('Saving Classifier')
            # joblib.dump fails if the target directory is missing, so create it first.
            save_path.parent.mkdir(parents=True, exist_ok=True)
            joblib.dump(self.knn_classifier, save_path)
        # NOTE(review): _fit_method is a private sklearn attribute, used here for logging only.
        print(f'\nKNN with: {self.knn_classifier._fit_method} and leaf size: {self.knn_classifier.leaf_size}\n')

    def k_nearest_neighbors(self, ingredient_embeddings):
        """Return ``(distances, indices)`` of the pre-configured neighbor count."""
        distances, indices = self.knn_classifier.kneighbors(ingredient_embeddings, return_distance=True)
        return distances, indices

View File

@@ -0,0 +1,53 @@
# adapted from:
# Pellegrini., C., E. Özsoy., M. Wintergerst., and G. Groh. (2021).
# “Exploiting Food Embeddings for Ingredient Substitution.”
# In: Proceedings of the 14th International Joint Conference on Biomedical
# Engineering Systems and Technologies - Volume 5: HEALTHINF, INSTICC.
# SciTePress, pp. 6777. isbn: 978-989-758-490-9. doi: 10.5220/0010202000670077.
import json
import torch
from torch.utils.data import DataLoader
from transformers import BertModel, BertTokenizer
from evaluation.helpers.instructions_dataset import InstructionsDataset
class PredictionModel:
    """Runs a fine-tuned BERT to produce contextual token embeddings for recipe text."""

    def __init__(self, model_path=''):
        """Load the BERT checkpoint and the custom ingredient-aware tokenizer.

        :param model_path: directory containing the fine-tuned BERT weights
        """
        self.model: BertModel = BertModel.from_pretrained(
            pretrained_model_name_or_path=model_path)
        with open('train_model/vocab/used_ingredients.json', 'r') as f:
            used_ingredients = json.load(f)
        # never_split keeps each ingredient as one token, so every ingredient
        # maps to exactly one input id (and therefore one embedding position).
        self.tokenizer = BertTokenizer(vocab_file='train_model/vocab/bert_vocab.txt', do_lower_case=False,
                                       max_len=512, never_split=used_ingredients, truncation=True)
        # Run on GPU when available, otherwise CPU.
        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
        self.model.to(self.device)

    def predict_embeddings(self, sentences):
        """Embed *sentences* and return their token embeddings plus input ids.

        :param sentences: iterable of instruction sentences
        :return: tuple of (stacked per-token embeddings, list of input-id
            tensors aligned one-to-one with the embedding rows)
        """
        dataset = InstructionsDataset(tokenizer=self.tokenizer, sentences=sentences)
        dataloader = DataLoader(dataset, batch_size=100, pin_memory=True)
        embeddings = []
        ingredient_ids = []
        for batch in dataloader:
            batch = batch.to(self.device)
            with torch.no_grad():  # inference only — no gradients needed
                embeddings_batch = self.model(batch)
            # embeddings_batch[0] is the first element of the model output
            # (the per-token hidden states for the batch).
            embeddings.extend(embeddings_batch[0])
            ingredient_ids.extend(batch)
        # NOTE(review): torch.stack assumes every batch was padded to the same
        # sequence length across dataloader batches — confirm for multi-batch input.
        return torch.stack(embeddings), ingredient_ids

    def compute_embedding_for_ingredient(self, sentence, ingredient_name):
        """Return the embedding of *ingredient_name*'s first occurrence in *sentence*.

        NOTE(review): assumes the ingredient token actually occurs in the
        sentence; otherwise ``food_embedding`` is empty and ``[0]`` raises.
        """
        embeddings, ingredient_ids = self.predict_embeddings([sentence])
        embeddings_flat = embeddings.view((-1, 768))  # flatten (batch, seq) positions
        ingredient_ids_flat = torch.stack(ingredient_ids).flatten()
        food_id = self.tokenizer.convert_tokens_to_ids(ingredient_name)
        # Select every position whose input id equals the ingredient's id.
        food_embedding = embeddings_flat[ingredient_ids_flat == food_id].cpu().numpy()
        return food_embedding[0]

View File

@@ -0,0 +1,166 @@
import json
# Paths to the German synonym map and the ground-truth substitute pairs.
synonyms_path = "data/synonyms.json"
ground_truth_path = "data/ground_truth.json"
# Ingredients that act as whole categories: synonyms of these are only
# collapsed onto the category name under the special rules below.
category_subs = ["Paprika", "Apfel", "Gouda", "Huhn", "Gans", "Kaninchen", "Kalbfleisch", "Schweinefleisch", "Ente", "Lamm",
                 "Pute", "Wildfleisch", "Rindfleisch", "Speck", "Fisch", "Kohl", "Blattsalat", "Schokolade", "Kuvertüre", "Kuchenglasur",
                 "Honig", "Sirup", "Joghurt", "Essig", "Traubensaft", "Truthahn", "Wein"]
# NOTE(review): loaded at import time — this module has side effects on import.
with open("final_Versions/models/vers3/eval/complete_substitute_pairs_50.json") as f:
    sub_dict = json.load(f)
def engl_combined_substitutes_dict(found_substitutes_dict):
    """Clean the English substitute lists.

    Drops substitutes identical to the ingredient, drops substitutes that are
    synonyms of the ingredient, and replaces any substitute that is itself a
    synonym by its (unique) base ingredient name. Ambiguous synonyms (more
    than one base) are printed and skipped.

    :param found_substitutes_dict: ingredient -> list of found substitutes
    :return: new dict with the same keys and deduplicated, cleaned lists
    """
    with open("evaluation/engl_data/engl_synonyms.json", "r") as whole_json_file:
        synonyms_dict = json.load(whole_json_file)
    # The English ground-truth file was previously loaded here but never used;
    # the dead read has been removed.
    reversed_synonyms_dict = get_reversed_syn_dict(is_engl=True)
    new_found_sub_dict = {}
    for ingredient in found_substitutes_dict.keys():
        new_found_sub_dict[ingredient] = []
        current_subs = set()
        for sub in found_substitutes_dict[ingredient]:
            # skip substitute if it is the same as the ingredient
            if sub == ingredient:
                continue
            # skip substitute if it is a synonym of the ingredient
            if ingredient in synonyms_dict.keys() and sub in synonyms_dict[ingredient]:
                continue
            # if the substitute is a synonym of something, map it to its base
            if sub in reversed_synonyms_dict.keys():
                if len(reversed_synonyms_dict[sub]) == 1:
                    current_subs.add(reversed_synonyms_dict[sub][0])
                else:
                    # ambiguous synonym: report it and skip
                    print(sub + " is in " + str(reversed_synonyms_dict[sub]))
            else:
                current_subs.add(sub)
        new_found_sub_dict[ingredient] += list(current_subs)
    return new_found_sub_dict
# merges substitutes with their synonyms, replaces synonyms with base synonym
def combined_substitutes_dict(found_substitutes_dict):
    """Clean the German substitute lists, respecting category ingredients.

    Drops substitutes identical to the ingredient and substitutes that are
    synonyms of the ingredient. A substitute that is a synonym of exactly one
    base is replaced by that base, unless the base is a category ingredient
    (``category_subs``) and the ingredient itself is not — then the original
    substitute is kept. With two candidate bases, the category/non-category
    one is chosen depending on whether the ingredient is a category; other
    ambiguities are printed and skipped.

    :param found_substitutes_dict: ingredient -> list of found substitutes
    :return: new dict with the same keys and deduplicated, cleaned lists
    """
    with open(synonyms_path, "r") as whole_json_file:
        synonyms_dict = json.load(whole_json_file)
    # The ground-truth file was previously loaded here but never used;
    # the dead read has been removed.
    reversed_synonyms_dict = get_reversed_syn_dict()
    new_found_sub_dict = {}
    for ingredient in found_substitutes_dict.keys():
        new_found_sub_dict[ingredient] = []
        current_subs = set()
        for sub in found_substitutes_dict[ingredient]:
            # skip substitute if it is the same as the ingredient
            if sub == ingredient:
                continue
            # skip substitute if it is a synonym of the ingredient
            if ingredient in synonyms_dict.keys() and sub in synonyms_dict[ingredient]:
                continue
            # if the substitute is a synonym of something, map it to a base
            if sub in reversed_synonyms_dict.keys():
                if len(reversed_synonyms_dict[sub]) == 1:
                    if reversed_synonyms_dict[sub][0] not in category_subs:
                        current_subs.add(reversed_synonyms_dict[sub][0])
                    else:
                        # only collapse onto a category base if the ingredient
                        # itself is a category; otherwise keep the substitute
                        if ingredient in category_subs:
                            current_subs.add(reversed_synonyms_dict[sub][0])
                        else:
                            current_subs.add(sub)
                elif len(reversed_synonyms_dict[sub]) == 2:
                    if ingredient in category_subs:
                        # pick the category base when exactly one of the two is one
                        if reversed_synonyms_dict[sub][0] in category_subs and reversed_synonyms_dict[sub][1] not in category_subs:
                            current_subs.add(reversed_synonyms_dict[sub][0])
                        elif reversed_synonyms_dict[sub][1] in category_subs and reversed_synonyms_dict[sub][0] not in category_subs:
                            current_subs.add(reversed_synonyms_dict[sub][1])
                        else:
                            print(reversed_synonyms_dict[sub])
                    else:
                        # pick the non-category base when exactly one of the two is one
                        if reversed_synonyms_dict[sub][0] in category_subs and reversed_synonyms_dict[sub][1] not in category_subs:
                            current_subs.add(reversed_synonyms_dict[sub][1])
                        elif reversed_synonyms_dict[sub][1] in category_subs and reversed_synonyms_dict[sub][0] not in category_subs:
                            current_subs.add(reversed_synonyms_dict[sub][0])
                        else:
                            print(reversed_synonyms_dict[sub])
                else:
                    # three or more bases: report and skip
                    print(sub + " is in " + str(reversed_synonyms_dict[sub]))
            else:
                current_subs.add(sub)
        new_found_sub_dict[ingredient] += list(current_subs)
    return new_found_sub_dict
# combine substitutes found for an ingredient and its synonyms
# also combine synonyms in substitutes
def combine_all_synonyms(found_substitutes_dict):
    """Merge the substitute lists of each ingredient with those of its synonyms.

    Every synonym's substitutes are folded into its base ingredient's entry
    (category synonyms excluded via ``get_reversed_syn_dict_no_cat``), then
    the combined dict is cleaned via ``combined_substitutes_dict``.

    :param found_substitutes_dict: ingredient -> list of found substitutes
    :return: cleaned dict keyed by base ingredient names
    """
    reversed_synonyms_dict = get_reversed_syn_dict_no_cat()
    merged = {}
    for ingredient, subs in found_substitutes_dict.items():
        if ingredient in reversed_synonyms_dict:
            # NOTE(review): only the first base is used when a synonym maps
            # to several bases — confirm this is intended.
            target = reversed_synonyms_dict[ingredient][0]
        else:
            target = ingredient
        # setdefault avoids the KeyError the original raised when a synonym's
        # base ingredient was not itself a key of found_substitutes_dict.
        merged.setdefault(target, set()).update(subs)
    new_found_sub_dict_list = {ingredient: list(subs) for ingredient, subs in merged.items()}
    return combined_substitutes_dict(new_found_sub_dict_list)
def get_reversed_syn_dict(is_engl=False):
    """Invert the synonym map: synonym -> list of base ingredients.

    :param is_engl: load the English synonym file instead of the German one
    :return: dict mapping each synonym to all bases that list it
    """
    path = "evaluation/engl_data/engl_synonyms.json" if is_engl else synonyms_path
    with open(path, "r") as whole_json_file:
        synonyms_dict = json.load(whole_json_file)
    reversed_synonyms_dict = {}
    for base, synonyms in synonyms_dict.items():
        for synonym in synonyms:
            reversed_synonyms_dict.setdefault(synonym, []).append(base)
    return reversed_synonyms_dict
def get_reversed_syn_dict_no_cat():
    """Invert the German synonym map, skipping category ingredients.

    Like ``get_reversed_syn_dict`` but bases listed in ``category_subs`` are
    left out, so their synonyms are not collapsed.

    :return: dict mapping each synonym to its non-category base ingredients
    """
    with open(synonyms_path, "r") as whole_json_file:
        synonyms_dict = json.load(whole_json_file)
    reversed_synonyms_dict = {}
    for base, synonyms in synonyms_dict.items():
        if base in category_subs:
            continue
        for synonym in synonyms:
            reversed_synonyms_dict.setdefault(synonym, []).append(base)
    return reversed_synonyms_dict
# Run the synonym cleanup on the loaded substitute pairs; the return value is
# discarded, so this call is only useful for its printed diagnostics.
combined_substitutes_dict(sub_dict)

View File

@@ -0,0 +1,42 @@
{
"carrot": {
"engl": 0,
"ger": 0
},
"cherry": {
"engl": 3,
"ger": 0
},
"chicken": {
"engl": 1,
"ger": 4
},
"parsley": {
"engl": 3,
"ger": 0
},
"chocolate": {
"engl": 3,
"ger": 0
},
"bacon": {
"engl": 2,
"ger": 0
},
"kale": {
"engl": 2,
"ger": 0
},
"sugar": {
"engl": 2,
"ger": 0
},
"brie": {
"engl": 3,
"ger": 0
},
"turkey": {
"engl": 3,
"ger": 0
}
}

View File

@@ -0,0 +1,42 @@
{
"carrot": {
"engl": 0,
"ger": 0
},
"cherry": {
"engl": 3,
"ger": 0
},
"chicken": {
"engl": 1,
"ger": 0
},
"parsley": {
"engl": 3,
"ger": 0
},
"chocolate": {
"engl": 3,
"ger": 0
},
"bacon": {
"engl": 2,
"ger": 0
},
"kale": {
"engl": 2,
"ger": 0
},
"sugar": {
"engl": 2,
"ger": 0
},
"brie": {
"engl": 3,
"ger": 0
},
"turkey": {
"engl": 3,
"ger": 0
}
}

View File

@@ -0,0 +1,42 @@
{
"carrot": {
"engl": 0,
"ger": 0
},
"cherry": {
"engl": 3,
"ger": 0
},
"chicken": {
"engl": 1,
"ger": 0
},
"parsley": {
"engl": 3,
"ger": 0
},
"chocolate": {
"engl": 3,
"ger": 0
},
"bacon": {
"engl": 2,
"ger": 0
},
"kale": {
"engl": 2,
"ger": 0
},
"sugar": {
"engl": 2,
"ger": 0
},
"brie": {
"engl": 3,
"ger": 0
},
"turkey": {
"engl": 3,
"ger": 0
}
}

View File

@@ -0,0 +1,42 @@
{
"carrot": {
"engl": 0,
"ger": 0
},
"cherry": {
"engl": 3,
"ger": 0
},
"chicken": {
"engl": 1,
"ger": 0
},
"parsley": {
"engl": 3,
"ger": 0
},
"chocolate": {
"engl": 3,
"ger": 0
},
"bacon": {
"engl": 2,
"ger": 0
},
"kale": {
"engl": 2,
"ger": 0
},
"sugar": {
"engl": 2,
"ger": 0
},
"brie": {
"engl": 3,
"ger": 0
},
"turkey": {
"engl": 3,
"ger": 0
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,94 @@
cap at 30 set to True
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.417429094236048
number of ingredients in dataset: 4313
number of nones: 3035
ingredients with over 30 substitutes: 71
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 2.044284720612103
cap at 30 set to False
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.440988106129917
number of ingredients in dataset: 4313
number of nones: 3035
ingredients with over 30 substitutes: 71
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 2.3382796197542315
--------------------------------------------
ground truth only:
cap at 30 set to True
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.417429094236048
number of ingredients in dataset: 40
number of nones: 29
ingredients with over 30 substitutes: 0
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 2.75
cap at 30 set to False
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.440988106129917
number of ingredients in dataset: 40
number of nones: 29
ingredients with over 30 substitutes: 0
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 2.75
================================
englisch:
cap at 30 set to True
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.417429094236048
number of ingredients in dataset: 4361
number of nones: 0
ingredients with over 30 substitutes: 22
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 6.433616143086448
cap at 30 set to False
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.440988106129917
number of ingredients in dataset: 4361
number of nones: 0
ingredients with over 30 substitutes: 22
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 6.457234579224949
--------------------------------------------
ground truth only:
cap at 30 set to True
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.417429094236048
number of ingredients in dataset: 10
number of nones: 0
ingredients with over 30 substitutes: 0
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 2.7
cap at 30 set to False
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.440988106129917
number of ingredients in dataset: 10
number of nones: 0
ingredients with over 30 substitutes: 0
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 2.7

View File

@@ -0,0 +1,42 @@
{
"carrot": {
"engl": 0,
"ger": 0
},
"cherry": {
"engl": 3,
"ger": 2
},
"chicken": {
"engl": 1,
"ger": 6
},
"parsley": {
"engl": 3,
"ger": 0
},
"chocolate": {
"engl": 3,
"ger": 0
},
"bacon": {
"engl": 2,
"ger": 0
},
"kale": {
"engl": 2,
"ger": 0
},
"sugar": {
"engl": 2,
"ger": 1
},
"brie": {
"engl": 3,
"ger": 0
},
"turkey": {
"engl": 3,
"ger": 0
}
}

View File

@@ -0,0 +1,42 @@
{
"carrot": {
"engl": 0,
"ger": 0
},
"cherry": {
"engl": 3,
"ger": 1
},
"chicken": {
"engl": 1,
"ger": 0
},
"parsley": {
"engl": 3,
"ger": 0
},
"chocolate": {
"engl": 3,
"ger": 0
},
"bacon": {
"engl": 2,
"ger": 0
},
"kale": {
"engl": 2,
"ger": 0
},
"sugar": {
"engl": 2,
"ger": 0
},
"brie": {
"engl": 3,
"ger": 0
},
"turkey": {
"engl": 3,
"ger": 0
}
}

View File

@@ -0,0 +1,42 @@
{
"carrot": {
"engl": 0,
"ger": 0
},
"cherry": {
"engl": 3,
"ger": 0
},
"chicken": {
"engl": 1,
"ger": 0
},
"parsley": {
"engl": 3,
"ger": 0
},
"chocolate": {
"engl": 3,
"ger": 0
},
"bacon": {
"engl": 2,
"ger": 0
},
"kale": {
"engl": 2,
"ger": 0
},
"sugar": {
"engl": 2,
"ger": 0
},
"brie": {
"engl": 3,
"ger": 0
},
"turkey": {
"engl": 3,
"ger": 0
}
}

View File

@@ -0,0 +1,42 @@
{
"carrot": {
"engl": 0,
"ger": 0
},
"cherry": {
"engl": 3,
"ger": 0
},
"chicken": {
"engl": 1,
"ger": 3
},
"parsley": {
"engl": 3,
"ger": 0
},
"chocolate": {
"engl": 3,
"ger": 0
},
"bacon": {
"engl": 2,
"ger": 0
},
"kale": {
"engl": 2,
"ger": 0
},
"sugar": {
"engl": 2,
"ger": 0
},
"brie": {
"engl": 3,
"ger": 0
},
"turkey": {
"engl": 3,
"ger": 0
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,94 @@
cap at 30 set to True
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.417429094236048
number of ingredients in dataset: 4313
number of nones: 2294
ingredients with over 30 substitutes: 272
ingredients with over 100 substitutes: 10
ingredients with over 1000 substitutes: 0
average number of substitutes: 4.630883375840482
cap at 30 set to False
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.440988106129917
number of ingredients in dataset: 4313
number of nones: 2294
ingredients with over 30 substitutes: 272
ingredients with over 100 substitutes: 10
ingredients with over 1000 substitutes: 0
average number of substitutes: 6.102480871782982
--------------------------------------------
ground truth only:
cap at 30 set to True
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.417429094236048
number of ingredients in dataset: 40
number of nones: 21
ingredients with over 30 substitutes: 1
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 5.375
cap at 30 set to False
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.440988106129917
number of ingredients in dataset: 40
number of nones: 21
ingredients with over 30 substitutes: 1
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 5.6
================================
englisch:
cap at 30 set to True
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.417429094236048
number of ingredients in dataset: 4361
number of nones: 0
ingredients with over 30 substitutes: 22
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 6.433616143086448
cap at 30 set to False
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.440988106129917
number of ingredients in dataset: 4361
number of nones: 0
ingredients with over 30 substitutes: 22
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 6.457234579224949
--------------------------------------------
ground truth only:
cap at 30 set to True
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.417429094236048
number of ingredients in dataset: 10
number of nones: 0
ingredients with over 30 substitutes: 0
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 2.7
cap at 30 set to False
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.440988106129917
number of ingredients in dataset: 10
number of nones: 0
ingredients with over 30 substitutes: 0
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 2.7

View File

@@ -0,0 +1,42 @@
{
"carrot": {
"engl": 0,
"ger": 3
},
"cherry": {
"engl": 3,
"ger": 4
},
"chicken": {
"engl": 1,
"ger": 5
},
"parsley": {
"engl": 3,
"ger": 8
},
"chocolate": {
"engl": 3,
"ger": 0
},
"bacon": {
"engl": 2,
"ger": 4
},
"kale": {
"engl": 2,
"ger": 7
},
"sugar": {
"engl": 2,
"ger": 0
},
"brie": {
"engl": 3,
"ger": 0
},
"turkey": {
"engl": 3,
"ger": 4
}
}

View File

@@ -0,0 +1,42 @@
{
"carrot": {
"engl": 0,
"ger": 3
},
"cherry": {
"engl": 3,
"ger": 3
},
"chicken": {
"engl": 1,
"ger": 0
},
"parsley": {
"engl": 3,
"ger": 0
},
"chocolate": {
"engl": 3,
"ger": 0
},
"bacon": {
"engl": 2,
"ger": 3
},
"kale": {
"engl": 2,
"ger": 6
},
"sugar": {
"engl": 2,
"ger": 0
},
"brie": {
"engl": 3,
"ger": 0
},
"turkey": {
"engl": 3,
"ger": 1
}
}

View File

@@ -0,0 +1,42 @@
{
"carrot": {
"engl": 0,
"ger": 0
},
"cherry": {
"engl": 3,
"ger": 1
},
"chicken": {
"engl": 1,
"ger": 2
},
"parsley": {
"engl": 3,
"ger": 0
},
"chocolate": {
"engl": 3,
"ger": 0
},
"bacon": {
"engl": 2,
"ger": 1
},
"kale": {
"engl": 2,
"ger": 6
},
"sugar": {
"engl": 2,
"ger": 0
},
"brie": {
"engl": 3,
"ger": 0
},
"turkey": {
"engl": 3,
"ger": 1
}
}

View File

@@ -0,0 +1,42 @@
{
"carrot": {
"engl": 0,
"ger": 0
},
"cherry": {
"engl": 3,
"ger": 2
},
"chicken": {
"engl": 1,
"ger": 5
},
"parsley": {
"engl": 3,
"ger": 2
},
"chocolate": {
"engl": 3,
"ger": 0
},
"bacon": {
"engl": 2,
"ger": 2
},
"kale": {
"engl": 2,
"ger": 7
},
"sugar": {
"engl": 2,
"ger": 0
},
"brie": {
"engl": 3,
"ger": 0
},
"turkey": {
"engl": 3,
"ger": 4
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,93 @@
cap at 30 set to True
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.417429094236048
number of ingredients in dataset: 4313
number of nones: 1604
ingredients with over 30 substitutes: 573
ingredients with over 100 substitutes: 4
ingredients with over 1000 substitutes: 0
average number of substitutes: 8.59169951309993
cap at 30 set to False
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.440988106129917
number of ingredients in dataset: 4313
number of nones: 1604
ingredients with over 30 substitutes: 573
ingredients with over 100 substitutes: 4
ingredients with over 1000 substitutes: 0
average number of substitutes: 11.539067934152563
--------------------------------------------
ground truth only:
cap at 30 set to True
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.417429094236048
number of ingredients in dataset: 40
number of nones: 3
ingredients with over 30 substitutes: 14
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 16.8
cap at 30 set to False
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.440988106129917
number of ingredients in dataset: 40
number of nones: 3
ingredients with over 30 substitutes: 14
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 23.25
================================
englisch:
cap at 30 set to True
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.417429094236048
number of ingredients in dataset: 4361
number of nones: 0
ingredients with over 30 substitutes: 22
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 6.433616143086448
cap at 30 set to False
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.440988106129917
number of ingredients in dataset: 4361
number of nones: 0
ingredients with over 30 substitutes: 22
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 6.457234579224949
--------------------------------------------
ground truth only:
cap at 30 set to True
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.417429094236048
number of ingredients in dataset: 10
number of nones: 0
ingredients with over 30 substitutes: 0
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 2.7
cap at 30 set to False
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.440988106129917
number of ingredients in dataset: 10
number of nones: 0
ingredients with over 30 substitutes: 0
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 2.7

View File

@@ -0,0 +1,42 @@
{
"carrot": {
"engl": 0,
"ger": 3
},
"cherry": {
"engl": 3,
"ger": 4
},
"chicken": {
"engl": 1,
"ger": 5
},
"parsley": {
"engl": 3,
"ger": 5
},
"chocolate": {
"engl": 3,
"ger": 0
},
"bacon": {
"engl": 2,
"ger": 1
},
"kale": {
"engl": 2,
"ger": 2
},
"sugar": {
"engl": 2,
"ger": 0
},
"brie": {
"engl": 3,
"ger": 0
},
"turkey": {
"engl": 3,
"ger": 3
}
}

View File

@@ -0,0 +1,42 @@
{
"carrot": {
"engl": 0,
"ger": 3
},
"cherry": {
"engl": 3,
"ger": 3
},
"chicken": {
"engl": 1,
"ger": 0
},
"parsley": {
"engl": 3,
"ger": 0
},
"chocolate": {
"engl": 3,
"ger": 0
},
"bacon": {
"engl": 2,
"ger": 0
},
"kale": {
"engl": 2,
"ger": 1
},
"sugar": {
"engl": 2,
"ger": 0
},
"brie": {
"engl": 3,
"ger": 0
},
"turkey": {
"engl": 3,
"ger": 1
}
}

View File

@@ -0,0 +1,42 @@
{
"carrot": {
"engl": 0,
"ger": 0
},
"cherry": {
"engl": 3,
"ger": 0
},
"chicken": {
"engl": 1,
"ger": 0
},
"parsley": {
"engl": 3,
"ger": 0
},
"chocolate": {
"engl": 3,
"ger": 0
},
"bacon": {
"engl": 2,
"ger": 0
},
"kale": {
"engl": 2,
"ger": 1
},
"sugar": {
"engl": 2,
"ger": 0
},
"brie": {
"engl": 3,
"ger": 0
},
"turkey": {
"engl": 3,
"ger": 1
}
}

View File

@@ -0,0 +1,42 @@
{
"carrot": {
"engl": 0,
"ger": 0
},
"cherry": {
"engl": 3,
"ger": 0
},
"chicken": {
"engl": 1,
"ger": 5
},
"parsley": {
"engl": 3,
"ger": 0
},
"chocolate": {
"engl": 3,
"ger": 0
},
"bacon": {
"engl": 2,
"ger": 0
},
"kale": {
"engl": 2,
"ger": 2
},
"sugar": {
"engl": 2,
"ger": 0
},
"brie": {
"engl": 3,
"ger": 0
},
"turkey": {
"engl": 3,
"ger": 3
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,94 @@
cap at 30 set to True
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.417429094236048
number of ingredients in dataset: 4313
number of nones: 2353
ingredients with over 30 substitutes: 231
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 4.939485277069325
cap at 30 set to False
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.440988106129917
number of ingredients in dataset: 4313
number of nones: 2353
ingredients with over 30 substitutes: 231
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 5.935311847901692
--------------------------------------------
ground truth only:
cap at 30 set to True
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.417429094236048
number of ingredients in dataset: 40
number of nones: 8
ingredients with over 30 substitutes: 5
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 11.95
cap at 30 set to False
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.440988106129917
number of ingredients in dataset: 40
number of nones: 8
ingredients with over 30 substitutes: 5
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 13.4
================================
englisch:
cap at 30 set to True
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.417429094236048
number of ingredients in dataset: 4361
number of nones: 0
ingredients with over 30 substitutes: 22
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 6.433616143086448
cap at 30 set to False
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.440988106129917
number of ingredients in dataset: 4361
number of nones: 0
ingredients with over 30 substitutes: 22
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 6.457234579224949
--------------------------------------------
ground truth only:
cap at 30 set to True
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.417429094236048
number of ingredients in dataset: 10
number of nones: 0
ingredients with over 30 substitutes: 0
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 2.7
cap at 30 set to False
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.440988106129917
number of ingredients in dataset: 10
number of nones: 0
ingredients with over 30 substitutes: 0
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 2.7

View File

@@ -0,0 +1,42 @@
{
"carrot": {
"engl": 0,
"ger": 1
},
"cherry": {
"engl": 3,
"ger": 4
},
"chicken": {
"engl": 1,
"ger": 5
},
"parsley": {
"engl": 3,
"ger": 3
},
"chocolate": {
"engl": 3,
"ger": 0
},
"bacon": {
"engl": 2,
"ger": 0
},
"kale": {
"engl": 2,
"ger": 0
},
"sugar": {
"engl": 2,
"ger": 0
},
"brie": {
"engl": 3,
"ger": 0
},
"turkey": {
"engl": 3,
"ger": 1
}
}

View File

@@ -0,0 +1,42 @@
{
"carrot": {
"engl": 0,
"ger": 0
},
"cherry": {
"engl": 3,
"ger": 2
},
"chicken": {
"engl": 1,
"ger": 0
},
"parsley": {
"engl": 3,
"ger": 0
},
"chocolate": {
"engl": 3,
"ger": 0
},
"bacon": {
"engl": 2,
"ger": 0
},
"kale": {
"engl": 2,
"ger": 0
},
"sugar": {
"engl": 2,
"ger": 0
},
"brie": {
"engl": 3,
"ger": 0
},
"turkey": {
"engl": 3,
"ger": 1
}
}

View File

@@ -0,0 +1,42 @@
{
"carrot": {
"engl": 0,
"ger": 0
},
"cherry": {
"engl": 3,
"ger": 0
},
"chicken": {
"engl": 1,
"ger": 0
},
"parsley": {
"engl": 3,
"ger": 0
},
"chocolate": {
"engl": 3,
"ger": 0
},
"bacon": {
"engl": 2,
"ger": 0
},
"kale": {
"engl": 2,
"ger": 0
},
"sugar": {
"engl": 2,
"ger": 0
},
"brie": {
"engl": 3,
"ger": 0
},
"turkey": {
"engl": 3,
"ger": 1
}
}

View File

@@ -0,0 +1,42 @@
{
"carrot": {
"engl": 0,
"ger": 0
},
"cherry": {
"engl": 3,
"ger": 0
},
"chicken": {
"engl": 1,
"ger": 0
},
"parsley": {
"engl": 3,
"ger": 0
},
"chocolate": {
"engl": 3,
"ger": 0
},
"bacon": {
"engl": 2,
"ger": 0
},
"kale": {
"engl": 2,
"ger": 0
},
"sugar": {
"engl": 2,
"ger": 0
},
"brie": {
"engl": 3,
"ger": 0
},
"turkey": {
"engl": 3,
"ger": 1
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,94 @@
cap at 30 set to True
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.417429094236048
number of ingredients in dataset: 4313
number of nones: 2996
ingredients with over 30 substitutes: 100
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 2.6978900996985855
cap at 30 set to False
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.440988106129917
number of ingredients in dataset: 4313
number of nones: 2996
ingredients with over 30 substitutes: 100
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 2.9476002782286113
--------------------------------------------
ground truth only:
cap at 30 set to True
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.417429094236048
number of ingredients in dataset: 40
number of nones: 14
ingredients with over 30 substitutes: 2
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 7.45
cap at 30 set to False
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.440988106129917
number of ingredients in dataset: 40
number of nones: 14
ingredients with over 30 substitutes: 2
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 7.65
================================
englisch:
cap at 30 set to True
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.417429094236048
number of ingredients in dataset: 4361
number of nones: 0
ingredients with over 30 substitutes: 22
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 6.433616143086448
cap at 30 set to False
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.440988106129917
number of ingredients in dataset: 4361
number of nones: 0
ingredients with over 30 substitutes: 22
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 6.457234579224949
--------------------------------------------
ground truth only:
cap at 30 set to True
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.417429094236048
number of ingredients in dataset: 10
number of nones: 0
ingredients with over 30 substitutes: 0
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 2.7
cap at 30 set to False
english ingredients with over 30 substitutes: 22
english nones: 11
average amount of substitutes found for english ingredients: 6.440988106129917
number of ingredients in dataset: 10
number of nones: 0
ingredients with over 30 substitutes: 0
ingredients with over 100 substitutes: 0
ingredients with over 1000 substitutes: 0
average number of substitutes: 2.7

View File

@@ -0,0 +1,207 @@
from transformers import BertTokenizer
import json
def print_stats(model_substitutes_dict, cap_at_30, english_total=4372):
    """Print substitute-count statistics for an ingredient->substitutes dict.

    First prints reference statistics for the English FoodBERT substitute
    pairs, then the statistics for ``model_substitutes_dict``.

    Args:
        model_substitutes_dict: mapping ingredient -> list of substitutes.
        cap_at_30: if True, an ingredient with more than 30 substitutes only
            contributes 30 to the average (outliers are capped).
        english_total: denominator for the English averages — the total number
            of English ingredients in the dataset, including those for which
            no substitute was found (defaults to the original hard-coded 4372).
    """
    print("\ncap at 30 set to " + str(cap_at_30))
    evaluation_path = "evaluation/"
    # Reference numbers: group the English (ingredient, substitute) pairs
    # into ingredient -> [substitutes].
    with open(evaluation_path + "engl_data/substitute_pairs_foodbert_text.json", "r") as whole_json_file:
        engl_list = json.load(whole_json_file)
    engl_dict = {}
    for pair in engl_list:
        engl_dict.setdefault(pair[0], []).append(pair[1])
    substitute_sum = 0
    over30 = 0
    for ingred in engl_dict:
        curr_nr = len(engl_dict[ingred])
        if curr_nr > 30:
            over30 += 1
        substitute_sum += min(curr_nr, 30) if cap_at_30 else curr_nr
    print("english ingredients with over 30 substitutes: " + str(over30))
    # Ingredients without any substitute do not appear in engl_dict at all.
    print("english nones: " + str(english_total - len(engl_dict.keys())))
    print("average amount of substitutes found for english ingredients: " + str(substitute_sum / english_total))
    # Statistics for the dict passed in by the caller.
    substitute_sum = 0
    over100 = 0
    over1000 = 0
    over30 = 0
    nones = 0
    for ingred in model_substitutes_dict:
        curr_nr = len(model_substitutes_dict[ingred])
        if curr_nr == 0:
            nones += 1
        if curr_nr > 100:
            over100 += 1
        if curr_nr > 1000:
            over1000 += 1
        if curr_nr > 30:
            over30 += 1
        substitute_sum += min(curr_nr, 30) if cap_at_30 else curr_nr
    print("number of ingredients in dataset: " + str(len(model_substitutes_dict.keys())))
    print("number of nones: " + str(nones))
    print("ingredients with over 30 substitutes: " + str(over30))
    print("ingredients with over 100 substitutes: " + str(over100))
    print("ingredients with over 1000 substitutes: " + str(over1000))
    print("average number of substitutes: " + str(substitute_sum / len(model_substitutes_dict.keys())))
def main():
    """Collapse synonym ingredients in the model's substitute predictions onto
    their base words, then print substitute statistics for the German model
    (all ingredients and ground-truth-only) and for the English baseline.
    """
    evaluation_path = "evaluation/"
    synonyms_path = "synonyms.json"
    data_path = "data/"
    engl_data_path = evaluation_path + "engl_data/"
    found_substitutes_path = "final_Versions/models/vers2/eval/complete_substitute_pairs_50.json"
    with open(found_substitutes_path, "r") as whole_json_file:
        model_substitutes_dict = json.load(whole_json_file)
    with open(data_path + synonyms_path, "r") as whole_json_file:
        synonyms_dict = json.load(whole_json_file)
    # Category entries whose "synonyms" are really sub-types (e.g. kinds of
    # meat), not interchangeable spellings — these must not be collapsed.
    category_subs = ["Paprika", "Apfel", "Gouda", "Huhn", "Gans", "Kaninchen", "Kalbfleisch", "Schweinefleisch", "Ente", "Lamm",
                     "Pute", "Wildfleisch", "Rindfleisch", "Speck", "Fisch", "Kohl", "Blattsalat", "Schokolade", "Kuvertüre", "Kuchenglasur",
                     "Honig", "Sirup", "Joghurt", "Essig", "Traubensaft", "Geflügelfleisch", "Wein", "Suppenfleisch"]
    # Invert the synonym dict: map every synonym to its base word.
    new_syn_dict = {}
    for ingred in synonyms_dict.keys():
        if ingred not in category_subs:
            for syn in synonyms_dict[ingred]:
                new_syn_dict[syn] = ingred
    # Initialise a result set for every ingredient that is itself a base word.
    final_dict = {}
    for ingred in model_substitutes_dict.keys():
        if ingred not in new_syn_dict.keys():
            final_dict[ingred] = set()
    # Collapse both the substitutes and the ingredient keys onto base words.
    for ingred in model_substitutes_dict.keys():
        curr_set = {new_syn_dict.get(sub, sub) for sub in model_substitutes_dict[ingred]}
        if ingred not in new_syn_dict:
            final_dict[ingred] |= curr_set
        else:
            base_word = new_syn_dict[ingred]
            # setdefault guards against a base word that never occurs as a key
            # of model_substitutes_dict (plain indexing would raise KeyError).
            final_dict.setdefault(base_word, set()).update(curr_set)
    # An ingredient must not list itself as its own substitute.
    for ingred in final_dict.keys():
        final_dict[ingred].discard(ingred)
    new_final_dict = {ingred: list(subs) for ingred, subs in final_dict.items()}
    # NOTE(review): the reload below discards the synonym-merged dict computed
    # above and evaluates the raw file contents instead — confirm this
    # override is intentional and not leftover debugging.
    with open(found_substitutes_path, "r") as whole_json_file:
        new_final_dict = json.load(whole_json_file)
    print_stats(new_final_dict, cap_at_30=True)
    print_stats(new_final_dict, cap_at_30=False)
    print("--------------------------------------------\nground truth only: ")
    with open("data/ground_truth.json", "r") as whole_json_file:
        ground_truth = json.load(whole_json_file)
    ground_truth_only = {}
    for ingred in new_final_dict.keys():
        if ingred in ground_truth.keys():
            ground_truth_only[ingred] = new_final_dict[ingred]
    print_stats(ground_truth_only, cap_at_30=True)
    print_stats(ground_truth_only, cap_at_30=False)
    print("================================\nenglisch:")
    with open(engl_data_path + "substitute_pairs_foodbert_text.json", "r") as whole_json_file:
        engl_list = json.load(whole_json_file)
    with open(engl_data_path + "engl_ground_truth.json", "r") as whole_json_file:
        engl_ground_truth = json.load(whole_json_file)
    # Group the English (ingredient, substitute) pairs by ingredient.
    engl_dict = {}
    for pair in engl_list:
        engl_dict.setdefault(pair[0], []).append(pair[1])
    print_stats(engl_dict, cap_at_30=True)
    print_stats(engl_dict, cap_at_30=False)
    print("--------------------------------------------\nground truth only: ")
    ground_truth_only = {}
    for ingred in engl_dict.keys():
        if ingred in engl_ground_truth.keys():
            ground_truth_only[ingred] = engl_dict[ingred]
    print_stats(ground_truth_only, cap_at_30=True)
    print_stats(ground_truth_only, cap_at_30=False)


if __name__ == "__main__":
    main()