initial commit of project
This commit is contained in:
0
evaluation/README.md
Normal file
0
evaluation/README.md
Normal file
29
evaluation/add_unused_ingredients.py
Normal file
29
evaluation/add_unused_ingredients.py
Normal file
@@ -0,0 +1,29 @@
|
||||
import json
|
||||
|
||||
def main(eval_path="final_Versions/models/vers2/eval/",
         file_name="substitute_pairs_65.json",
         data_path="data/",
         occurances_path="mult_ingredients_nice.json"):
    """Complete the model's substitute mapping with the unused ingredients.

    Loads the model's found substitute pairs and the full ingredient
    occurrence list; every ingredient the model produced no substitutes
    for gets an empty list. The completed mapping is written to
    ``eval_path + "complete_" + file_name``.

    Args:
        eval_path: directory containing the model's substitute file.
        file_name: name of the substitute-pairs JSON file.
        data_path: directory containing the occurrence file.
        occurances_path: name of the ingredient-occurrence JSON file
            (ingredient -> occurrence count).

    All defaults match the previously hard-coded paths, so existing
    behavior is unchanged when called with no arguments.
    """
    found_substitutes_path = eval_path + file_name
    with open(found_substitutes_path, "r") as whole_json_file:
        model_substitutes_dict = json.load(whole_json_file)

    with open(data_path + occurances_path, "r") as whole_json_file:
        occurrences_dict = json.load(whole_json_file)

    # Every known ingredient gets an entry; ones the model skipped get [].
    all_substitutes = {}
    for ingredient in occurrences_dict.keys():
        if ingredient not in model_substitutes_dict.keys():
            all_substitutes[ingredient] = []
            # print(ingredient)
        else:
            all_substitutes[ingredient] = model_substitutes_dict[ingredient]

    print(str(len(all_substitutes.keys())))
    out_path = eval_path + "complete_" + file_name
    with open(out_path, 'w') as f:
        json.dump(all_substitutes, f, ensure_ascii=False, indent=4)


if __name__ == "__main__":
    # Guarded so importing this module no longer triggers file I/O.
    main()
|
||||
110
evaluation/dataset_stats.py
Normal file
110
evaluation/dataset_stats.py
Normal file
@@ -0,0 +1,110 @@
|
||||
import json
|
||||
import statistics
|
||||
|
||||
|
||||
def dataset(full_dataset_path):
    """Print summary statistics for a recipe-dataset JSON file.

    The file is expected to map recipe URLs to dicts with
    ``'ingredients'``, ``'image'``, ``'comments'`` and ``'instructions'``
    entries. Prints a running recipe counter as a progress indicator,
    then counts/averages/medians for ingredients, pictures, comments
    and instructions.

    (The original collected an ``all_urls`` list with an O(n^2)
    membership test but never read it; that dead code is removed.)
    """
    with open(full_dataset_path, "r") as whole_json_file:
        full_dataset = json.load(whole_json_file)

    counter = 0
    ingredient_counter = 0
    ingredient_lengths = []
    pic_counter = 0
    comment_counter = 0
    no_comments = 0
    comment_lengths = []
    instruction_counter = 0
    instruction_lengths = []
    for url in full_dataset.keys():
        recipe = full_dataset[url]
        ingredient_counter += len(recipe['ingredients'])
        ingredient_lengths.append(len(recipe['ingredients']))
        # The default placeholder URL means the recipe has no photo.
        if recipe['image'] != "https://img.chefkoch-cdn.de/img/default/layout/recipe-nopicture.jpg":
            pic_counter += 1
        if recipe['comments']:
            comment_lengths.append(len(recipe['comments']))
            comment_counter += len(recipe['comments'])
        else:
            comment_lengths.append(0)
            no_comments += 1
        instruction_counter += len(recipe['instructions'])
        instruction_lengths.append(len(recipe['instructions']))
        counter += 1
        print(counter)  # progress indicator

    num_recipes = len(full_dataset.keys())
    print(f"number of recipes: {num_recipes}")
    print("\n")
    print(f"average ingredient count: {ingredient_counter / num_recipes}")
    print(f"median ingredient count: {statistics.median(ingredient_lengths)}")
    print("\n")
    print(f"number of recipes with picture: {pic_counter}")
    print("\n")
    print(f"number of comments: {comment_counter}")
    print(f"number of recipes withOUT comments: {no_comments}")
    print(f"average amount of comments: {comment_counter / num_recipes}")
    print(f"median comment count: {statistics.median(comment_lengths)}")
    print("\n")
    print(f"total instruction count: {instruction_counter}")
    print(f"average instruction count: {instruction_counter / num_recipes}")
    print(f"median instruction count: {statistics.median(instruction_lengths)}")
|
||||
|
||||
|
||||
def ingredients_before(full_dataset_path):
    """Count distinct ingredient names across all recipes and print the total.

    Prints a running recipe counter as a progress indicator, then the
    number of unique ingredients.

    Uses a set for deduplication; the original list-membership check
    made this O(n^2) over the whole ingredient stream.
    """
    counter = 0
    with open(full_dataset_path, "r") as whole_json_file:
        full_dataset = json.load(whole_json_file)
    all_ingredients = set()
    for url in full_dataset.keys():
        counter += 1
        print(counter)  # progress indicator
        all_ingredients.update(full_dataset[url]['ingredients'])
    print(str(len(all_ingredients)))
|
||||
|
||||
def ingredient_stats(ingredients_list_path="data/mult_ingredients_nice.json",
                     ingredients_instructions_path="data/cleaned_steps_occurrance.json"):
    """Print occurrence statistics for ingredients.

    Reports average/median occurrence counts for ingredients as they
    appear in recipe ingredient lists and in instruction texts, the
    number of ingredients that appear fewer than 5 times in the
    instructions ("nones"), and the instruction-occurrence dict sorted
    ascending by count.

    Args:
        ingredients_list_path: JSON mapping ingredient -> occurrence
            count in ingredient lists.
        ingredients_instructions_path: JSON mapping ingredient ->
            occurrence count in instruction texts.

    Defaults match the previously hard-coded paths, so calling with no
    arguments behaves exactly as before.
    """
    with open(ingredients_list_path, "r") as whole_json_file:
        ingred_lists = json.load(whole_json_file)

    with open(ingredients_instructions_path, "r") as whole_json_file:
        ingred_instruct = json.load(whole_json_file)

    print("in ingredient lists: ")
    ingred_counts = list(ingred_lists.values())
    ingred_sum = sum(ingred_counts)

    print("average: " + str(ingred_sum/len(ingred_lists.keys())))
    print("median: " + str(statistics.median(ingred_counts)))

    print("in instructions: ")
    instruct_counts = list(ingred_instruct.values())
    instruct_sum = sum(instruct_counts)
    # ingredients that almost never occur in the instruction text (< 5 hits)
    none_counts = sum(1 for count in instruct_counts if count < 5)

    print("average: " + str(instruct_sum / len(ingred_instruct.keys())))
    print("median: " + str(statistics.median(instruct_counts)))
    print("nones: " + str(none_counts))
    sorted_instruct = dict(sorted(ingred_instruct.items(), key=lambda item: item[1]))
    print(sorted_instruct)
|
||||
|
||||
|
||||
def main():
    """Entry point: run the ingredient occurrence statistics report.

    The dataset-level reports are kept (commented out) so they can be
    re-enabled by uncommenting the corresponding call.
    """
    before_dataset_path = "data/dataset_fin.json"
    full_dataset_path = "Versions/vers3/full_dataset.json"
    # dataset(full_dataset_path)
    # ingredients_before(before_dataset_path)
    ingredient_stats()


if __name__ == "__main__":
    # Guarded so importing this module no longer triggers file I/O.
    main()
|
||||
216
evaluation/engl_data/engl_ground_truth.json
Normal file
216
evaluation/engl_data/engl_ground_truth.json
Normal file
@@ -0,0 +1,216 @@
|
||||
{
|
||||
"carrot": [
|
||||
"parsnip",
|
||||
"daikon",
|
||||
"turnip",
|
||||
"celery",
|
||||
"squash",
|
||||
"celery root",
|
||||
"sweet potato",
|
||||
"yam",
|
||||
"radish",
|
||||
"potato",
|
||||
"pumpkin",
|
||||
"green papaya",
|
||||
"swede",
|
||||
"beet",
|
||||
"rutabaga",
|
||||
"red bell pepper",
|
||||
"yellow squash",
|
||||
"butternut squash",
|
||||
"root vegetable",
|
||||
"parsley root"
|
||||
],
|
||||
"cherry": [
|
||||
"acerola",
|
||||
"apricot",
|
||||
"plum",
|
||||
"nectarine",
|
||||
"raspberry",
|
||||
"grape",
|
||||
"strawberry",
|
||||
"currant",
|
||||
"blackberry",
|
||||
"frozen mixed berry",
|
||||
"peach",
|
||||
"cranberry",
|
||||
"dried cranberry",
|
||||
"blueberry",
|
||||
"maraschino",
|
||||
"berry",
|
||||
"prune"
|
||||
],
|
||||
"chicken": [
|
||||
"turkey",
|
||||
"rabbit",
|
||||
"oyster mushroom",
|
||||
"squab",
|
||||
"veal",
|
||||
"fish",
|
||||
"tofu",
|
||||
"beef",
|
||||
"extra firm tofu",
|
||||
"pork",
|
||||
"seitan",
|
||||
"duck",
|
||||
"capon",
|
||||
"lamb",
|
||||
"venison",
|
||||
"mushroom",
|
||||
"shrimp",
|
||||
"quail",
|
||||
"goose"
|
||||
],
|
||||
"parsley": [
|
||||
"chervil",
|
||||
"cilantro",
|
||||
"tarragon",
|
||||
"basil",
|
||||
"oregano",
|
||||
"chopped cilantro",
|
||||
"lovage",
|
||||
"dill",
|
||||
"fresh coriander",
|
||||
"coriander",
|
||||
"rosemary",
|
||||
"caper",
|
||||
"fresh cilantro",
|
||||
"fresh dill",
|
||||
"thyme",
|
||||
"fresh oregano",
|
||||
"chive",
|
||||
"mint",
|
||||
"fresh basil",
|
||||
"fresh thyme",
|
||||
"dried basil",
|
||||
"dried oregano",
|
||||
"fresh chive",
|
||||
"dried thyme"
|
||||
],
|
||||
"chocolate": [
|
||||
"truffle",
|
||||
"nutella",
|
||||
"ganache",
|
||||
"cocoa powder",
|
||||
"sugar",
|
||||
"jam",
|
||||
"marshmallow",
|
||||
"cocoa",
|
||||
"candy",
|
||||
"caramel",
|
||||
"peanut butter"
|
||||
],
|
||||
"bacon": [
|
||||
"pancetta",
|
||||
"prosciutto",
|
||||
"speck",
|
||||
"smoked sausage",
|
||||
"smoked ham",
|
||||
"parma ham",
|
||||
"ham",
|
||||
"salami",
|
||||
"pepperoni",
|
||||
"guanciale",
|
||||
"chorizo",
|
||||
"salt pork",
|
||||
"kielbasa",
|
||||
"pork rind",
|
||||
"cubed ham",
|
||||
"italian sausage",
|
||||
"crouton",
|
||||
"capicola",
|
||||
"hard salami",
|
||||
"lardon",
|
||||
"cooked ham",
|
||||
"corned beef",
|
||||
"bologna"
|
||||
],
|
||||
"kale": [
|
||||
"collard green",
|
||||
"turnip green",
|
||||
"spinach",
|
||||
"chinese cabbage",
|
||||
"leek",
|
||||
"escarole",
|
||||
"spring green",
|
||||
"chard",
|
||||
"green cabbage",
|
||||
"savoy cabbage",
|
||||
"cabbage",
|
||||
"cauliflower",
|
||||
"collard",
|
||||
"watercres",
|
||||
"arugula",
|
||||
"broccoli rabe",
|
||||
"spinach leaves",
|
||||
"lettuce",
|
||||
"romaine lettuce",
|
||||
"baby spinach",
|
||||
"mizuna"
|
||||
],
|
||||
"sugar": [
|
||||
"splenda",
|
||||
"honey",
|
||||
"stevia",
|
||||
"sweetener",
|
||||
"liquid stevia",
|
||||
"corn syrup",
|
||||
"splenda granular",
|
||||
"liquid sweetener",
|
||||
"brown rice syrup",
|
||||
"turbinado",
|
||||
"maple syrup",
|
||||
"pure maple syrup",
|
||||
"jaggery",
|
||||
"sweetened condensed milk",
|
||||
"artificial sweetener",
|
||||
"agave nectar",
|
||||
"sweet chocolate",
|
||||
"chocolate",
|
||||
"caramel",
|
||||
"vanilla",
|
||||
"molasse",
|
||||
"golden syrup",
|
||||
"syrup"
|
||||
],
|
||||
"brie": [
|
||||
"camembert",
|
||||
"reblochon",
|
||||
"gorgonzola",
|
||||
"cheese spread",
|
||||
"cheddar",
|
||||
"goat cheese",
|
||||
"havarti",
|
||||
"boursin",
|
||||
"blue cheese",
|
||||
"roquefort",
|
||||
"monterey jack",
|
||||
"gouda",
|
||||
"fontina",
|
||||
"provolone cheese",
|
||||
"stilton",
|
||||
"feta",
|
||||
"processed cheese"
|
||||
],
|
||||
"turkey": [
|
||||
"chicken",
|
||||
"rabbit",
|
||||
"duck",
|
||||
"ham",
|
||||
"pheasant",
|
||||
"goose",
|
||||
"capon",
|
||||
"beef",
|
||||
"venison",
|
||||
"lamb",
|
||||
"pork",
|
||||
"hen",
|
||||
"roast beef",
|
||||
"veal",
|
||||
"poultry",
|
||||
"chicken breast",
|
||||
"chicken thigh",
|
||||
"quail",
|
||||
"pork chop"
|
||||
]
|
||||
}
|
||||
1
evaluation/engl_data/engl_multimodal_old.json
Normal file
1
evaluation/engl_data/engl_multimodal_old.json
Normal file
File diff suppressed because one or more lines are too long
113542
evaluation/engl_data/engl_substitutes.json
Normal file
113542
evaluation/engl_data/engl_substitutes.json
Normal file
File diff suppressed because it is too large
Load Diff
14
evaluation/engl_data/engl_synonyms.json
Normal file
14
evaluation/engl_data/engl_synonyms.json
Normal file
@@ -0,0 +1,14 @@
|
||||
{
|
||||
"squash": ["pumpkin"],
|
||||
"sweet potato": ["yam"],
|
||||
"cilantro": ["coriander"],
|
||||
"fresh cilantro": ["chopped cilantro", "fresh coriander"],
|
||||
"dill": ["fresh dill"],
|
||||
"oregano": ["fresh oregano"],
|
||||
"basil": ["fresh basil"],
|
||||
"thyme": ["fresh thyme"],
|
||||
"chive": ["fresh chive"],
|
||||
"cabbage":["collard"],
|
||||
"maple syrup": ["pure maple syrup"],
|
||||
"sweetener": ["artificial sweetener"]
|
||||
}
|
||||
203
evaluation/engl_data/revised_engl_ground_truth.json
Normal file
203
evaluation/engl_data/revised_engl_ground_truth.json
Normal file
@@ -0,0 +1,203 @@
|
||||
{
|
||||
"carrot": [
|
||||
"parsnip",
|
||||
"daikon",
|
||||
"turnip",
|
||||
"celery",
|
||||
"squash",
|
||||
"celery root",
|
||||
"sweet potato",
|
||||
"radish",
|
||||
"potato",
|
||||
"green papaya",
|
||||
"swede",
|
||||
"beet",
|
||||
"rutabaga",
|
||||
"red bell pepper",
|
||||
"yellow squash",
|
||||
"butternut squash",
|
||||
"root vegetable",
|
||||
"parsley root"
|
||||
],
|
||||
"cherry": [
|
||||
"acerola",
|
||||
"apricot",
|
||||
"plum",
|
||||
"nectarine",
|
||||
"raspberry",
|
||||
"grape",
|
||||
"strawberry",
|
||||
"currant",
|
||||
"blackberry",
|
||||
"frozen mixed berry",
|
||||
"peach",
|
||||
"cranberry",
|
||||
"dried cranberry",
|
||||
"blueberry",
|
||||
"maraschino",
|
||||
"berry",
|
||||
"prune"
|
||||
],
|
||||
"chicken": [
|
||||
"turkey",
|
||||
"rabbit",
|
||||
"oyster mushroom",
|
||||
"squab",
|
||||
"veal",
|
||||
"fish",
|
||||
"tofu",
|
||||
"beef",
|
||||
"extra firm tofu",
|
||||
"pork",
|
||||
"seitan",
|
||||
"duck",
|
||||
"capon",
|
||||
"lamb",
|
||||
"venison",
|
||||
"mushroom",
|
||||
"shrimp",
|
||||
"quail",
|
||||
"goose"
|
||||
],
|
||||
"parsley": [
|
||||
"chervil",
|
||||
"cilantro",
|
||||
"tarragon",
|
||||
"basil",
|
||||
"oregano",
|
||||
"lovage",
|
||||
"dill",
|
||||
"rosemary",
|
||||
"caper",
|
||||
"fresh cilantro",
|
||||
"thyme",
|
||||
"chive",
|
||||
"mint",
|
||||
"dried basil",
|
||||
"dried oregano",
|
||||
"dried thyme"
|
||||
],
|
||||
"chocolate": [
|
||||
"truffle",
|
||||
"nutella",
|
||||
"ganache",
|
||||
"cocoa powder",
|
||||
"sugar",
|
||||
"jam",
|
||||
"marshmallow",
|
||||
"cocoa",
|
||||
"candy",
|
||||
"caramel",
|
||||
"peanut butter"
|
||||
],
|
||||
"bacon": [
|
||||
"pancetta",
|
||||
"prosciutto",
|
||||
"speck",
|
||||
"smoked sausage",
|
||||
"smoked ham",
|
||||
"parma ham",
|
||||
"ham",
|
||||
"salami",
|
||||
"pepperoni",
|
||||
"guanciale",
|
||||
"chorizo",
|
||||
"salt pork",
|
||||
"kielbasa",
|
||||
"pork rind",
|
||||
"cubed ham",
|
||||
"italian sausage",
|
||||
"crouton",
|
||||
"capicola",
|
||||
"hard salami",
|
||||
"lardon",
|
||||
"cooked ham",
|
||||
"corned beef",
|
||||
"bologna"
|
||||
],
|
||||
"kale": [
|
||||
"collard green",
|
||||
"turnip green",
|
||||
"spinach",
|
||||
"chinese cabbage",
|
||||
"leek",
|
||||
"escarole",
|
||||
"spring green",
|
||||
"chard",
|
||||
"green cabbage",
|
||||
"savoy cabbage",
|
||||
"cabbage",
|
||||
"cauliflower",
|
||||
"watercres",
|
||||
"arugula",
|
||||
"broccoli rabe",
|
||||
"spinach leaves",
|
||||
"lettuce",
|
||||
"romaine lettuce",
|
||||
"baby spinach",
|
||||
"mizuna"
|
||||
],
|
||||
"sugar": [
|
||||
"splenda",
|
||||
"honey",
|
||||
"stevia",
|
||||
"sweetener",
|
||||
"liquid stevia",
|
||||
"corn syrup",
|
||||
"splenda granular",
|
||||
"liquid sweetener",
|
||||
"brown rice syrup",
|
||||
"turbinado",
|
||||
"maple syrup",
|
||||
"jaggery",
|
||||
"sweetened condensed milk",
|
||||
"agave nectar",
|
||||
"sweet chocolate",
|
||||
"chocolate",
|
||||
"caramel",
|
||||
"vanilla",
|
||||
"molasse",
|
||||
"golden syrup",
|
||||
"syrup"
|
||||
],
|
||||
"brie": [
|
||||
"camembert",
|
||||
"reblochon",
|
||||
"gorgonzola",
|
||||
"cheese spread",
|
||||
"cheddar",
|
||||
"goat cheese",
|
||||
"havarti",
|
||||
"boursin",
|
||||
"blue cheese",
|
||||
"roquefort",
|
||||
"monterey jack",
|
||||
"gouda",
|
||||
"fontina",
|
||||
"provolone cheese",
|
||||
"stilton",
|
||||
"feta",
|
||||
"processed cheese"
|
||||
],
|
||||
"turkey": [
|
||||
"chicken",
|
||||
"rabbit",
|
||||
"duck",
|
||||
"ham",
|
||||
"pheasant",
|
||||
"goose",
|
||||
"capon",
|
||||
"beef",
|
||||
"venison",
|
||||
"lamb",
|
||||
"pork",
|
||||
"hen",
|
||||
"roast beef",
|
||||
"veal",
|
||||
"poultry",
|
||||
"chicken breast",
|
||||
"chicken thigh",
|
||||
"quail",
|
||||
"pork chop"
|
||||
]
|
||||
}
|
||||
1
evaluation/engl_data/substitute_pairs_foodbert_text.json
Normal file
1
evaluation/engl_data/substitute_pairs_foodbert_text.json
Normal file
File diff suppressed because one or more lines are too long
168
evaluation/engl_data/translation.json
Normal file
168
evaluation/engl_data/translation.json
Normal file
@@ -0,0 +1,168 @@
|
||||
{
|
||||
"carrot": "Karotte",
|
||||
"parsnip": "Pastinake",
|
||||
"daikon": "Rettich",
|
||||
"turnip": "Steckrübe",
|
||||
"celery": "Staudensellerie",
|
||||
"squash": "Kürbis",
|
||||
"sweet potato": "Süßkartoffel",
|
||||
"yam": "Süßkartoffel",
|
||||
"radish": "Radieschen",
|
||||
"potato": "Kartoffel",
|
||||
"pumpkin": "Kürbis",
|
||||
"beet": "Rübe",
|
||||
"red bell pepper": "Paprika_rot",
|
||||
"butternut squash": "Butternusskürbis",
|
||||
"parsley root": "Petersilienwurzel",
|
||||
"cherry": "Kirsche",
|
||||
"apricot": "Aprikose",
|
||||
"plum": "Pflaume",
|
||||
"nectarine": "Nektarine",
|
||||
"raspberry": "Himbeeren",
|
||||
"grape": "Weintrauben",
|
||||
"strawberry": "Erdbeeren",
|
||||
"currant": "Johannisbeeren",
|
||||
"blackberry": "Brombeeren",
|
||||
"frozen mixed berry": "Beeren_gemischte",
|
||||
"peach": "Pfirsich",
|
||||
"cranberry": "Cranberries",
|
||||
"dried cranberry": "Cranberries_getrocknet",
|
||||
"blueberry": "Blaubeeren",
|
||||
"maraschino": "Maraschino",
|
||||
"berry": "Beeren",
|
||||
"prune": "Trockenpflaumen",
|
||||
"chicken": "Huhn",
|
||||
"turkey": "Truthahn",
|
||||
"rabbit": "Kaninchen",
|
||||
"oyster mushroom": "Austernpilze",
|
||||
"veal": "Kalbfleisch",
|
||||
"fish": "Fisch",
|
||||
"tofu": "Tofu",
|
||||
"beef": "Rindfleisch",
|
||||
"extra firm tofu": "Tofu_fester",
|
||||
"pork": "Schweinefleisch",
|
||||
"seitan": "Seitan",
|
||||
"duck": "Ente",
|
||||
"lamb": "Lamm",
|
||||
"venison": "Wildfleisch",
|
||||
"mushroom": "Pilze",
|
||||
"shrimp": "Shrimps",
|
||||
"quail": "Wachtel",
|
||||
"goose": "Gans",
|
||||
"parsley": "Petersilie",
|
||||
"chervil": "Kerbel",
|
||||
"cilantro": "Koriander",
|
||||
"tarragon": "Estragon",
|
||||
"basil": "Basilikum",
|
||||
"oregano": "Oregano",
|
||||
"chopped cilantro": "Koriandergrün",
|
||||
"lovage": "Liebstöckel",
|
||||
"dill": "Dill",
|
||||
"fresh coriander": "Koriandergrün",
|
||||
"coriander": "Koriander",
|
||||
"rosemary": "Rosmarin",
|
||||
"caper": "Kapern",
|
||||
"fresh cilantro": "Koriandergrün",
|
||||
"fresh dill": "Dill",
|
||||
"thyme": "Thymian",
|
||||
"fresh oregano": "Oregano",
|
||||
"chive": "Schnittlauch",
|
||||
"mint": "Minze",
|
||||
"fresh basil": "Basilikum",
|
||||
"fresh thyme": "Thymian",
|
||||
"dried basil": "Basilikum_getrockneter",
|
||||
"dried oregano": "Oregano_getrocknet",
|
||||
"fresh chive": "Schnittlauch",
|
||||
"dried thyme": "Thymian_getrocknet",
|
||||
"chocolate": "Schokolade",
|
||||
"nutella": "Nutella",
|
||||
"cocoa powder": "Kakaopulver_Instant",
|
||||
"sugar": "Zucker",
|
||||
"jam": "Marmelade",
|
||||
"marshmallow": "Marshmallow",
|
||||
"cocoa": "Kakao",
|
||||
"candy": "Süßigkeiten",
|
||||
"peanut butter": "Erdnussbutter",
|
||||
"bacon": "Frühstücksspeck",
|
||||
"pancetta": "Pancetta",
|
||||
"prosciutto": "Schinken_Prosciutto",
|
||||
"speck": "Speck",
|
||||
"smoked ham": "Schinken_rohen",
|
||||
"parma ham": "Parmaschinken",
|
||||
"ham": "Kochschinken",
|
||||
"salami": "Salami",
|
||||
"chorizo": "Chorizo",
|
||||
"kielbasa": "Wurst_Krakauer",
|
||||
"pork rind": "Schweineschwarte",
|
||||
"cubed ham": "Schinkenwürfel",
|
||||
"crouton": "Croûtons",
|
||||
"lardon": "Speckwürfel",
|
||||
"cooked ham": "Kochschinken",
|
||||
"corned beef": "Corned_Beef",
|
||||
"bologna": "Wurst_Mortadella",
|
||||
"kale": "Grünkohl",
|
||||
"spinach": "Spinat",
|
||||
"chinese cabbage": "Chinakohl",
|
||||
"leek": "Lauch",
|
||||
"escarole": "Endiviensalat",
|
||||
"chard": "Mangold",
|
||||
"savoy cabbage": "Wirsing",
|
||||
"cabbage": "Kohl",
|
||||
"cauliflower": "Blumenkohl",
|
||||
"collard": "Kohl",
|
||||
"watercres": "Brunnenkresse",
|
||||
"arugula": "Rucola",
|
||||
"spinach leaves": "Blattspinat",
|
||||
"lettuce": "Kopfsalat",
|
||||
"romaine lettuce": "Römersalat",
|
||||
"baby spinach": "Babyspinat",
|
||||
"sugar": "Zucker",
|
||||
"honey": "Honig",
|
||||
"stevia": "Stevia",
|
||||
"sweetener": "Süßstoff",
|
||||
"liquid stevia": "Stevia_flüssig",
|
||||
"liquid sweetener": "Süßstoff_flüssigen",
|
||||
"brown rice syrup": "Reissirup",
|
||||
"maple syrup": "Ahornsirup",
|
||||
"pure maple syrup": "Ahornsirup",
|
||||
"sweetened condensed milk": "Kondensmilch_gezuckerte",
|
||||
"artificial sweetener": "Süßstoff",
|
||||
"agave nectar": "Agavendicksaft",
|
||||
"chocolate": "Schokolade",
|
||||
"vanilla": "Vanille",
|
||||
"molasse": "Melasse",
|
||||
"golden syrup": "Zuckerrübensirup",
|
||||
"syrup": "Sirup",
|
||||
"brie": "Brie",
|
||||
"camembert": "Camembert",
|
||||
"gorgonzola": "Gorgonzola",
|
||||
"cheese spread": "Schmelzkäse",
|
||||
"cheddar": "Cheddarkäse",
|
||||
"goat cheese": "Ziegenkäse",
|
||||
"boursin": "Doppelrahmfrischkäse",
|
||||
"blue cheese": "Blauschimmelkäse",
|
||||
"roquefort": "Roquefort",
|
||||
"gouda": "Gouda",
|
||||
"fontina": "Käse_Fontina",
|
||||
"provolone cheese": "Käse_Provolone",
|
||||
"feta": "Feta_Käse",
|
||||
"processed cheese": "Scheiblettenkäse",
|
||||
"turkey": "Truthahn",
|
||||
"chicken": "Huhn",
|
||||
"rabbit": "Kaninchen",
|
||||
"duck": "Ente",
|
||||
"ham": "Schinken",
|
||||
"pheasant": "Fasan",
|
||||
"goose": "Gans",
|
||||
"beef": "Rindfleisch",
|
||||
"venison": "Wildfleisch",
|
||||
"lamb": "Lammfleisch",
|
||||
"pork": "Schweinefleisch",
|
||||
"roast beef": "Roastbeef",
|
||||
"veal": "Kalbfleisch",
|
||||
"poultry": "Geflügelfleisch",
|
||||
"chicken breast": "Hähnchenfilet",
|
||||
"chicken thigh": "Hühnerkeule",
|
||||
"quail": "Wachtel",
|
||||
"pork chop": "schweinekotelett"
|
||||
}
|
||||
523
evaluation/evaluate.py
Normal file
523
evaluation/evaluate.py
Normal file
@@ -0,0 +1,523 @@
|
||||
import json
|
||||
import statistics
|
||||
|
||||
# Input locations, relative to the repository root.
data_path = "data/"
occurances_path = "mult_ingredients_nice.json"  # ingredient -> occurrence count
ground_truth_path = "ground_truth.json"
engl_data_path = "evaluation/engl_data/"


evaluation_path = "evaluation/"
synonyms_path = "synonyms.json"

# Model output under evaluation: ingredient -> ranked list of substitutes.
found_substitutes_path = "final_Versions/models/vers2/eval/complete_substitute_pairs_50.json"
# model_name = "Versions/vers3/"

# Hand-built German ground truth: base ingredient -> accepted substitutes.
# NOTE(review): translation.json maps "strawberry" to "Erdbeeren", but the
# "Kirsche" entry below lists "Erdbeere" (singular) -- confirm which token the
# dataset actually uses.
german_ground_truth = {
    "Karotte": ["Pastinake", "Steckrübe", "Staudensellerie", "Kürbis", "Süßkartoffel", "Rettich", "Radieschen", "Kartoffel", "Paprika_rot", "Butternusskürbis", "Petersilienwurzel"],
    "Kirsche": ["Aprikose", "Pflaume", "Nektarine", "Himbeeren", "Weintrauben", "Erdbeere", "Johannisbeeren", "Brombeeren", "Beeren_gemischte", "Pfirsich", "Cranberries", "Cranberries_getrocknet", "Blaubeeren", "Maraschino", "Beeren", "Trockenpflaumen"],
    "Huhn": ["Truthahn", "Kaninchen", "Austernpilze", "Kalbfleisch", "Fisch", "Tofu", "Rindfleisch", "Tofu_fester", "Schweinefleisch", "Seitan", "Ente", "Lamm", "Pilze", "Shrimps", "Wachtel", "Gans", "Wildfleisch"],
    "Petersilie": ["Kerbel", "Koriander", "Estragon", "Basilikum", "Oregano", "Liebstöckel", "Dill", "Koriandergrün", "Rosmarin", "Kapern", "Thymian", "Schnittlauch", "Minze", "Basilikum_getrockneter", "Oregano_getrocknet", "Thymian_getrocknet"],
    "Schokolade": ["Nutella", "Kakaopulver_Instant", "Zucker", "Marmelade", "Marshmallow", "Kakao", "Süßigkeiten", "Erdnussbutter"],
    "Frühstücksspeck": ["Pancetta", "Schinken_Prosciutto", "Speck", "Schinken_rohen", "Parmaschinken", "Schinken", "Salami", "Chorizo", "Wurst_Krakauer", "Schweineschwarte", "Schinkenwürfel", "Croûtons", "Speckwürfel", "Kochschinken", "Corned_Beef", "Wurst_Mortadella"],
    "Grünkohl": ["Spinat", "Chinakohl", "Lauch", "Endiviensalat", "Mangold", "Wirsing", "Kohl", "Blumenkohl", "Brunnenkresse", "Rucola", "Blattspinat", "Kopfsalat", "Römersalat", "Babyspinat"],
    "Zucker": ["Honig", "Stevia", "Süßstoff", "Stevia_flüssig", "Süßstoff_flüssigen", "Reissirup", "Ahornsirup", "Kondensmilch_gezuckerte", "Agavendicksaft", "Schokolade", "Vanille", "Melasse", "Zuckerrübensirup", "Sirup"],
    "Brie": ["Camembert", "Gorgonzola", "Schmelzkäse", "Cheddarkäse", "Ziegenkäse", "Doppelrahmfrischkäse", "Blauschimmelkäse", "Roquefort", "Gouda", "Käse_Fontina", "Käse_Provolone", "Feta_Käse", "Scheiblettenkäse"],
    "Truthahn": ["Huhn", "Kaninchen", "Ente", "Kochschinken", "Fasan", "Gans", "Rindfleisch", "Lammfleisch", "Schweinefleisch", "Roastbeef", "Kalbfleisch", "Geflügelfleisch", "Hähnchenfilet", "Hühnerkeule", "Wachtel", "schweinekotelett", "Wildfleisch"]
}
|
||||
|
||||
|
||||
def no_synonyms(ground_truth_dict=None, found_substitutes_dict=None, get_occurrences=True, synonyms=True):
    """Score model substitutes against the ground truth, base ingredient taken literally.

    The base ingredient is matched without synonym expansion, but a predicted
    substitute also counts as correct when it is a synonym of a ground-truth
    substitute.

    Args:
        ground_truth_dict: base ingredient -> accepted substitutes; loaded
            from ``data_path + ground_truth_path`` when None.
        found_substitutes_dict: base ingredient -> ranked model substitutes;
            loaded from ``found_substitutes_path`` when None.
        get_occurrences: when True, load per-ingredient occurrence counts
            (only consumed by the commented-out report lines below).
        synonyms: when True, load the synonym dictionary; otherwise no
            synonym matching takes place.

    Returns:
        dict mapping each base ingredient to its correctly found substitutes.
        Precision/recall summaries are printed as a side effect.
    """
    if get_occurrences:
        with open(data_path + occurances_path, "r") as whole_json_file:
            occurrences_dict = json.load(whole_json_file)

    if not ground_truth_dict:
        with open(data_path+ground_truth_path, "r") as whole_json_file:
            ground_truth_dict = json.load(whole_json_file)
    if synonyms:
        with open(data_path + synonyms_path, "r") as whole_json_file:
            synonyms_dict = json.load(whole_json_file)
    else:
        synonyms_dict = {}

    if not found_substitutes_dict:
        with open(found_substitutes_path, "r") as whole_json_file:
            model_substitutes_dict = json.load(whole_json_file)
    else:
        model_substitutes_dict = found_substitutes_dict

    found_ground_ingr = {}
    correctly_found = 0
    incorrectly_found = 0
    average_precision = 0.0
    average_recall = 0.0
    number_correct_subs_found_overall = []
    total_number_subs_found_overall = []
    # base ingredient without synonyms, substitutes with synonyms
    for base_ingred in ground_truth_dict.keys():
        if get_occurrences:
            occurrences = occurrences_dict[base_ingred]
        # copy so the removals below do not mutate the caller's dict
        found_substitutes = model_substitutes_dict[base_ingred].copy()

        # if len(found_substitutes) > 30:
        #     found_substitutes = found_substitutes[:30]

        found = []
        # remove synonyms of base ingredient
        new_found_substitutes = []
        for subst in found_substitutes:
            if base_ingred in synonyms_dict.keys():
                if subst not in synonyms_dict[base_ingred]:
                    new_found_substitutes.append(subst)
            else:
                new_found_substitutes.append(subst)
        found_substitutes = new_found_substitutes

        # check which substitutes were found
        for subst in ground_truth_dict[base_ingred]:
            # only add substitute if not already added
            if subst in found_substitutes and subst not in found:
                found.append(subst)
                # remove the hit so the leftovers are the incorrect predictions
                found_substitutes.remove(subst)

            # check if synonyms of substitute were found
            # check if ingredient has synonyms
            if subst in synonyms_dict.keys():
                for synon in synonyms_dict[subst]:
                    if synon in found_substitutes:
                        if synon not in found and subst not in found:
                            # credit the canonical substitute, not its synonym
                            found.append(subst)
                            found_substitutes.remove(synon)
        # if base_ingred == "Erdbeere":
        # NOTE(review): leftover debug output -- prints the incorrectly
        # found substitutes for every base ingredient.
        print(base_ingred + ": " + str(found_substitutes))
        found_ground_ingr[base_ingred] = found
        # print(base_ingred + ": ")
        # if get_occurrences:
        #     print("occurrences in dataset: " + str(occurrences))
        # print("number of found substitutes: " + str(len(found)) + "/" + str(len(ground_truth_dict[base_ingred])))
        # print("correctly found substitutes: " + str(len(found)) + "/" + str(len(found) + len(found_substitutes)))
        # print("correctly found substitutes: " + str(found))
        # print("incorrectly found substitutes: " + str(found_substitutes))
        # print("-----------------------------\n")
        if len(found) > 0:
            average_precision += len(found)/(len(found) + len(found_substitutes))
            # print(len(found))
            average_recall += len(found)/len(ground_truth_dict[base_ingred])
        correctly_found += len(found)
        incorrectly_found += len(found_substitutes)
        number_correct_subs_found_overall.append(len(found))
        total_number_subs_found_overall.append(len(found) + len(found_substitutes))

    # NOTE(review): denominator 40 is hard-coded rather than
    # len(ground_truth_dict) -- confirm it matches the intended number of
    # evaluated ingredients.
    print("average precision: " + str(average_precision/40))
    print("average recall: " + str(average_recall/40))
    print("median number of correctly found subs: " + str(statistics.median(number_correct_subs_found_overall)))
    print("median number of found subs overall: " + str(statistics.median(total_number_subs_found_overall)))
    return found_ground_ingr
|
||||
|
||||
|
||||
def merge_lists(all_lists):
    """Round-robin merge of ranked lists into one deduplicated list.

    Takes the element at index 0 from every list, then index 1, and so
    on, appending each value only the first time it is seen -- so the
    highest-ranked entries from every input list come first.

    Args:
        all_lists: list of lists, possibly of different lengths.

    Returns:
        A single list without duplicates, in round-robin rank order.

    (The original also computed the minimum list length but never used
    it; that dead code is removed.)
    """
    output = []
    max_len = max((len(lst) for lst in all_lists), default=0)
    for rank in range(max_len):
        for curr_list in all_lists:
            if rank < len(curr_list) and curr_list[rank] not in output:
                output.append(curr_list[rank])
    return output
|
||||
|
||||
|
||||
def with_synonyms(ground_truth_dict=None, found_substitutes_dict=None, get_occurrences=True, synonyms=True):
    """Score model substitutes with synonym expansion on the base ingredient.

    Like ``no_synonyms``, but the candidate pool is the round-robin merge
    (``merge_lists``) of the model's substitutes for the base ingredient AND
    all of its synonyms, truncated to the top 30 entries.

    Args:
        ground_truth_dict: base ingredient -> accepted substitutes; loaded
            from ``data_path + ground_truth_path`` when None.
        found_substitutes_dict: base ingredient -> ranked model substitutes;
            loaded from ``found_substitutes_path`` when None.
        get_occurrences: when True, load per-ingredient occurrence counts
            (only consumed by the commented-out report lines below).
        synonyms: when True, load the synonym dictionary; otherwise no
            synonym expansion takes place.

    Returns:
        dict mapping each base ingredient to its correctly found substitutes.
        Precision/recall summaries are printed as a side effect.
    """
    if get_occurrences:
        with open(data_path + occurances_path, "r") as whole_json_file:
            occurrences_dict = json.load(whole_json_file)

    if not ground_truth_dict:
        with open(data_path+ground_truth_path, "r") as whole_json_file:
            ground_truth_dict = json.load(whole_json_file)

    if synonyms:
        with open(data_path + synonyms_path, "r") as whole_json_file:
            synonyms_dict = json.load(whole_json_file)
    else:
        synonyms_dict = {}

    if not found_substitutes_dict:
        with open(found_substitutes_path, "r") as whole_json_file:
            model_substitutes_dict = json.load(whole_json_file)
    else:
        model_substitutes_dict = found_substitutes_dict

    correctly_found = 0
    incorrectly_found = 0
    average_precision = 0.0
    average_recall = 0.0
    number_correct_subs_found_overall = []
    total_number_subs_found_overall = []

    found_ground_ingr = {}
    # base ingredient with synonyms, substitutes with synonyms
    for base_ingred in ground_truth_dict.keys():
        base_synonyms = [base_ingred]
        if get_occurrences:
            occurrences = 0
        # get list of all synonyms of base ingredient
        if base_ingred in synonyms_dict.keys():
            # NOTE(review): this rebinds the ``synonyms`` parameter; harmless
            # here because synonyms_dict was already loaded above, but the
            # original boolean value is lost from this point on.
            synonyms = synonyms_dict[base_ingred]
            base_synonyms = base_synonyms + synonyms
            found_substitutes = []
            all_substitutes = []
            # get top 30 substitutes of each base synonym
            for synon in base_synonyms:
                if get_occurrences:
                    occurrences += occurrences_dict[synon]
                all_substitutes.append(model_substitutes_dict[synon].copy())
                # synon_subs = model_substitutes_dict[synon].copy()
                # if len(synon_subs) > 30:
                #     synon_subs = synon_subs[:30]
                # for sub in synon_subs:
                #     if sub not in found_substitutes:
                #         found_substitutes.append(sub)
            found_substitutes = merge_lists(all_substitutes)
        else:
            found_substitutes = model_substitutes_dict[base_ingred].copy()

        # keep only the top 30 ranked candidates
        if len(found_substitutes) > 30:
            found_substitutes = found_substitutes[:30]

        found = []

        # remove all base synonyms from found substitutes
        new_found_substitutes = []
        for subst in found_substitutes:
            if subst not in base_synonyms:
                new_found_substitutes.append(subst)
        found_substitutes = new_found_substitutes

        # check which substitutes were found
        for subst in ground_truth_dict[base_ingred]:
            # only add substitute if not already added
            if subst in found_substitutes and subst not in found:
                found.append(subst)
                # remove the hit so the leftovers are the incorrect predictions
                found_substitutes.remove(subst)

            # check if synonyms of substitute were found
            # check if ingredient has synonyms
            if subst in synonyms_dict.keys():
                for synon in synonyms_dict[subst]:
                    if synon in found_substitutes:
                        if synon not in found and subst not in found:
                            # credit the canonical substitute, not its synonym
                            found.append(subst)
                            found_substitutes.remove(synon)

        found_ground_ingr[base_ingred] = found
        # print(base_ingred + ": ")
        # if get_occurrences:
        #     print("occurrences in dataset: " + str(occurrences))
        # print("number of synonyms incl. original word: " + str(len(base_synonyms)))
        # print("number of found substitutes: " + str(len(found)) + "/" + str(len(ground_truth_dict[base_ingred])))
        # print("correctly found substitutes: " + str(len(found)) + "/" + str(len(found) + len(found_substitutes)))
        # print("correctly found substitutes: " + str(found))
        # print("incorrectly found substitutes: " + str(found_substitutes))
        # print("-----------------------------\n")

        if len(found) > 0:
            average_precision += len(found) / (len(found) + len(found_substitutes))
            average_recall += len(found) / len(ground_truth_dict[base_ingred])
        correctly_found += len(found)
        incorrectly_found += len(found_substitutes)
        number_correct_subs_found_overall.append(len(found))
        total_number_subs_found_overall.append(len(found) + len(found_substitutes))

    # NOTE(review): denominator 40 is hard-coded rather than
    # len(ground_truth_dict) -- confirm it matches the intended number of
    # evaluated ingredients.
    print("average precision: " + str(average_precision / 40))
    print("average recall: " + str(average_recall / 40))
    print("median number of correctly found subs: " + str(statistics.median(number_correct_subs_found_overall)))
    print("median number of found subs overall: " + str(statistics.median(total_number_subs_found_overall)))

    return found_ground_ingr
|
||||
|
||||
|
||||
def translate_engl_ground_truth(ground_truth, ger_transl):
    """Translate an English ground-truth mapping into German.

    Every base ingredient is looked up in the translation table (a missing
    base ingredient raises KeyError, as before); substitutes without a
    German translation are silently dropped.
    """
    translated = {}
    for engl_base, engl_subs in ground_truth.items():
        translated[ger_transl[engl_base]] = [
            ger_transl[sub] for sub in engl_subs if sub in ger_transl
        ]
    return translated
|
||||
|
||||
|
||||
def with_base_synonyms(ground_truth_dict=None, found_substitutes_dict=None, get_occurrences=True, synonyms=True):
    """Evaluate found substitutes while merging synonyms of the BASE ingredient only.

    For every ground-truth base ingredient, the substitute lists of the base
    word and all of its synonyms are merged (via the module-level merge_lists
    helper), truncated to the top 30, and compared against the ground truth.
    Per-ingredient statistics are printed; returns {base_ingredient: [correctly
    found substitutes]}.

    Relies on module-level path variables (data_path, occurances_path,
    ground_truth_path, synonyms_path, found_substitutes_path) and merge_lists,
    all defined elsewhere in this file.

    Args:
        ground_truth_dict: base ingredient -> list of valid substitutes;
            loaded from disk when falsy.
        found_substitutes_dict: ingredient -> list of model-found substitutes;
            loaded from disk when falsy.
        get_occurrences: also sum dataset occurrence counts over the base
            synonyms and print them.
        synonyms: load the synonyms file; when False an empty synonym table
            is used (no merging happens).
    """
    if get_occurrences:
        with open(data_path + occurances_path, "r") as whole_json_file:
            occurrences_dict = json.load(whole_json_file)

    if not ground_truth_dict:
        with open(data_path+ground_truth_path, "r") as whole_json_file:
            ground_truth_dict = json.load(whole_json_file)

    if synonyms:
        with open(data_path + synonyms_path, "r") as whole_json_file:
            synonyms_dict = json.load(whole_json_file)
    else:
        synonyms_dict = {}

    if not found_substitutes_dict:
        with open(found_substitutes_path, "r") as whole_json_file:
            model_substitutes_dict = json.load(whole_json_file)
    else:
        model_substitutes_dict = found_substitutes_dict

    found_ground_ingr = {}
    # base ingredient with synonyms, substitutes with synonyms
    for base_ingred in ground_truth_dict.keys():
        base_synonyms = [base_ingred]
        if get_occurrences:
            occurrences = 0
        # get list of all synonyms of base ingredient
        if base_ingred in synonyms_dict.keys():
            # NOTE: this rebinds the `synonyms` parameter to a list; harmless
            # because the boolean is only read before this loop.
            synonyms = synonyms_dict[base_ingred]
            base_synonyms = base_synonyms + synonyms
            found_substitutes = []
            all_substitutes = []
            # get top 30 substitutes of each base synonym
            for synon in base_synonyms:
                if get_occurrences:
                    occurrences += occurrences_dict[synon]
                all_substitutes.append(model_substitutes_dict[synon].copy())

            # interleave/merge the per-synonym substitute lists
            found_substitutes = merge_lists(all_substitutes)
        else:
            found_substitutes = model_substitutes_dict[base_ingred].copy()

        # keep only the top 30 candidates
        if len(found_substitutes) > 30:
            found_substitutes = found_substitutes[:30]

        found = []

        # remove all base synonyms from found substitutes
        new_found_substitutes = []
        for subst in found_substitutes:
            if subst not in base_synonyms:
                new_found_substitutes.append(subst)
        found_substitutes = new_found_substitutes

        # check which substitutes were found
        for subst in ground_truth_dict[base_ingred]:
            # only add substitute if not already added
            if subst in found_substitutes and subst not in found:
                found.append(subst)
                # remove the hit so the remainder counts as incorrect finds
                found_substitutes.remove(subst)

            # check if synonyms of substitute were found
            # check if ingredient has synonyms
            # if subst in synonyms_dict.keys():
            #     for synon in synonyms_dict[subst]:
            #         if synon in found_substitutes:
            #             if synon not in found and subst not in found:
            #                 found.append(subst)
            #                 found_substitutes.remove(synon)

        found_ground_ingr[base_ingred] = found
        print(base_ingred + ": ")
        if get_occurrences:
            print("occurrences in dataset: " + str(occurrences))
        print("number of synonyms incl. original word: " + str(len(base_synonyms)))
        print("number of found substitutes: " + str(len(found)) + "/" + str(len(ground_truth_dict[base_ingred])))
        print("correctly found substitutes: " + str(len(found)) + "/" + str(len(found) + len(found_substitutes)))
        print("correctly found substitutes: " + str(found))
        print("incorrectly found substitutes: " + str(found_substitutes))
        print("-----------------------------\n")

    return found_ground_ingr
|
||||
|
||||
|
||||
def engl_compare():
    """Compare the German model's substitutes against English FoodBERT results.

    Runs four comparisons — no synonyms, substitute-synonyms only, synonyms of
    everything, and base-word synonyms only — and writes per-ingredient hit
    counts for both languages into evaluation/engl_comparison_results/.

    Relies on module-level paths (engl_data_path, found_substitutes_path,
    evaluation_path) and on the sibling helpers no_synonyms, with_synonyms,
    with_base_synonyms and translate_engl_ground_truth.
    NOTE(review): also reads a module-level `german_ground_truth` dict —
    confirm it is defined in this file.
    """
    # with open(data_path + occurances_path, "r") as whole_json_file:
    #     occurrences_dict = json.load(whole_json_file)

    # English ingredient -> German ingredient translation table
    with open(engl_data_path + "translation.json", "r") as whole_json_file:
        ger_transl = json.load(whole_json_file)

    # with open(data_path + synonyms_path, "r") as whole_json_file:
    #     synonyms_dict = json.load(whole_json_file)

    with open(found_substitutes_path, "r") as whole_json_file:
        model_substitutes_dict = json.load(whole_json_file)

    # English FoodBERT output: a list of (ingredient, substitute) pairs
    with open(engl_data_path + "substitute_pairs_foodbert_text.json", "r") as whole_json_file:
        engl_list = json.load(whole_json_file)

    with open(engl_data_path + "engl_ground_truth.json", "r") as whole_json_file:
        engl_ground_truth = json.load(whole_json_file)

    # regroup the pair list into ingredient -> [substitutes]
    engl_dict = {}
    for foo in engl_list:
        if foo[0] in engl_dict.keys():
            engl_dict[foo[0]].append(foo[1])
        else:
            engl_dict[foo[0]] = [foo[1]]

    translated_ground_truth = translate_engl_ground_truth(engl_ground_truth, ger_transl)

    # without any synonyms
    print("Engl compare without any synonyms:")
    engl_replacements = {}
    # ger_replacements = {}
    for ingred in engl_ground_truth.keys():
        found = []
        incorr = []
        found_ger = []
        incorr_ger = []
        engl_replacements[ingred] = {}
        engl_replacements[ingred]["engl"] = 0
        engl_replacements[ingred]["ger"] = 0
        # ger_replacements[ingred] = 0
        if ingred in engl_dict.keys():
            # English side: count ground-truth substitutes found by FoodBERT
            for sub in engl_ground_truth[ingred]:
                if sub in engl_dict[ingred]:
                    engl_replacements[ingred]["engl"] += 1
                    found.append(sub)
            # German side: count ground-truth substitutes found by this model
            if ger_transl[ingred] in model_substitutes_dict.keys():
                for sub in german_ground_truth[ger_transl[ingred]]:
                    if sub in model_substitutes_dict[ger_transl[ingred]]:
                        engl_replacements[ingred]["ger"] += 1
                        found_ger.append(sub)
                        # ger_replacements[ingred] += 1
            # collect the incorrect (non-ground-truth) finds for both languages
            for found_sub in engl_dict[ingred]:
                if found_sub not in engl_ground_truth[ingred]:
                    incorr.append(found_sub)
            for found_sub in model_substitutes_dict[ger_transl[ingred]]:
                if found_sub not in translated_ground_truth[ger_transl[ingred]]:
                    incorr_ger.append(found_sub)

            print(ger_transl[ingred] + ": ")
            print("number of found substitutes: " + str(len(found_ger)) + "/" + str(len(translated_ground_truth[ger_transl[ingred]])))
            print("correctly found substitutes: " + str(len(found_ger)) + "/" + str(len(found_ger) + len(incorr_ger)))
            print("correctly found substitutes: " + str(found_ger))
            print("incorrectly found substitutes: " + str(incorr_ger))
            print("-----------------------------\n")

            print(ingred + ": ")
            print("number of found substitutes: " + str(len(found)) + "/" + str(len(engl_ground_truth[ingred])))
            print("correctly found substitutes: " + str(len(found)) + "/" + str(len(found) + len(incorr)))
            print("correctly found substitutes: " + str(found))
            print("incorrectly found substitutes: " + str(incorr))
            print("-----------------------------\n")

    with open(evaluation_path + "engl_comparison_results/engl_no_syn.json", 'w') as f:
        json.dump(engl_replacements, f, ensure_ascii=False, indent=4)

    # with synonyms of substitutes
    print("Engl compare with synonyms of substitutes only:")
    # german
    new_german_result = no_synonyms(ground_truth_dict=translated_ground_truth, get_occurrences=False)
    #engl
    new_engl_result = no_synonyms(ground_truth_dict=engl_ground_truth, found_substitutes_dict=engl_dict, get_occurrences=False, synonyms=False)

    engl_replacements = {}
    for ingred in engl_ground_truth.keys():
        engl_replacements[ingred] = {}
        engl_replacements[ingred]["engl"] = 0
        engl_replacements[ingred]["ger"] = 0
        if ingred in new_engl_result.keys():
            for sub in engl_ground_truth[ingred]:
                if sub in new_engl_result[ingred]:
                    engl_replacements[ingred]["engl"] += 1
        if ger_transl[ingred] in new_german_result.keys():
            for sub in german_ground_truth[ger_transl[ingred]]:
                if sub in new_german_result[ger_transl[ingred]]:
                    engl_replacements[ingred]["ger"] += 1

    with open(evaluation_path + "engl_comparison_results/engl_sub_syn.json", 'w') as f:
        json.dump(engl_replacements, f, ensure_ascii=False, indent=4)

    # with synonyms for substitutes and base words
    print("Engl compare with synonyms of both:")
    # german
    new_german_result = with_synonyms(ground_truth_dict=translated_ground_truth, get_occurrences=False)
    # engl
    new_engl_result = with_synonyms(ground_truth_dict=engl_ground_truth, found_substitutes_dict=engl_dict, get_occurrences=False, synonyms=False)

    engl_replacements = {}
    for ingred in engl_ground_truth.keys():
        engl_replacements[ingred] = {}
        engl_replacements[ingred]["engl"] = 0
        engl_replacements[ingred]["ger"] = 0
        if ingred in new_engl_result.keys():
            for sub in engl_ground_truth[ingred]:
                if sub in new_engl_result[ingred]:
                    engl_replacements[ingred]["engl"] += 1
        if ger_transl[ingred] in new_german_result.keys():
            for sub in german_ground_truth[ger_transl[ingred]]:
                if sub in new_german_result[ger_transl[ingred]]:
                    engl_replacements[ingred]["ger"] += 1

    with open(evaluation_path + "engl_comparison_results/engl_all_syn.json", 'w') as f:
        json.dump(engl_replacements, f, ensure_ascii=False, indent=4)

    # with synonyms for base words
    print("Engl compare with synonyms of base words only:")

    # german
    new_german_result = with_base_synonyms(ground_truth_dict=translated_ground_truth, get_occurrences=False)
    # engl
    new_engl_result = with_base_synonyms(ground_truth_dict=engl_ground_truth, found_substitutes_dict=engl_dict,
                                         get_occurrences=False, synonyms=False)

    engl_replacements = {}
    for ingred in engl_ground_truth.keys():
        engl_replacements[ingred] = {}
        engl_replacements[ingred]["engl"] = 0
        engl_replacements[ingred]["ger"] = 0
        if ingred in new_engl_result.keys():
            for sub in engl_ground_truth[ingred]:
                if sub in new_engl_result[ingred]:
                    engl_replacements[ingred]["engl"] += 1
        if ger_transl[ingred] in new_german_result.keys():
            for sub in german_ground_truth[ger_transl[ingred]]:
                if sub in new_german_result[ger_transl[ingred]]:
                    engl_replacements[ingred]["ger"] += 1

    with open(evaluation_path + "engl_comparison_results/engl_base_syn.json", 'w') as f:
        json.dump(engl_replacements, f, ensure_ascii=False, indent=4)

    # marker that the full comparison ran to the end
    print("test")
|
||||
|
||||
|
||||
|
||||
def main():
    """Run the German substitute evaluations, separated by visual rules."""

    def _separator():
        # visual break between the individual evaluation runs
        print("--------------------------------------------------------")
        print("--------------------------------------------------------")
        print("--------------------------------------------------------\n")

    # compare english and german results
    # engl_compare()

    _separator()

    # get results, synonyms only used in substitutes
    no_synonyms()

    _separator()

    # get results, synonyms used in substitutes and base ingredients
    with_synonyms()


main()
|
||||
288
evaluation/final_eval.py
Normal file
288
evaluation/final_eval.py
Normal file
@@ -0,0 +1,288 @@
|
||||
import json
|
||||
import statistics
|
||||
import helpers.revise_substitutes as revise_subs
|
||||
|
||||
|
||||
def eval_dataset(substitutes_dict):
    """Print summary statistics for an {ingredient: [substitutes]} mapping.

    Reports the number of ingredients, how many have no substitutes at all
    ("nones"), and the mean/median/max/min substitute-list length.

    Args:
        substitutes_dict: mapping from ingredient name to its substitute list.
    """
    nones = 0
    all_lengths = []
    for substitutes in substitutes_dict.values():
        if len(substitutes) == 0:
            nones += 1
        all_lengths.append(len(substitutes))

    print("number of ingredients: " + str(len(substitutes_dict.keys())))
    print("number of nones: " + str(nones))
    if not all_lengths:
        # guard: an empty mapping would otherwise raise ZeroDivisionError
        # (average) and ValueError (max/min on an empty sequence)
        return
    print("average number of subs: " + str(sum(all_lengths) / len(substitutes_dict.keys())))
    print("median number of subs: " + str(statistics.median(all_lengths)))
    print("largest number of subs: " + str(max(all_lengths)))
    print("smallest number of subs: " + str(min(all_lengths)))
|
||||
|
||||
|
||||
def translate_engl_ground_truth(ground_truth, ger_transl):
    """Map English base ingredients and their substitutes to German names.

    Base ingredients must be present in ger_transl (KeyError otherwise);
    substitutes without a translation are dropped silently.
    """
    # A debug branch that logged untranslatable substitutes used to live here;
    # re-add an else clause in the inner comprehension's loop if needed.
    return {
        ger_transl[base]: [ger_transl[sub] for sub in subs if sub in ger_transl]
        for base, subs in ground_truth.items()
    }
|
||||
|
||||
|
||||
def eval_ground_truth(substitutes_dict, ground_truth_dict):
    """Print precision/recall statistics of found substitutes vs. ground truth.

    For each ground-truth ingredient the found substitutes are split into
    correct (in the ground truth) and incorrect; per-ingredient precision and
    recall are printed, along with aggregate precision, recall, medians, and
    the ingredients with the highest precision/recall.

    NOTE(review): the divisors 40 (ingredient count), 10 (German-specific
    words) and 30 (the rest) are hard-coded for this particular ground-truth
    set — confirm before reusing with a different ground truth.

    Args:
        substitutes_dict: ingredient -> list of found substitutes; must contain
            every key of ground_truth_dict (KeyError otherwise).
        ground_truth_dict: ingredient -> list of valid substitutes.
    """
    total_corr_int = 0
    total_corr_list = []
    total_incorr_int = 0
    total_incorr_list = []
    total_subs_ground_truth = 0
    test_prec = 0

    # [best value, list of ingredients attaining it]
    highest_prec = [0,[]]
    highest_recall = [0,[]]

    other_corr = 0
    other_incorr = 0
    ger_corr = 0
    ger_incorr = 0
    ger_total = 0
    other_total = 0

    # ingredients considered specific to German cuisine, tracked separately
    german_words = ["Spätzle", "Schwarzbrot", "Schupfnudeln", "Bratwürste_Nürnberger", "Vanillinzucker", "Bier", "Semmelknödel", "Rote_Bete", "Eisbeine", "Spargel_weiß"]

    for ingredient in ground_truth_dict:
        correct = 0
        incorrect = 0
        correct_list = []
        incorrect_list = []

        # print("\n" + ingredient + ": " + str(len(substitutes_dict[ingredient])))
        for sub in substitutes_dict[ingredient]:
            if sub in ground_truth_dict[ingredient]:
                # print(sub)
                correct += 1
                correct_list.append(sub)
            else:
                incorrect += 1
                incorrect_list.append(sub)
        total_corr_int += correct
        total_incorr_int += incorrect
        total_corr_list.append(correct)
        total_incorr_list.append(incorrect)
        total_subs_ground_truth += len(ground_truth_dict[ingredient])
        if correct > 0:
            # per-ingredient metrics only make sense with at least one hit
            curr_recall = correct/len(ground_truth_dict[ingredient])
            curr_prec = correct/(correct+incorrect)
            test_prec += curr_prec
            if curr_prec == highest_prec[0]:
                highest_prec[1].append(ingredient)
            if curr_prec > highest_prec[0]:
                highest_prec[0] = curr_prec
                highest_prec[1] = [ingredient]
            if curr_recall == highest_recall[0]:
                highest_recall[1].append(ingredient)
            if curr_recall > highest_recall[0]:
                highest_recall[0] = curr_recall
                highest_recall[1] = [ingredient]
            print(ingredient + ": " + str(curr_prec) + " ..... " + str(curr_recall))
        # NOTE(review): nesting reconstructed — assumed every ingredient is
        # counted into the German/other buckets, not only those with hits.
        if ingredient in german_words:
            ger_corr += correct
            ger_incorr += incorrect
        else:
            other_corr += correct
            other_incorr += incorrect

        # debug output for one hand-picked ingredient
        if ingredient == "Zucker":
            print("correct: " + str(correct_list) + ", incorrect: " + str(incorrect_list))
    ger_total = ger_corr + ger_incorr
    other_total = other_corr + other_incorr

    # averages per bucket: 10 German-specific and 30 other ingredients
    print("ger_total: " + str(ger_total/10))
    print("other_total: " + str(other_total/30))

    # print(correct)
    # NOTE(review): uses the loop variables after the loop — prints the LAST
    # ingredient's lists only; looks like leftover debugging.
    print(ingredient + ": " + str(correct_list) + " / " + str(incorrect_list))

    # micro precision/recall over all found substitutes
    print("precision: " + str(total_corr_int / (total_corr_int + total_incorr_int)))
    print("(average precision:) " + str(test_prec/40))
    print("recall: " + str(total_corr_int / total_subs_ground_truth))
    print("median number of correct subs (ground truth): " + str(statistics.median(total_corr_list)))
    print("average number of correct subs (ground truth): " + str(statistics.mean(total_corr_list)))
    at_least_3 = 0
    no_corr = 0
    for nr in total_corr_list:
        if nr >= 3:
            at_least_3 += 1
        if nr < 1:
            no_corr += 1
    print("ingredients with at least 3 correct substitutes: " + str(at_least_3))
    print("ingredients with no correct substitutes: " + str(no_corr))
    print("highest precision: " + str(highest_prec[1]) + ": " + str(highest_prec[0]))
    print("highest recall: " + str(highest_recall[1]) + ": " + str(highest_recall[0]))

    # print("german precision: " + str(ger_corr/(ger_corr + ger_incorr)))
    # print("german correct:" + str(ger_corr))
    # print("precision rest: " + str(other_corr/(other_corr + other_incorr)))
    # print("other correct: " + str(other_corr))
|
||||
def get_ground_truth_substitutes(substitutes_dict, ground_truth_dict):
    """Restrict the substitutes mapping to the ground-truth ingredients.

    Raises KeyError if a ground-truth ingredient is missing from
    substitutes_dict.
    """
    return {
        ingredient: substitutes_dict[ingredient]
        for ingredient in ground_truth_dict
    }
|
||||
|
||||
|
||||
|
||||
def main():
    """Run the full final evaluation of the German model and the English baseline.

    Evaluates the completed substitute pairs against the ground truth three
    times (no synonyms, substitute synonyms via revise_subs, synonyms of
    everything), then evaluates the English FoodBERT results and finally
    re-scores the German model against a revised in-code ground truth.
    """
    # "complete_" file: every dataset ingredient has an entry (possibly empty),
    # produced by add_unused_ingredients.py
    substitutes_path = "final_Versions/models/vers3/eval/complete_substitute_pairs_50.json"
    with open(substitutes_path, "r") as whole_json_file:
        substitutes_dict = json.load(whole_json_file)

    ground_truth_path = "data/ground_truth.json"
    with open(ground_truth_path, "r") as whole_json_file:
        ground_truth_dict = json.load(whole_json_file)

    # --- evaluation 1: raw substitutes, no synonym merging ---
    print("no synonyms at all:")
    print("entire dataset")
    eval_dataset(substitutes_dict)
    print("\nonly ground truth:")
    ground_truth_substitutes0 = get_ground_truth_substitutes(substitutes_dict, ground_truth_dict)
    # print(ground_truth_substitutes["Truthahn"])
    eval_dataset(ground_truth_substitutes0)
    eval_ground_truth(substitutes_dict, ground_truth_dict)

    print("======================================")

    # --- evaluation 2: merge synonyms among the substitutes only ---
    print("\nsynonyms of substitutes only: ")
    new_substitutes_dict1 = substitutes_dict.copy()
    new_substitutes_dict1 = revise_subs.combined_substitutes_dict(new_substitutes_dict1)
    print("entire dataset")
    eval_dataset(new_substitutes_dict1)
    print("\nonly ground truth:")
    ground_truth_substitutes1 = get_ground_truth_substitutes(new_substitutes_dict1, ground_truth_dict)
    # print(ground_truth_substitutes["Truthahn"])
    eval_dataset(ground_truth_substitutes1)
    eval_ground_truth(new_substitutes_dict1, ground_truth_dict)

    print("======================================")

    # --- evaluation 3: merge synonyms of base ingredients and substitutes ---
    print("\nsynonyms of everything: ")
    new_substitutes_dict2 = substitutes_dict.copy()
    new_substitutes_dict2 = revise_subs.combine_all_synonyms(new_substitutes_dict2)
    print("entire dataset")
    eval_dataset(new_substitutes_dict2)
    print("\nonly ground truth:")
    ground_truth_substitutes2 = get_ground_truth_substitutes(new_substitutes_dict2, ground_truth_dict)
    # print(ground_truth_substitutes["Truthahn"])
    eval_dataset(ground_truth_substitutes2)
    eval_ground_truth(new_substitutes_dict2, ground_truth_dict)

    print("======================================")
    print("======================================")

    print("English Evaluation")

    # NOTE(review): several of these locals (data_path, occurances_path,
    # ground_truth_path, synonyms_path) appear unused below — likely leftovers.
    data_path = "data/"
    occurances_path = "mult_ingredients_nice.json"
    ground_truth_path = "ground_truth.json"
    engl_data_path = "evaluation/engl_data/"

    evaluation_path = "evaluation/"
    synonyms_path = "synonyms.json"

    # revised German ground truth, inlined here for the re-evaluation runs
    german_ground_truth = {
        "Karotte": ["Pastinake", "Steckrübe", "Staudensellerie", "Kürbis", "Süßkartoffel", "Rettich", "Radieschen",
                    "Kartoffel", "Paprika_rot", "Butternusskürbis", "Petersilienwurzel", "Rübe"],
        "Kirsche": ["Aprikose", "Pflaume", "Nektarine", "Himbeeren", "Weintrauben", "Erdbeere", "Johannisbeeren",
                    "Brombeeren", "Beeren_gemischte", "Pfirsich", "Cranberries", "Cranberries_getrocknet", "Blaubeeren",
                    "Maraschino", "Beeren", "Trockenpflaumen"],
        "Huhn": ["Truthahn", "Kaninchen", "Austernpilze", "Kalbfleisch", "Fisch", "Tofu", "Rindfleisch", "Tofu_fester",
                 "Schweinefleisch", "Seitan", "Ente", "Lamm", "Pilze", "Shrimps", "Wachtel", "Gans", "Wildfleisch"],
        "Petersilie": ["Kerbel", "Koriander", "Estragon", "Basilikum", "Oregano", "Liebstöckel", "Dill",
                       "Koriandergrün", "Rosmarin", "Kapern", "Thymian", "Schnittlauch", "Minze",
                       "Basilikum_getrockneter", "Oregano_getrocknet", "Thymian_getrocknet"],
        "Schokolade": ["Nutella", "Kakaopulver_Instant", "Zucker", "Marmelade", "Marshmallow", "Kakao", "Süßigkeiten",
                       "Erdnussbutter"],
        "Frühstücksspeck": ["Pancetta", "Schinken_Prosciutto", "Speck", "Schinken_rohen", "Parmaschinken", "Schinken",
                            "Salami", "Chorizo", "Wurst_Krakauer", "Schweineschwarte", "Schinkenwürfel", "Croûtons",
                            "Speckwürfel", "Kochschinken", "Corned_Beef", "Wurst_Mortadella"],
        "Grünkohl": ["Spinat", "Chinakohl", "Lauch", "Endiviensalat", "Mangold", "Wirsing", "Kohl", "Blumenkohl",
                     "Brunnenkresse", "Rucola", "Blattspinat", "Kopfsalat", "Römersalat", "Babyspinat"],
        "Zucker": ["Honig", "Stevia", "Süßstoff", "Stevia_flüssig", "Süßstoff_flüssigen", "Reissirup", "Ahornsirup",
                   "Kondensmilch_gezuckerte", "Agavendicksaft", "Schokolade", "Vanille", "Melasse", "Zuckerrübensirup",
                   "Sirup"],
        "Brie": ["Camembert", "Gorgonzola", "Schmelzkäse", "Cheddarkäse", "Ziegenkäse", "Doppelrahmfrischkäse",
                 "Blauschimmelkäse", "Roquefort", "Gouda", "Käse_Fontina", "Käse_Provolone", "Feta_Käse",
                 "Scheiblettenkäse"],
        "Truthahn": ["Huhn", "Kaninchen", "Ente", "Kochschinken", "Fasan", "Gans", "Rindfleisch", "Lammfleisch",
                     "Schweinefleisch", "Roastbeef", "Kalbfleisch", "Geflügelfleisch", "Hähnchenfilet", "Hühnerkeule",
                     "Wachtel", "schweinekotelett", "Wildfleisch"]
    }

    with open(engl_data_path + "translation.json", "r") as whole_json_file:
        ger_transl = json.load(whole_json_file)

    # English FoodBERT output: list of (ingredient, substitute) pairs
    with open(engl_data_path + "substitute_pairs_foodbert_text.json", "r") as whole_json_file:
        engl_list = json.load(whole_json_file)

    with open(engl_data_path + "revised_engl_ground_truth.json", "r") as whole_json_file:
        engl_ground_truth = json.load(whole_json_file)

    # regroup the pair list into ingredient -> [substitutes]
    engl_dict = {}
    for foo in engl_list:
        if foo[0] in engl_dict.keys():
            engl_dict[foo[0]].append(foo[1])
        else:
            engl_dict[foo[0]] = [foo[1]]

    # translate english ground truth to german for comparison
    # any ingredients that aren't in the german dataset are removed
    # translated_ground_truth = translate_engl_ground_truth(engl_ground_truth, ger_transl)

    print("Eval English results")
    print("entire dataset")
    eval_dataset(engl_dict)
    orig_engl_dict = engl_dict.copy()
    # print("turkey results: " + str(orig_engl_dict["turkey"]))

    print("\nonly ground truth:")
    ground_truth_substitutes_engl = get_ground_truth_substitutes(orig_engl_dict, engl_ground_truth)
    # print(ground_truth_substitutes)
    eval_dataset(ground_truth_substitutes_engl)
    eval_ground_truth(ground_truth_substitutes_engl, engl_ground_truth)

    # English evaluation with substitute-synonym merging
    print("\n\nEval method 1:")
    engl_dict1 = engl_dict.copy()
    engl_dict1 = revise_subs.engl_combined_substitutes_dict(engl_dict1)
    print("entire dataset")
    eval_dataset(engl_dict1)
    print("\nonly ground truth:")
    ground_truth_substitutes_engl = get_ground_truth_substitutes(engl_dict1, engl_ground_truth)
    # print(ground_truth_substitutes["Truthahn"])
    eval_dataset(ground_truth_substitutes_engl)
    eval_ground_truth(ground_truth_substitutes_engl, engl_ground_truth)

    # re-score the German results against the revised in-code ground truth
    print("\nReevaluate German Data: ")
    eval_ground_truth(substitutes_dict, german_ground_truth)
    print("\nwith syn1")
    eval_ground_truth(new_substitutes_dict1, german_ground_truth)
    # print("Truthahn results 1: " + str(new_substitutes_dict1["Truthahn"]))
    print("\nwith syn2")
    eval_ground_truth(new_substitutes_dict2, german_ground_truth)
    # print("Truthahn results 2: " + str(new_substitutes_dict2["Truthahn"]))

    #
    # engl_substitutes_dict = get_ground_truth_substitutes(engl_dict1, german_ground_truth)
    #
    # engl_new_substitutes_dict1 = new_substitutes_dict1.copy()
    # engl_new_substitutes_dict2 = new_substitutes_dict2.copy()


main()
|
||||
28
evaluation/find_ground_truth_ingredients.py
Normal file
28
evaluation/find_ground_truth_ingredients.py
Normal file
@@ -0,0 +1,28 @@
|
||||
import json
|
||||
import random
|
||||
|
||||
def main():
    """Randomly pick 10 rare (100-200 occurrences) and 10 frequent (>= 1000
    occurrences) ingredients from the dataset counts and print them."""
    data_path = "data/"
    ingredients_path = "mult_ingredients_nice.json"
    with open(data_path + ingredients_path, "r") as whole_json_file:
        all_ingredients = json.load(whole_json_file)

    rare = []
    frequent = []
    for ingredient, count in all_ingredients.items():
        if count >= 1000:
            frequent.append(ingredient)
        elif 100 <= count <= 200:
            rare.append(ingredient)

    picked_rare = random.sample(rare, 10)
    picked_frequent = random.sample(frequent, 10)

    print("rare: ")
    print(picked_rare)
    print("\nfrequent: ")
    print(picked_frequent)


main()
|
||||
197
evaluation/generate_substitutes.py
Normal file
197
evaluation/generate_substitutes.py
Normal file
@@ -0,0 +1,197 @@
|
||||
# adapted from:
|
||||
# Pellegrini., C., E. Özsoy., M. Wintergerst., and G. Groh. (2021).
|
||||
# “Exploiting Food Embeddings for Ingredient Substitution.”
|
||||
# In: Proceedings of the 14th International Joint Conference on Biomedical
|
||||
# Engineering Systems and Technologies - Volume 5: HEALTHINF, INSTICC.
|
||||
# SciTePress, pp. 67–77. isbn: 978-989-758-490-9. doi: 10.5220/0010202000670077.
|
||||
|
||||
import json
|
||||
import os
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
from typing import Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from sklearn.decomposition import PCA
|
||||
from tqdm import tqdm
|
||||
|
||||
from evaluation.helpers.approx_knn_classifier import ApproxKNNClassifier
|
||||
from evaluation.helpers.generate_ingredient_embeddings import generate_food_embedding_dict
|
||||
from evaluation.helpers.knn_classifier import KNNClassifier
|
||||
|
||||
|
||||
def avg(values):
    """Return the arithmetic mean of *values*.

    Raises ZeroDivisionError for an empty sequence (as before).
    """
    return sum(values) / len(values)
|
||||
|
||||
|
||||
def custom_potential_neighbors_sort(potential_neighbors):
    """Rank neighbour candidates: most frequent first, ties broken by the
    smaller average distance.

    Takes {label: [distances]} and returns a list of (label, distances)
    tuples sorted by descending occurrence count, then ascending mean
    distance.
    """
    def rank(item):
        _, distances = item
        # negate the mean so that reverse=True yields ascending avg distance
        return (len(distances), -(sum(distances) / len(distances)))

    return sorted(potential_neighbors.items(), key=rank, reverse=True)
|
||||
|
||||
|
||||
def filter_out_forbidden_neigbours(ingredient_name, potential_neighbors):
    '''
    Neigbors that are the same as the ingredient are to be removed, additional rules such as mozeralla & mozeralla_cheese, penne & penne_pasta can be added here
    '''
    banned_keys = {ingredient_name}

    # The original loop re-adding exact matches was redundant: ingredient_name
    # is already in banned_keys. Additional banning rules would go here, e.g.:
    # for ingredient in potential_neighbors.keys():
    #     if ingredient_name in ingredient.split('_'):
    #         banned_keys.add(ingredient)

    return {key: value for key, value in potential_neighbors.items()
            if key not in banned_keys}
|
||||
|
||||
|
||||
def get_nearest_N_neigbours(ingredient_name, ingredients_to_embeddings, all_ingredient_labels,
                            knn_classifier: Union[KNNClassifier, ApproxKNNClassifier], thresh = 50):
    """Collect substitute candidates for *ingredient_name* via embedding kNN.

    For every embedding of the ingredient, the classifier returns nearest
    neighbours; candidates are ranked by how often they appear (ties broken by
    average distance), filtered to those seen at least *thresh* times, and
    returned as a tuple of labels.

    Args:
        ingredient_name: key into ingredients_to_embeddings.
        ingredients_to_embeddings: ingredient -> sequence of embeddings.
        all_ingredient_labels: array of labels, indexable by the classifier's
            returned neighbour indices.
        knn_classifier: object exposing k_nearest_neighbors(embeddings).
        thresh: minimum number of appearances a neighbour needs to survive.

    Returns:
        Tuple of neighbour labels, or None when no neighbour survives.
    """
    ingredient_embeddings = ingredients_to_embeddings[ingredient_name]
    all_distances, all_indices = knn_classifier.k_nearest_neighbors(ingredient_embeddings)

    # label -> list of distances at which it appeared as a neighbour
    potential_neighbors = defaultdict(list)

    for i in range(len(ingredient_embeddings)):
        labels = all_ingredient_labels[all_indices[i]]
        distances = all_distances[i]

        for label, distance in zip(labels, distances):
            potential_neighbors[label].append(distance)

    potential_neighbors = filter_out_forbidden_neigbours(ingredient_name, potential_neighbors)
    sorted_neighbors = custom_potential_neighbors_sort(potential_neighbors)
    # remove too rare neighbours (seen fewer than `thresh` times)
    sorted_neighbors2 = [(key, value) for key, value in sorted_neighbors if len(value) >= thresh]
    # sorted_neighbors = [(key, value) for key, value in sorted_neighbors if len(value) >= len(ingredient_embeddings)] # remove too rare ones
    # further removal relative to the most frequent surviving neighbour
    relative_lengths = [len(elem[1]) / (len(sorted_neighbors2[0][1])) for elem in sorted_neighbors2]
    final_neighbors = []
    for idx in range(len(relative_lengths)):
        if relative_lengths[idx] >= 0.0:  # Currently doesn't sort anything out
            final_neighbors.append(sorted_neighbors2[idx])

    # explicit empty check instead of the former broad `except Exception:
    # return None`, which could also have masked unrelated errors
    if not final_neighbors:
        return None
    return list(zip(*final_neighbors))[0]
|
||||
|
||||
|
||||
def clean_ingredient_name(ingredient_name, normalization_fixes):
    """Turn an underscore-joined ingredient token into a readable name.

    Each underscore-separated word is replaced by its entry in
    normalization_fixes (when present), then the words are joined with spaces.
    """
    return ' '.join(
        normalization_fixes.get(word, word)
        for word in ingredient_name.split('_')
    )
|
||||
|
||||
|
||||
def clean_substitutes(subtitutes, normalization_fixes):
    """Apply clean_ingredient_name to every substitute and return the new list."""
    return [
        clean_ingredient_name(subtitute, normalization_fixes)
        for subtitute in subtitutes
    ]
|
||||
|
||||
|
||||
# def test_eval():
|
||||
# return ["Zucker", "Eier", "Reis", "Spaghetti", "Wein", "Gouda_junger"]
|
||||
|
||||
|
||||
def main():
    """Predict substitutes for every ingredient via k-NN over FoodBERT embeddings.

    For each configured model directory: compute (or load cached) embeddings for
    all ingredients, build/load a (approximate) k-NN index over them, query the
    nearest neighbours of every ingredient, and write the resulting substitute
    dict to <model>/eval/substitute_pairs_<thresh>.json.
    """
    # models = ["Versions/vers1/", "Versions/vers2/"]
    # models = ["final_Versions/models/vers1/", "final_Versions/models/vers2/", "final_Versions/models/vers3/"]
    models = ["final_Versions/models/vers2/"]  # model directories to evaluate
    thresh = 100  # minimum neighbour-hit count for a label to count as a substitute
    # models = ["test/"]

    # os.makedirs('data/eval')

    # test_substitute_pairs_path = 'Versions/test_substitute_pairs.json'

    # normalization_fixes_path = Path('data/eval/normalization_correction.json')
    max_embedding_count = 100  # maximum embeddings kept per ingredient
    # image_embedding_dim = 768
    approx_knn = True  # use Annoy-based approximate k-NN instead of exact sklearn k-NN

    # compare models
    for curr_model in models:
        # os.makedirs(curr_model + "eval/")
        substitute_pairs_path = curr_model + "eval/substitute_pairs_" + str(thresh) + ".json"

        # get embeddings for all ingredients
        ingredients_to_embeddings = generate_food_embedding_dict(max_sentence_count=max_embedding_count, model_path=curr_model+"output/", eval_path=curr_model + "eval/", dataset_path=curr_model+"dataset/")

        all_ingredient_embeddings = []
        all_ingredient_labels = []

        # make list of all ingredients and all embeddings
        # (one label per embedding row so labels align with the concatenated matrix)
        for key, value in ingredients_to_embeddings.items():
            all_ingredient_embeddings.append(value)
            all_ingredient_labels.extend([key] * len(value))

        all_ingredient_embeddings = np.concatenate(all_ingredient_embeddings)
        all_ingredient_labels = np.stack(all_ingredient_labels)

        # get knn classifier
        if approx_knn:
            knn_classifier: Union[KNNClassifier, ApproxKNNClassifier] = ApproxKNNClassifier(
                all_ingredient_embeddings=all_ingredient_embeddings,
                max_embedding_count=max_embedding_count, save_path=Path(curr_model + "eval/" + 'approx_knn_classifier.ann'))
        else:
            knn_classifier: Union[KNNClassifier, ApproxKNNClassifier] = KNNClassifier(
                all_ingredient_embeddings=all_ingredient_embeddings,
                max_embedding_count=max_embedding_count, save_path=Path(curr_model + "eval/" + 'knn_classifier.joblib'))

        # get substitutes via knn classifier
        substitute_pairs = set()
        none_counter = 0  # ingredients for which no substitute survived filtering
        subst_dict = {}
        for ingredient_name in tqdm(ingredients_to_embeddings.keys(), total=len(ingredients_to_embeddings)):
            substitutes = get_nearest_N_neigbours(ingredient_name=ingredient_name,
                                                  ingredients_to_embeddings=ingredients_to_embeddings,
                                                  all_ingredient_labels=all_ingredient_labels,
                                                  knn_classifier=knn_classifier, thresh=thresh)

            if substitutes is None:
                none_counter += 1
                subst_dict[ingredient_name] = []
            else:
                subst_dict[ingredient_name] = list(substitutes)

            #
            # cleaned_substitutes = clean_substitutes(substitutes, normalization_fixes)
            # for cleaned_substitute in cleaned_substitutes:
            #     substitute_pairs.add((clean_ingredient_name(ingredient_name, normalization_fixes), cleaned_substitute))

        with open(substitute_pairs_path, 'w') as f:
            json.dump(subst_dict, f, ensure_ascii=False, indent=4)
        print(f'Nones: {none_counter}')


    # output = {}
    # for ing in ingredients:
    #     output[ing] = []
    # for model in all_subs.keys():
    #     for ing in ingredients:
    #         output[ing].append(all_subs[model][ing])
    #
    # with open(test_substitute_pairs_path, 'w') as f:
    #     json.dump(output, f, ensure_ascii=False, indent=4)


if __name__ == '__main__':
    main()
|
||||
48
evaluation/ground_truth_stats.py
Normal file
48
evaluation/ground_truth_stats.py
Normal file
@@ -0,0 +1,48 @@
|
||||
import json
|
||||
import statistics
|
||||
|
||||
|
||||
def main():
    """Print statistics about the ground-truth substitute annotations.

    For every base ingredient in the ground truth, count its substitutes and its
    occurrences in the recipe corpus (occurrences of its synonyms are added to
    the base), then report mean/median/stdev/min/max of the substitute counts.
    Reads data/ground_truth.json, data/mult_ingredients_nice.json and
    data/synonyms.json.
    """
    ground_truth_path = "data/ground_truth.json"
    # ground_truth_path = "evaluation/engl_data/engl_ground_truth.json"
    with open(ground_truth_path, "r") as whole_json_file:
        ground_truth_dict = json.load(whole_json_file)
    ingredients_path = "data/mult_ingredients_nice.json"
    # ingredients_path = "data/cleaned_steps_occurrance.json"
    with open(ingredients_path, "r") as whole_json_file:
        ingredients_occurrences = json.load(whole_json_file)
    synonyms_path = "data/synonyms.json"
    with open(synonyms_path, "r") as whole_json_file:
        synonyms_dict = json.load(whole_json_file)

    # Ingredients specific to German cuisine, for the optional German/other split.
    german_words = ["Spätzle", "Schwarzbrot", "Schupfnudeln", "Bratwürste_Nürnberger", "Vanillinzucker", "Bier", "Semmelknödel", "Rote_Bete", "Eisbeine", "Spargel_weiß"]
    german_total = 0
    other_total = 0

    all_counts = []
    occurrence_count = []
    for base in ground_truth_dict.keys():
        # print(base + " substitutes: " + str(len(ground_truth_dict[base])))
        all_counts.append(len(ground_truth_dict[base]))
        # occurrences of a base ingredient include those of all its synonyms
        curr_occurrences = ingredients_occurrences[base]
        if base in synonyms_dict.keys():
            for syn in synonyms_dict[base]:
                curr_occurrences += ingredients_occurrences[syn]
        occurrence_count.append(curr_occurrences)
        print(base + " occurrences: " + str(curr_occurrences))
        if base in german_words:
            german_total += len(ground_truth_dict[base])
        else:
            other_total += len(ground_truth_dict[base])

    print("Average: " + str(statistics.mean(all_counts)))
    print("Median: " + str(statistics.median(all_counts)))
    print("Standard deviation: " + str(statistics.stdev(all_counts)))
    print("Min: " + str(min(all_counts)))
    print("Max: " + str(max(all_counts)))

    # print("german total: " + str(german_total))
    # print("other total: " + str(other_total))


if __name__ == '__main__':
    # Guarded entry point: importing this module no longer runs the script
    # (the original called main() unconditionally at module level).
    main()
|
||||
45
evaluation/helpers/approx_knn_classifier.py
Normal file
45
evaluation/helpers/approx_knn_classifier.py
Normal file
@@ -0,0 +1,45 @@
|
||||
# adapted from:
|
||||
# Pellegrini., C., E. Özsoy., M. Wintergerst., and G. Groh. (2021).
|
||||
# “Exploiting Food Embeddings for Ingredient Substitution.”
|
||||
# In: Proceedings of the 14th International Joint Conference on Biomedical
|
||||
# Engineering Systems and Technologies - Volume 5: HEALTHINF, INSTICC.
|
||||
# SciTePress, pp. 67–77. isbn: 978-989-758-490-9. doi: 10.5220/0010202000670077.
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
from annoy import AnnoyIndex
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
# Full guide https://github.com/spotify/annoy
|
||||
class ApproxKNNClassifier:
    """Approximate k-NN index over ingredient embeddings, backed by Spotify's Annoy."""

    def __init__(self, all_ingredient_embeddings, max_embedding_count,
                 save_path=Path('data/eval/approx_knn_classifier.ann'), n_trees=10):
        """Load a previously built index from save_path, or build and save a new one.

        all_ingredient_embeddings: 2-D array-like, one row per embedding.
        max_embedding_count: embeddings kept per ingredient; used to size queries.
        n_trees: number of Annoy trees (more trees -> better accuracy, bigger index).
        """
        vector_length = all_ingredient_embeddings.shape[-1]
        self.max_embedding_count = max_embedding_count
        if save_path.exists():
            print('Loading Existing Approx Classifier')
            self.approx_knn_classifier = AnnoyIndex(vector_length, 'angular')
            self.approx_knn_classifier.load(str(save_path))  # super fast, will just mmap the file
        else:

            # To make sure we don't just get ourselves: add max_embedding_count
            self.approx_knn_classifier = AnnoyIndex(vector_length, 'angular')  # Length of item vector that will be indexed
            for i in tqdm(range(len(all_ingredient_embeddings)), total=len(all_ingredient_embeddings), desc='Creating Approx Classifier'):
                self.approx_knn_classifier.add_item(i, all_ingredient_embeddings[i])

            self.approx_knn_classifier.build(n_trees)
            print('Saving Approx Classifier')
            self.approx_knn_classifier.save(str(save_path))

    def k_nearest_neighbors(self, ingredient_embeddings):
        """Return (distances, indices) arrays of the max_embedding_count + 200 nearest
        items for each query embedding (extra 200 so a query doesn't only hit itself)."""
        all_indices, all_distances = [], []
        for idx, ingredient_embedding in enumerate(
                ingredient_embeddings):  # search_k gives you a run-time tradeoff between better accuracy and speed currently defaults
            indices, distances = self.approx_knn_classifier.get_nns_by_vector(ingredient_embedding, self.max_embedding_count + 200, include_distances=True)
            all_indices.append(indices)
            all_distances.append(distances)

        return np.stack(all_distances), np.stack(all_indices)
|
||||
152
evaluation/helpers/generate_ingredient_embeddings.py
Normal file
152
evaluation/helpers/generate_ingredient_embeddings.py
Normal file
@@ -0,0 +1,152 @@
|
||||
# adapted from:
|
||||
# Pellegrini., C., E. Özsoy., M. Wintergerst., and G. Groh. (2021).
|
||||
# “Exploiting Food Embeddings for Ingredient Substitution.”
|
||||
# In: Proceedings of the 14th International Joint Conference on Biomedical
|
||||
# Engineering Systems and Technologies - Volume 5: HEALTHINF, INSTICC.
|
||||
# SciTePress, pp. 67–77. isbn: 978-989-758-490-9. doi: 10.5220/0010202000670077.
|
||||
|
||||
import json
|
||||
import pickle
|
||||
import random
|
||||
import re
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from tqdm import tqdm
|
||||
|
||||
from evaluation.helpers.prediction_model import PredictionModel
|
||||
|
||||
|
||||
def _generate_food_sentence_dict(model_path):
    """Collect, for every known ingredient, all instruction sentences mentioning it.

    Reads the ingredient vocabulary plus the train/test instruction sentences of
    the given model directory and returns a dict mapping ingredient token ->
    list of sentences in which that token occurs.
    """
    with open('data/mult_ingredients_nice.json', "r") as f:
        food_items = json.load(f)
    food_items_set = set(food_items.keys())

    instruction_sentences = []
    for split_file in ('training_data.txt', 'testing_data.txt'):
        with open(model_path + split_file, "r") as f:
            sentences = f.read().splitlines()
        # remove overlong sentences (would exceed the BERT input length)
        instruction_sentences.extend(s for s in sentences if len(s.split()) <= 100)

    food_to_sentences_dict = defaultdict(list)
    for sentence in instruction_sentences:
        # Replace every char that is not a word char, hyphen or apostrophe with a
        # space before splitting. (The previous pattern "[^\w]-'" matched the
        # literal 3-char sequence <non-word>-' and therefore stripped almost
        # no punctuation at all.)
        words = re.sub(r"[^\w\-']", " ", sentence).split()
        for word in words:
            if word in food_items_set:
                food_to_sentences_dict[word].append(sentence)

    return food_to_sentences_dict
|
||||
|
||||
|
||||
def _random_sample_with_min_count(population, k):
|
||||
if len(population) <= k:
|
||||
return population
|
||||
else:
|
||||
return random.sample(population, k)
|
||||
|
||||
|
||||
def sample_random_sentence_dict(model_path, max_sentence_count):
    """Map each ingredient to at most max_sentence_count randomly chosen sentences.

    Builds the full ingredient -> sentences mapping for the model directory and
    subsamples every entry down to the requested maximum.
    """
    food_to_sentences_dict = _generate_food_sentence_dict(model_path=model_path)
    sampled_dict = {}
    for food, sentences in food_to_sentences_dict.items():
        sampled_dict[food] = _random_sample_with_min_count(sentences, max_sentence_count)
    return sampled_dict
|
||||
|
||||
|
||||
def _map_ingredients_to_input_ids(model_path):
    """Map every known ingredient token to its tokenizer input id.

    Loads the ingredient vocabulary and converts each token with the model's
    BERT tokenizer; returns a dict ingredient -> input id.
    """
    with open('data/mult_ingredients_nice.json', "r") as f:
        ingredients = json.load(f).keys()
    model = PredictionModel(model_path)
    token_ids = model.tokenizer.convert_tokens_to_ids(ingredients)

    return {ingredient: token_id for ingredient, token_id in zip(ingredients, token_ids)}
|
||||
|
||||
|
||||
def _merge_synonmys(food_to_embeddings_dict, max_sentence_count):
|
||||
synonmy_replacements_path = Path('foodbert_embeddings/data/synonmy_replacements.json')
|
||||
synonmy_replacements = {}
|
||||
|
||||
merged_dict = defaultdict(list)
|
||||
# Merge ingredients
|
||||
for key, value in food_to_embeddings_dict.items():
|
||||
if key in synonmy_replacements:
|
||||
key_to_use = synonmy_replacements[key]
|
||||
else:
|
||||
key_to_use = key
|
||||
|
||||
merged_dict[key_to_use].append(value)
|
||||
|
||||
merged_dict = {k: np.concatenate(v) for k, v in merged_dict.items()}
|
||||
# When embedding count exceeds maximum allowed, reduce back to requested count
|
||||
for key, value in merged_dict.items():
|
||||
if len(value) > max_sentence_count:
|
||||
index = np.random.choice(value.shape[0], max_sentence_count, replace=False)
|
||||
new_value = value[index]
|
||||
merged_dict[key] = new_value
|
||||
|
||||
return merged_dict
|
||||
|
||||
|
||||
def generate_food_embedding_dict(max_sentence_count, model_path, eval_path='data/eval/', dataset_path="output"):
    '''
    Creates a dict where the keys are the ingredients and the values are a list of embeddings with length max_sentence_count or less if there are less occurences
    These embeddings are used in generate_substitutes.py to predict substitutes
    '''
    # Computing embeddings is expensive, so the result is cached as a pickle in eval_path.
    food_to_embeddings_dict_path = Path(eval_path + 'food_embeddings_dict.pkl')
    if food_to_embeddings_dict_path.exists():
        with food_to_embeddings_dict_path.open('rb') as f:
            food_to_embeddings_dict = pickle.load(f)

        # # delete keys if we deleted ingredients
        # old_ingredients = set(food_to_embeddings_dict.keys())
        # with open('train_model/vocab/used_ingredients.json', "r") as f:
        #     new_ingredients = set(json.load(f))
        #
        # keys_to_delete = old_ingredients.difference(new_ingredients)
        # for key in keys_to_delete:
        #     food_to_embeddings_dict.pop(key, None)  # delete key if it exists
        #
        # # merge new synonyms
        # food_to_embeddings_dict = _merge_synonmys(food_to_embeddings_dict, max_sentence_count)
        #
        # with food_to_embeddings_dict_path.open('wb') as f:
        #     pickle.dump(food_to_embeddings_dict, f)  # Overwrite dict with cleaned version

        return food_to_embeddings_dict

    print('Sampling Random Sentences')
    # at most max_sentence_count sentences per ingredient to bound the work below
    food_to_sentences_dict_random_samples = sample_random_sentence_dict(model_path=dataset_path, max_sentence_count=max_sentence_count)
    food_to_embeddings_dict = defaultdict(list)
    print('Mapping Ingredients to Input Ids')
    all_ingredient_ids = _map_ingredients_to_input_ids(model_path=model_path)

    prediction_model = PredictionModel(model_path=model_path)

    for food, sentences in tqdm(food_to_sentences_dict_random_samples.items(), total=len(food_to_sentences_dict_random_samples),
                                desc='Calculating Embeddings for Food items'):
        embeddings, ingredient_ids = prediction_model.predict_embeddings(sentences)
        # get embedding of food word: flatten all token embeddings, then select the
        # positions whose input id equals the ingredient's token id
        embeddings_flat = embeddings.view((-1, 768))
        ingredient_ids_flat = torch.stack(ingredient_ids).flatten()
        food_id = all_ingredient_ids[food]
        food_embeddings = embeddings_flat[ingredient_ids_flat == food_id].cpu().numpy()
        food_to_embeddings_dict[food].extend(food_embeddings)

    food_to_embeddings_dict = {k: np.stack(v) for k, v in food_to_embeddings_dict.items()}
    # Clean synonmy
    food_to_embeddings_dict = _merge_synonmys(food_to_embeddings_dict, max_sentence_count)

    with food_to_embeddings_dict_path.open('wb') as f:
        pickle.dump(food_to_embeddings_dict, f)

    return food_to_embeddings_dict
|
||||
38
evaluation/helpers/instructions_dataset.py
Normal file
38
evaluation/helpers/instructions_dataset.py
Normal file
@@ -0,0 +1,38 @@
|
||||
# adapted from:
|
||||
# Pellegrini., C., E. Özsoy., M. Wintergerst., and G. Groh. (2021).
|
||||
# “Exploiting Food Embeddings for Ingredient Substitution.”
|
||||
# In: Proceedings of the 14th International Joint Conference on Biomedical
|
||||
# Engineering Systems and Technologies - Volume 5: HEALTHINF, INSTICC.
|
||||
# SciTePress, pp. 67–77. isbn: 978-989-758-490-9. doi: 10.5220/0010202000670077.
|
||||
|
||||
import torch
|
||||
from torch.nn.utils.rnn import pad_sequence
|
||||
from torch.utils.data import Dataset
|
||||
|
||||
|
||||
class InstructionsDataset(Dataset):
    """Torch dataset of tokenized instruction sentences, padded to a common length."""

    def __init__(self, tokenizer, sentences):
        # tokenizer: BERT-style tokenizer; sentences: list of raw instruction strings
        self.tokenizer = tokenizer

        batch_encoding = tokenizer.batch_encode_plus(sentences, add_special_tokens=True, max_length=512, truncation=True)
        # examples ends up as one tensor of input ids, shape (num_sentences, max_len)
        self.examples = batch_encoding["input_ids"]
        self.examples = self._tensorize_batch([torch.tensor(elem) for elem in self.examples])

    def _tensorize_batch(self, examples) -> torch.Tensor:
        # Stack directly when all sequences share a length; otherwise pad to the longest.
        length_of_first = examples[0].size(0)
        are_tensors_same_length = all(x.size(0) == length_of_first for x in examples)
        if are_tensors_same_length:
            return torch.stack(examples, dim=0)
        else:
            # padding requires the tokenizer to define a pad token
            if self.tokenizer._pad_token is None:
                raise ValueError(
                    "You are attempting to pad samples but the tokenizer you are using"
                    f" ({self.tokenizer.__class__.__name__}) does not have one."
                )
            return pad_sequence(examples, batch_first=True, padding_value=self.tokenizer.pad_token_id)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        return self.examples[i]
|
||||
36
evaluation/helpers/knn_classifier.py
Normal file
36
evaluation/helpers/knn_classifier.py
Normal file
@@ -0,0 +1,36 @@
|
||||
# adapted from:
|
||||
# Pellegrini., C., E. Özsoy., M. Wintergerst., and G. Groh. (2021).
|
||||
# “Exploiting Food Embeddings for Ingredient Substitution.”
|
||||
# In: Proceedings of the 14th International Joint Conference on Biomedical
|
||||
# Engineering Systems and Technologies - Volume 5: HEALTHINF, INSTICC.
|
||||
# SciTePress, pp. 67–77. isbn: 978-989-758-490-9. doi: 10.5220/0010202000670077.
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import joblib
|
||||
from sklearn.neighbors import NearestNeighbors
|
||||
|
||||
|
||||
class KNNClassifier:
    """Exact k-NN index over ingredient embeddings, backed by sklearn NearestNeighbors."""

    def __init__(self, all_ingredient_embeddings, max_embedding_count,
                 save_path=Path('data/eval/knn_classifier.joblib')):
        """Load a previously fitted classifier from save_path, or fit and save a new one.

        all_ingredient_embeddings: 2-D array-like, one row per embedding.
        max_embedding_count: embeddings kept per ingredient; used to size queries.
        """
        if save_path.exists():
            print('Loading Existing Classifier')
            self.knn_classifier: NearestNeighbors = joblib.load(save_path)
        else:
            print('Training New Classifier')
            # To make sure we don't just get ourselves: add max_embedding_count
            self.knn_classifier: NearestNeighbors = NearestNeighbors(n_neighbors=max_embedding_count + 200, n_jobs=12,
                                                                     algorithm='brute')  # kd_tree, ball_tree or brute
            self.knn_classifier.fit(all_ingredient_embeddings)

            print('Saving Classifier')
            joblib.dump(self.knn_classifier, save_path)

        print(f'\nKNN with: {self.knn_classifier._fit_method} and leaf size: {self.knn_classifier.leaf_size}\n')

    def k_nearest_neighbors(self, ingredient_embeddings):
        """Return (distances, indices) of the n_neighbors nearest items per query embedding."""
        distances, indices = self.knn_classifier.kneighbors(ingredient_embeddings, return_distance=True)

        return distances, indices
|
||||
53
evaluation/helpers/prediction_model.py
Normal file
53
evaluation/helpers/prediction_model.py
Normal file
@@ -0,0 +1,53 @@
|
||||
# adapted from:
|
||||
# Pellegrini., C., E. Özsoy., M. Wintergerst., and G. Groh. (2021).
|
||||
# “Exploiting Food Embeddings for Ingredient Substitution.”
|
||||
# In: Proceedings of the 14th International Joint Conference on Biomedical
|
||||
# Engineering Systems and Technologies - Volume 5: HEALTHINF, INSTICC.
|
||||
# SciTePress, pp. 67–77. isbn: 978-989-758-490-9. doi: 10.5220/0010202000670077.
|
||||
|
||||
import json
|
||||
|
||||
import torch
|
||||
from torch.utils.data import DataLoader
|
||||
from transformers import BertModel, BertTokenizer
|
||||
|
||||
from evaluation.helpers.instructions_dataset import InstructionsDataset
|
||||
|
||||
|
||||
class PredictionModel:
    """Wraps a fine-tuned FoodBERT model and tokenizer to produce token embeddings."""

    def __init__(self, model_path=''):
        self.model: BertModel = BertModel.from_pretrained(
            pretrained_model_name_or_path=model_path)
        # never_split keeps full ingredient tokens (e.g. "Rote_Bete") from being
        # broken apart by the tokenizer
        with open('train_model/vocab/used_ingredients.json', 'r') as f:
            used_ingredients = json.load(f)
        self.tokenizer = BertTokenizer(vocab_file='train_model/vocab/bert_vocab.txt', do_lower_case=False,
                                       max_len=512, never_split=used_ingredients, truncation=True)

        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

        self.model.to(self.device)

    def predict_embeddings(self, sentences):
        """Embed all sentences with the model.

        Returns (embeddings, ingredient_ids): the stacked model outputs for every
        sentence and the corresponding input-id batches, so callers can locate
        the position of a specific ingredient token within the embeddings.
        """
        dataset = InstructionsDataset(tokenizer=self.tokenizer, sentences=sentences)
        dataloader = DataLoader(dataset, batch_size=100, pin_memory=True)

        embeddings = []
        ingredient_ids = []
        for batch in dataloader:
            batch = batch.to(self.device)
            with torch.no_grad():  # inference only, no gradients needed
                embeddings_batch = self.model(batch)
            # embeddings_batch[0] is the first element of the model output
            # (presumably the last hidden state, shape (batch, seq_len, 768) -- TODO confirm)
            embeddings.extend(embeddings_batch[0])
            ingredient_ids.extend(batch)

        return torch.stack(embeddings), ingredient_ids

    def compute_embedding_for_ingredient(self, sentence, ingredient_name):
        """Return the embedding of the first occurrence of ingredient_name in sentence."""
        embeddings, ingredient_ids = self.predict_embeddings([sentence])
        # flatten token embeddings, then select positions matching the ingredient's id
        embeddings_flat = embeddings.view((-1, 768))
        ingredient_ids_flat = torch.stack(ingredient_ids).flatten()
        food_id = self.tokenizer.convert_tokens_to_ids(ingredient_name)
        food_embedding = embeddings_flat[ingredient_ids_flat == food_id].cpu().numpy()

        return food_embedding[0]
|
||||
166
evaluation/helpers/revise_substitutes.py
Normal file
166
evaluation/helpers/revise_substitutes.py
Normal file
@@ -0,0 +1,166 @@
|
||||
import json

# Paths to the German synonym map and the ground-truth substitute annotations.
synonyms_path = "data/synonyms.json"
ground_truth_path = "data/ground_truth.json"
# Base ingredients that act as broad categories; synonym resolution treats these
# specially so a specific substitute is not collapsed into its whole category.
category_subs = ["Paprika", "Apfel", "Gouda", "Huhn", "Gans", "Kaninchen", "Kalbfleisch", "Schweinefleisch", "Ente", "Lamm",
                 "Pute", "Wildfleisch", "Rindfleisch", "Speck", "Fisch", "Kohl", "Blattsalat", "Schokolade", "Kuvertüre", "Kuchenglasur",
                 "Honig", "Sirup", "Joghurt", "Essig", "Traubensaft", "Truthahn", "Wein"]

# Substitute predictions to revise (model vers3, threshold 50), loaded at import time.
with open("final_Versions/models/vers3/eval/complete_substitute_pairs_50.json") as f:
    sub_dict = json.load(f)
||||
|
||||
|
||||
def engl_combined_substitutes_dict(found_substitutes_dict):
    """Clean the English substitute predictions using the English synonym map.

    Removes self-substitutions and substitutes that are synonyms of their own
    base ingredient, and replaces every substitute that is a synonym by its
    unique base ingredient (ambiguous synonyms are printed and skipped).
    Returns a new dict ingredient -> cleaned substitute list.

    (The original also loaded revised_engl_ground_truth.json into a variable
    that was never used; that pointless file read was removed.)
    """
    with open("evaluation/engl_data/engl_synonyms.json", "r") as whole_json_file:
        synonyms_dict = json.load(whole_json_file)

    reversed_synonyms_dict = get_reversed_syn_dict(is_engl=True)

    new_found_sub_dict = {}
    for ingredient in found_substitutes_dict.keys():
        current_subs = set()
        for sub in found_substitutes_dict[ingredient]:
            # delete substitute if it is the same as the ingredient
            if sub == ingredient:
                continue

            # delete substitute if it is a synonym of the ingredient
            if ingredient in synonyms_dict.keys() and sub in synonyms_dict[ingredient]:
                continue

            # if substitute is a synonym of something, replace it by its base
            if sub in reversed_synonyms_dict.keys():
                if len(reversed_synonyms_dict[sub]) == 1:
                    current_subs.add(reversed_synonyms_dict[sub][0])
                else:
                    # ambiguous synonym: report and skip
                    print(sub + " is in " + str(reversed_synonyms_dict[sub]))
            else:
                current_subs.add(sub)

        new_found_sub_dict[ingredient] = list(current_subs)
    return new_found_sub_dict
|
||||
|
||||
|
||||
# merges substitutes with their synonyms, replaces synonyms with base synonym
|
||||
# merges substitutes with their synonyms, replaces synonyms with base synonym
def combined_substitutes_dict(found_substitutes_dict):
    """Clean the German substitute predictions using the synonym map.

    Removes self-substitutions and substitutes that are synonyms of their own
    base ingredient; replaces substitutes that are synonyms by a base
    ingredient, taking the category list into account (a substitute is only
    collapsed into a category base when the ingredient itself is a category).
    Returns a new dict ingredient -> cleaned substitute list.
    """
    with open(synonyms_path, "r") as whole_json_file:
        synonyms_dict = json.load(whole_json_file)
    # NOTE(review): ground_truth_dict is loaded here but never used in this
    # function -- confirm whether the read can be dropped.
    with open(ground_truth_path, "r") as whole_json_file:
        ground_truth_dict = json.load(whole_json_file)


    reversed_synonyms_dict = get_reversed_syn_dict()

    new_found_sub_dict = {}

    for ingredient in found_substitutes_dict.keys():
        new_found_sub_dict[ingredient] = []
        current_subs = set()
        for sub in found_substitutes_dict[ingredient]:
            # delete substitute if it is the same as the ingredient
            if sub == ingredient:
                # found_substitutes_dict = found_substitutes_dict[ingredient].remove[sub]
                continue

            # delete substitute if it is a synonym of the ingredient
            if ingredient in synonyms_dict.keys() and sub in synonyms_dict[ingredient]:
                # found_substitutes_dict = found_substitutes_dict[ingredient].remove[sub]
                continue

            # if substitute is a synonym of sth
            if sub in reversed_synonyms_dict.keys():
                if len(reversed_synonyms_dict[sub]) == 1:
                    # single base: use it unless it is a category the ingredient doesn't belong to
                    if reversed_synonyms_dict[sub][0] not in category_subs:
                        current_subs.add(reversed_synonyms_dict[sub][0])
                    else:
                        if ingredient in category_subs:
                            current_subs.add(reversed_synonyms_dict[sub][0])
                        else:
                            current_subs.add(sub)
                elif len(reversed_synonyms_dict[sub]) == 2:
                    # two candidate bases: pick the category base for category
                    # ingredients, the non-category base otherwise; print the
                    # unresolved cases
                    if ingredient in category_subs:
                        if reversed_synonyms_dict[sub][0] in category_subs and reversed_synonyms_dict[sub][1] not in category_subs:
                            current_subs.add(reversed_synonyms_dict[sub][0])
                        elif reversed_synonyms_dict[sub][1] in category_subs and reversed_synonyms_dict[sub][0] not in category_subs:
                            current_subs.add(reversed_synonyms_dict[sub][1])
                        else:
                            print(reversed_synonyms_dict[sub])
                    else:
                        if reversed_synonyms_dict[sub][0] in category_subs and reversed_synonyms_dict[sub][1] not in category_subs:
                            current_subs.add(reversed_synonyms_dict[sub][1])
                        elif reversed_synonyms_dict[sub][1] in category_subs and reversed_synonyms_dict[sub][0] not in category_subs:
                            current_subs.add(reversed_synonyms_dict[sub][0])
                        else:
                            print(reversed_synonyms_dict[sub])
                else:
                    # more than two bases: report and skip
                    print(sub + " is in " + str(reversed_synonyms_dict[sub]))

            else:
                current_subs.add(sub)

        new_found_sub_dict[ingredient] += list(current_subs)
    return new_found_sub_dict
|
||||
|
||||
|
||||
# combine substitutes found for an ingredient and its synonyms
|
||||
# also combine synonyms in substitutes
|
||||
def combine_all_synonyms(found_substitutes_dict):
    """Merge the substitutes found for an ingredient and all its synonyms.

    Every ingredient that is a (non-category) synonym contributes its
    substitutes to its base ingredient's entry; non-synonym ingredients keep
    their own entry. The merged dict is then cleaned via
    combined_substitutes_dict and returned.
    """
    reversed_synonyms_dict = get_reversed_syn_dict_no_cat()

    merged_subs = {}
    for ingredient in found_substitutes_dict.keys():
        if ingredient in reversed_synonyms_dict.keys():
            # fold this synonym's substitutes into its base ingredient
            target = reversed_synonyms_dict[ingredient][0]
        else:
            target = ingredient
        # setdefault guards against a base ingredient that only appears via its
        # synonyms (the original two-pass version raised KeyError in that case)
        merged_subs.setdefault(target, set()).update(found_substitutes_dict[ingredient])

    new_found_sub_dict_list = {}
    for ingredient in merged_subs.keys():
        new_found_sub_dict_list[ingredient] = list(merged_subs[ingredient])

    return combined_substitutes_dict(new_found_sub_dict_list)
|
||||
|
||||
|
||||
|
||||
def get_reversed_syn_dict(is_engl=False):
    """Build a reverse lookup: synonym -> list of base ingredients that list it.

    Reads the English synonym file when is_engl is True, the German one otherwise.
    """
    if is_engl:
        syn_file = "evaluation/engl_data/engl_synonyms.json"
    else:
        syn_file = synonyms_path
    with open(syn_file, "r") as whole_json_file:
        synonyms_dict = json.load(whole_json_file)

    reversed_synonyms_dict = {}
    for ingredient, synonyms in synonyms_dict.items():
        for syn in synonyms:
            reversed_synonyms_dict.setdefault(syn, []).append(ingredient)

    return reversed_synonyms_dict
|
||||
|
||||
def get_reversed_syn_dict_no_cat():
    """Build a synonym -> [base ingredient, ...] lookup, skipping category bases.

    Like get_reversed_syn_dict, but base ingredients listed in category_subs do
    not contribute their synonyms to the reverse map.
    """
    with open(synonyms_path, "r") as whole_json_file:
        synonyms_dict = json.load(whole_json_file)

    reversed_synonyms_dict = {}
    for ingredient, synonyms in synonyms_dict.items():
        if ingredient in category_subs:
            continue
        for syn in synonyms:
            reversed_synonyms_dict.setdefault(syn, []).append(ingredient)

    return reversed_synonyms_dict
|
||||
|
||||
|
||||
combined_substitutes_dict(sub_dict)
|
||||
42
evaluation/results/vers1/threshold50/engl_all_syn.json
Normal file
42
evaluation/results/vers1/threshold50/engl_all_syn.json
Normal file
@@ -0,0 +1,42 @@
|
||||
{
|
||||
"carrot": {
|
||||
"engl": 0,
|
||||
"ger": 0
|
||||
},
|
||||
"cherry": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"chicken": {
|
||||
"engl": 1,
|
||||
"ger": 4
|
||||
},
|
||||
"parsley": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"chocolate": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"bacon": {
|
||||
"engl": 2,
|
||||
"ger": 0
|
||||
},
|
||||
"kale": {
|
||||
"engl": 2,
|
||||
"ger": 0
|
||||
},
|
||||
"sugar": {
|
||||
"engl": 2,
|
||||
"ger": 0
|
||||
},
|
||||
"brie": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"turkey": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
}
|
||||
}
|
||||
42
evaluation/results/vers1/threshold50/engl_base_syn.json
Normal file
42
evaluation/results/vers1/threshold50/engl_base_syn.json
Normal file
@@ -0,0 +1,42 @@
|
||||
{
|
||||
"carrot": {
|
||||
"engl": 0,
|
||||
"ger": 0
|
||||
},
|
||||
"cherry": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"chicken": {
|
||||
"engl": 1,
|
||||
"ger": 0
|
||||
},
|
||||
"parsley": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"chocolate": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"bacon": {
|
||||
"engl": 2,
|
||||
"ger": 0
|
||||
},
|
||||
"kale": {
|
||||
"engl": 2,
|
||||
"ger": 0
|
||||
},
|
||||
"sugar": {
|
||||
"engl": 2,
|
||||
"ger": 0
|
||||
},
|
||||
"brie": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"turkey": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
}
|
||||
}
|
||||
42
evaluation/results/vers1/threshold50/engl_no_syn.json
Normal file
42
evaluation/results/vers1/threshold50/engl_no_syn.json
Normal file
@@ -0,0 +1,42 @@
|
||||
{
|
||||
"carrot": {
|
||||
"engl": 0,
|
||||
"ger": 0
|
||||
},
|
||||
"cherry": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"chicken": {
|
||||
"engl": 1,
|
||||
"ger": 0
|
||||
},
|
||||
"parsley": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"chocolate": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"bacon": {
|
||||
"engl": 2,
|
||||
"ger": 0
|
||||
},
|
||||
"kale": {
|
||||
"engl": 2,
|
||||
"ger": 0
|
||||
},
|
||||
"sugar": {
|
||||
"engl": 2,
|
||||
"ger": 0
|
||||
},
|
||||
"brie": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"turkey": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
}
|
||||
}
|
||||
42
evaluation/results/vers1/threshold50/engl_sub_syn.json
Normal file
42
evaluation/results/vers1/threshold50/engl_sub_syn.json
Normal file
@@ -0,0 +1,42 @@
|
||||
{
|
||||
"carrot": {
|
||||
"engl": 0,
|
||||
"ger": 0
|
||||
},
|
||||
"cherry": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"chicken": {
|
||||
"engl": 1,
|
||||
"ger": 0
|
||||
},
|
||||
"parsley": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"chocolate": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"bacon": {
|
||||
"engl": 2,
|
||||
"ger": 0
|
||||
},
|
||||
"kale": {
|
||||
"engl": 2,
|
||||
"ger": 0
|
||||
},
|
||||
"sugar": {
|
||||
"engl": 2,
|
||||
"ger": 0
|
||||
},
|
||||
"brie": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"turkey": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
}
|
||||
}
|
||||
1294
evaluation/results/vers1/threshold50/evaluate50.txt
Normal file
1294
evaluation/results/vers1/threshold50/evaluate50.txt
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,94 @@
|
||||
cap at 30 set to True
|
||||
english ingredients with over 30 substitutes: 22
|
||||
english nones: 11
|
||||
average amount of substitutes found for english ingredients: 6.417429094236048
|
||||
number of ingredients in dataset: 4313
|
||||
number of nones: 3035
|
||||
ingredients with over 30 substitutes: 71
|
||||
ingredients with over 100 substitutes: 0
|
||||
ingredients with over 1000 substitutes: 0
|
||||
average number of substitutes: 2.044284720612103
|
||||
|
||||
cap at 30 set to False
|
||||
english ingredients with over 30 substitutes: 22
|
||||
english nones: 11
|
||||
average amount of substitutes found for english ingredients: 6.440988106129917
|
||||
number of ingredients in dataset: 4313
|
||||
number of nones: 3035
|
||||
ingredients with over 30 substitutes: 71
|
||||
ingredients with over 100 substitutes: 0
|
||||
ingredients with over 1000 substitutes: 0
|
||||
average number of substitutes: 2.3382796197542315
|
||||
--------------------------------------------
|
||||
ground truth only:
|
||||
|
||||
cap at 30 set to True
|
||||
english ingredients with over 30 substitutes: 22
|
||||
english nones: 11
|
||||
average amount of substitutes found for english ingredients: 6.417429094236048
|
||||
number of ingredients in dataset: 40
|
||||
number of nones: 29
|
||||
ingredients with over 30 substitutes: 0
|
||||
ingredients with over 100 substitutes: 0
|
||||
ingredients with over 1000 substitutes: 0
|
||||
average number of substitutes: 2.75
|
||||
|
||||
cap at 30 set to False
|
||||
english ingredients with over 30 substitutes: 22
|
||||
english nones: 11
|
||||
average amount of substitutes found for english ingredients: 6.440988106129917
|
||||
number of ingredients in dataset: 40
|
||||
number of nones: 29
|
||||
ingredients with over 30 substitutes: 0
|
||||
ingredients with over 100 substitutes: 0
|
||||
ingredients with over 1000 substitutes: 0
|
||||
average number of substitutes: 2.75
|
||||
================================
|
||||
englisch:
|
||||
|
||||
cap at 30 set to True
|
||||
english ingredients with over 30 substitutes: 22
|
||||
english nones: 11
|
||||
average amount of substitutes found for english ingredients: 6.417429094236048
|
||||
number of ingredients in dataset: 4361
|
||||
number of nones: 0
|
||||
ingredients with over 30 substitutes: 22
|
||||
ingredients with over 100 substitutes: 0
|
||||
ingredients with over 1000 substitutes: 0
|
||||
average number of substitutes: 6.433616143086448
|
||||
|
||||
cap at 30 set to False
|
||||
english ingredients with over 30 substitutes: 22
|
||||
english nones: 11
|
||||
average amount of substitutes found for english ingredients: 6.440988106129917
|
||||
number of ingredients in dataset: 4361
|
||||
number of nones: 0
|
||||
ingredients with over 30 substitutes: 22
|
||||
ingredients with over 100 substitutes: 0
|
||||
ingredients with over 1000 substitutes: 0
|
||||
average number of substitutes: 6.457234579224949
|
||||
--------------------------------------------
|
||||
ground truth only:
|
||||
|
||||
cap at 30 set to True
|
||||
english ingredients with over 30 substitutes: 22
|
||||
english nones: 11
|
||||
average amount of substitutes found for english ingredients: 6.417429094236048
|
||||
number of ingredients in dataset: 10
|
||||
number of nones: 0
|
||||
ingredients with over 30 substitutes: 0
|
||||
ingredients with over 100 substitutes: 0
|
||||
ingredients with over 1000 substitutes: 0
|
||||
average number of substitutes: 2.7
|
||||
|
||||
cap at 30 set to False
|
||||
english ingredients with over 30 substitutes: 22
|
||||
english nones: 11
|
||||
average amount of substitutes found for english ingredients: 6.440988106129917
|
||||
number of ingredients in dataset: 10
|
||||
number of nones: 0
|
||||
ingredients with over 30 substitutes: 0
|
||||
ingredients with over 100 substitutes: 0
|
||||
ingredients with over 1000 substitutes: 0
|
||||
average number of substitutes: 2.7
|
||||
|
||||
42
evaluation/results/vers2/threshold50/engl_all_syn.json
Normal file
42
evaluation/results/vers2/threshold50/engl_all_syn.json
Normal file
@@ -0,0 +1,42 @@
|
||||
{
|
||||
"carrot": {
|
||||
"engl": 0,
|
||||
"ger": 0
|
||||
},
|
||||
"cherry": {
|
||||
"engl": 3,
|
||||
"ger": 2
|
||||
},
|
||||
"chicken": {
|
||||
"engl": 1,
|
||||
"ger": 6
|
||||
},
|
||||
"parsley": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"chocolate": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"bacon": {
|
||||
"engl": 2,
|
||||
"ger": 0
|
||||
},
|
||||
"kale": {
|
||||
"engl": 2,
|
||||
"ger": 0
|
||||
},
|
||||
"sugar": {
|
||||
"engl": 2,
|
||||
"ger": 1
|
||||
},
|
||||
"brie": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"turkey": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
}
|
||||
}
|
||||
42
evaluation/results/vers2/threshold50/engl_base_syn.json
Normal file
42
evaluation/results/vers2/threshold50/engl_base_syn.json
Normal file
@@ -0,0 +1,42 @@
|
||||
{
|
||||
"carrot": {
|
||||
"engl": 0,
|
||||
"ger": 0
|
||||
},
|
||||
"cherry": {
|
||||
"engl": 3,
|
||||
"ger": 1
|
||||
},
|
||||
"chicken": {
|
||||
"engl": 1,
|
||||
"ger": 0
|
||||
},
|
||||
"parsley": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"chocolate": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"bacon": {
|
||||
"engl": 2,
|
||||
"ger": 0
|
||||
},
|
||||
"kale": {
|
||||
"engl": 2,
|
||||
"ger": 0
|
||||
},
|
||||
"sugar": {
|
||||
"engl": 2,
|
||||
"ger": 0
|
||||
},
|
||||
"brie": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"turkey": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
}
|
||||
}
|
||||
42
evaluation/results/vers2/threshold50/engl_no_syn.json
Normal file
42
evaluation/results/vers2/threshold50/engl_no_syn.json
Normal file
@@ -0,0 +1,42 @@
|
||||
{
|
||||
"carrot": {
|
||||
"engl": 0,
|
||||
"ger": 0
|
||||
},
|
||||
"cherry": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"chicken": {
|
||||
"engl": 1,
|
||||
"ger": 0
|
||||
},
|
||||
"parsley": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"chocolate": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"bacon": {
|
||||
"engl": 2,
|
||||
"ger": 0
|
||||
},
|
||||
"kale": {
|
||||
"engl": 2,
|
||||
"ger": 0
|
||||
},
|
||||
"sugar": {
|
||||
"engl": 2,
|
||||
"ger": 0
|
||||
},
|
||||
"brie": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"turkey": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
}
|
||||
}
|
||||
42
evaluation/results/vers2/threshold50/engl_sub_syn.json
Normal file
42
evaluation/results/vers2/threshold50/engl_sub_syn.json
Normal file
@@ -0,0 +1,42 @@
|
||||
{
|
||||
"carrot": {
|
||||
"engl": 0,
|
||||
"ger": 0
|
||||
},
|
||||
"cherry": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"chicken": {
|
||||
"engl": 1,
|
||||
"ger": 3
|
||||
},
|
||||
"parsley": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"chocolate": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"bacon": {
|
||||
"engl": 2,
|
||||
"ger": 0
|
||||
},
|
||||
"kale": {
|
||||
"engl": 2,
|
||||
"ger": 0
|
||||
},
|
||||
"sugar": {
|
||||
"engl": 2,
|
||||
"ger": 0
|
||||
},
|
||||
"brie": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"turkey": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
}
|
||||
}
|
||||
1294
evaluation/results/vers2/threshold50/evaluate50.txt
Normal file
1294
evaluation/results/vers2/threshold50/evaluate50.txt
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,94 @@
|
||||
cap at 30 set to True
|
||||
english ingredients with over 30 substitutes: 22
|
||||
english nones: 11
|
||||
average amount of substitutes found for english ingredients: 6.417429094236048
|
||||
number of ingredients in dataset: 4313
|
||||
number of nones: 2294
|
||||
ingredients with over 30 substitutes: 272
|
||||
ingredients with over 100 substitutes: 10
|
||||
ingredients with over 1000 substitutes: 0
|
||||
average number of substitutes: 4.630883375840482
|
||||
|
||||
cap at 30 set to False
|
||||
english ingredients with over 30 substitutes: 22
|
||||
english nones: 11
|
||||
average amount of substitutes found for english ingredients: 6.440988106129917
|
||||
number of ingredients in dataset: 4313
|
||||
number of nones: 2294
|
||||
ingredients with over 30 substitutes: 272
|
||||
ingredients with over 100 substitutes: 10
|
||||
ingredients with over 1000 substitutes: 0
|
||||
average number of substitutes: 6.102480871782982
|
||||
--------------------------------------------
|
||||
ground truth only:
|
||||
|
||||
cap at 30 set to True
|
||||
english ingredients with over 30 substitutes: 22
|
||||
english nones: 11
|
||||
average amount of substitutes found for english ingredients: 6.417429094236048
|
||||
number of ingredients in dataset: 40
|
||||
number of nones: 21
|
||||
ingredients with over 30 substitutes: 1
|
||||
ingredients with over 100 substitutes: 0
|
||||
ingredients with over 1000 substitutes: 0
|
||||
average number of substitutes: 5.375
|
||||
|
||||
cap at 30 set to False
|
||||
english ingredients with over 30 substitutes: 22
|
||||
english nones: 11
|
||||
average amount of substitutes found for english ingredients: 6.440988106129917
|
||||
number of ingredients in dataset: 40
|
||||
number of nones: 21
|
||||
ingredients with over 30 substitutes: 1
|
||||
ingredients with over 100 substitutes: 0
|
||||
ingredients with over 1000 substitutes: 0
|
||||
average number of substitutes: 5.6
|
||||
================================
|
||||
englisch:
|
||||
|
||||
cap at 30 set to True
|
||||
english ingredients with over 30 substitutes: 22
|
||||
english nones: 11
|
||||
average amount of substitutes found for english ingredients: 6.417429094236048
|
||||
number of ingredients in dataset: 4361
|
||||
number of nones: 0
|
||||
ingredients with over 30 substitutes: 22
|
||||
ingredients with over 100 substitutes: 0
|
||||
ingredients with over 1000 substitutes: 0
|
||||
average number of substitutes: 6.433616143086448
|
||||
|
||||
cap at 30 set to False
|
||||
english ingredients with over 30 substitutes: 22
|
||||
english nones: 11
|
||||
average amount of substitutes found for english ingredients: 6.440988106129917
|
||||
number of ingredients in dataset: 4361
|
||||
number of nones: 0
|
||||
ingredients with over 30 substitutes: 22
|
||||
ingredients with over 100 substitutes: 0
|
||||
ingredients with over 1000 substitutes: 0
|
||||
average number of substitutes: 6.457234579224949
|
||||
--------------------------------------------
|
||||
ground truth only:
|
||||
|
||||
cap at 30 set to True
|
||||
english ingredients with over 30 substitutes: 22
|
||||
english nones: 11
|
||||
average amount of substitutes found for english ingredients: 6.417429094236048
|
||||
number of ingredients in dataset: 10
|
||||
number of nones: 0
|
||||
ingredients with over 30 substitutes: 0
|
||||
ingredients with over 100 substitutes: 0
|
||||
ingredients with over 1000 substitutes: 0
|
||||
average number of substitutes: 2.7
|
||||
|
||||
cap at 30 set to False
|
||||
english ingredients with over 30 substitutes: 22
|
||||
english nones: 11
|
||||
average amount of substitutes found for english ingredients: 6.440988106129917
|
||||
number of ingredients in dataset: 10
|
||||
number of nones: 0
|
||||
ingredients with over 30 substitutes: 0
|
||||
ingredients with over 100 substitutes: 0
|
||||
ingredients with over 1000 substitutes: 0
|
||||
average number of substitutes: 2.7
|
||||
|
||||
42
evaluation/results/vers3/threshold50/engl_all_syn.json
Normal file
42
evaluation/results/vers3/threshold50/engl_all_syn.json
Normal file
@@ -0,0 +1,42 @@
|
||||
{
|
||||
"carrot": {
|
||||
"engl": 0,
|
||||
"ger": 3
|
||||
},
|
||||
"cherry": {
|
||||
"engl": 3,
|
||||
"ger": 4
|
||||
},
|
||||
"chicken": {
|
||||
"engl": 1,
|
||||
"ger": 5
|
||||
},
|
||||
"parsley": {
|
||||
"engl": 3,
|
||||
"ger": 8
|
||||
},
|
||||
"chocolate": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"bacon": {
|
||||
"engl": 2,
|
||||
"ger": 4
|
||||
},
|
||||
"kale": {
|
||||
"engl": 2,
|
||||
"ger": 7
|
||||
},
|
||||
"sugar": {
|
||||
"engl": 2,
|
||||
"ger": 0
|
||||
},
|
||||
"brie": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"turkey": {
|
||||
"engl": 3,
|
||||
"ger": 4
|
||||
}
|
||||
}
|
||||
42
evaluation/results/vers3/threshold50/engl_base_syn.json
Normal file
42
evaluation/results/vers3/threshold50/engl_base_syn.json
Normal file
@@ -0,0 +1,42 @@
|
||||
{
|
||||
"carrot": {
|
||||
"engl": 0,
|
||||
"ger": 3
|
||||
},
|
||||
"cherry": {
|
||||
"engl": 3,
|
||||
"ger": 3
|
||||
},
|
||||
"chicken": {
|
||||
"engl": 1,
|
||||
"ger": 0
|
||||
},
|
||||
"parsley": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"chocolate": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"bacon": {
|
||||
"engl": 2,
|
||||
"ger": 3
|
||||
},
|
||||
"kale": {
|
||||
"engl": 2,
|
||||
"ger": 6
|
||||
},
|
||||
"sugar": {
|
||||
"engl": 2,
|
||||
"ger": 0
|
||||
},
|
||||
"brie": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"turkey": {
|
||||
"engl": 3,
|
||||
"ger": 1
|
||||
}
|
||||
}
|
||||
42
evaluation/results/vers3/threshold50/engl_no_syn.json
Normal file
42
evaluation/results/vers3/threshold50/engl_no_syn.json
Normal file
@@ -0,0 +1,42 @@
|
||||
{
|
||||
"carrot": {
|
||||
"engl": 0,
|
||||
"ger": 0
|
||||
},
|
||||
"cherry": {
|
||||
"engl": 3,
|
||||
"ger": 1
|
||||
},
|
||||
"chicken": {
|
||||
"engl": 1,
|
||||
"ger": 2
|
||||
},
|
||||
"parsley": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"chocolate": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"bacon": {
|
||||
"engl": 2,
|
||||
"ger": 1
|
||||
},
|
||||
"kale": {
|
||||
"engl": 2,
|
||||
"ger": 6
|
||||
},
|
||||
"sugar": {
|
||||
"engl": 2,
|
||||
"ger": 0
|
||||
},
|
||||
"brie": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"turkey": {
|
||||
"engl": 3,
|
||||
"ger": 1
|
||||
}
|
||||
}
|
||||
42
evaluation/results/vers3/threshold50/engl_sub_syn.json
Normal file
42
evaluation/results/vers3/threshold50/engl_sub_syn.json
Normal file
@@ -0,0 +1,42 @@
|
||||
{
|
||||
"carrot": {
|
||||
"engl": 0,
|
||||
"ger": 0
|
||||
},
|
||||
"cherry": {
|
||||
"engl": 3,
|
||||
"ger": 2
|
||||
},
|
||||
"chicken": {
|
||||
"engl": 1,
|
||||
"ger": 5
|
||||
},
|
||||
"parsley": {
|
||||
"engl": 3,
|
||||
"ger": 2
|
||||
},
|
||||
"chocolate": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"bacon": {
|
||||
"engl": 2,
|
||||
"ger": 2
|
||||
},
|
||||
"kale": {
|
||||
"engl": 2,
|
||||
"ger": 7
|
||||
},
|
||||
"sugar": {
|
||||
"engl": 2,
|
||||
"ger": 0
|
||||
},
|
||||
"brie": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"turkey": {
|
||||
"engl": 3,
|
||||
"ger": 4
|
||||
}
|
||||
}
|
||||
1294
evaluation/results/vers3/threshold50/evaluate50.txt
Normal file
1294
evaluation/results/vers3/threshold50/evaluate50.txt
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,93 @@
|
||||
cap at 30 set to True
|
||||
english ingredients with over 30 substitutes: 22
|
||||
english nones: 11
|
||||
average amount of substitutes found for english ingredients: 6.417429094236048
|
||||
number of ingredients in dataset: 4313
|
||||
number of nones: 1604
|
||||
ingredients with over 30 substitutes: 573
|
||||
ingredients with over 100 substitutes: 4
|
||||
ingredients with over 1000 substitutes: 0
|
||||
average number of substitutes: 8.59169951309993
|
||||
|
||||
cap at 30 set to False
|
||||
english ingredients with over 30 substitutes: 22
|
||||
english nones: 11
|
||||
average amount of substitutes found for english ingredients: 6.440988106129917
|
||||
number of ingredients in dataset: 4313
|
||||
number of nones: 1604
|
||||
ingredients with over 30 substitutes: 573
|
||||
ingredients with over 100 substitutes: 4
|
||||
ingredients with over 1000 substitutes: 0
|
||||
average number of substitutes: 11.539067934152563
|
||||
--------------------------------------------
|
||||
ground truth only:
|
||||
|
||||
cap at 30 set to True
|
||||
english ingredients with over 30 substitutes: 22
|
||||
english nones: 11
|
||||
average amount of substitutes found for english ingredients: 6.417429094236048
|
||||
number of ingredients in dataset: 40
|
||||
number of nones: 3
|
||||
ingredients with over 30 substitutes: 14
|
||||
ingredients with over 100 substitutes: 0
|
||||
ingredients with over 1000 substitutes: 0
|
||||
average number of substitutes: 16.8
|
||||
|
||||
cap at 30 set to False
|
||||
english ingredients with over 30 substitutes: 22
|
||||
english nones: 11
|
||||
average amount of substitutes found for english ingredients: 6.440988106129917
|
||||
number of ingredients in dataset: 40
|
||||
number of nones: 3
|
||||
ingredients with over 30 substitutes: 14
|
||||
ingredients with over 100 substitutes: 0
|
||||
ingredients with over 1000 substitutes: 0
|
||||
average number of substitutes: 23.25
|
||||
================================
|
||||
englisch:
|
||||
|
||||
cap at 30 set to True
|
||||
english ingredients with over 30 substitutes: 22
|
||||
english nones: 11
|
||||
average amount of substitutes found for english ingredients: 6.417429094236048
|
||||
number of ingredients in dataset: 4361
|
||||
number of nones: 0
|
||||
ingredients with over 30 substitutes: 22
|
||||
ingredients with over 100 substitutes: 0
|
||||
ingredients with over 1000 substitutes: 0
|
||||
average number of substitutes: 6.433616143086448
|
||||
|
||||
cap at 30 set to False
|
||||
english ingredients with over 30 substitutes: 22
|
||||
english nones: 11
|
||||
average amount of substitutes found for english ingredients: 6.440988106129917
|
||||
number of ingredients in dataset: 4361
|
||||
number of nones: 0
|
||||
ingredients with over 30 substitutes: 22
|
||||
ingredients with over 100 substitutes: 0
|
||||
ingredients with over 1000 substitutes: 0
|
||||
average number of substitutes: 6.457234579224949
|
||||
--------------------------------------------
|
||||
ground truth only:
|
||||
|
||||
cap at 30 set to True
|
||||
english ingredients with over 30 substitutes: 22
|
||||
english nones: 11
|
||||
average amount of substitutes found for english ingredients: 6.417429094236048
|
||||
number of ingredients in dataset: 10
|
||||
number of nones: 0
|
||||
ingredients with over 30 substitutes: 0
|
||||
ingredients with over 100 substitutes: 0
|
||||
ingredients with over 1000 substitutes: 0
|
||||
average number of substitutes: 2.7
|
||||
|
||||
cap at 30 set to False
|
||||
english ingredients with over 30 substitutes: 22
|
||||
english nones: 11
|
||||
average amount of substitutes found for english ingredients: 6.440988106129917
|
||||
number of ingredients in dataset: 10
|
||||
number of nones: 0
|
||||
ingredients with over 30 substitutes: 0
|
||||
ingredients with over 100 substitutes: 0
|
||||
ingredients with over 1000 substitutes: 0
|
||||
average number of substitutes: 2.7
|
||||
42
evaluation/results/vers3/threshold65/engl_all_syn.json
Normal file
42
evaluation/results/vers3/threshold65/engl_all_syn.json
Normal file
@@ -0,0 +1,42 @@
|
||||
{
|
||||
"carrot": {
|
||||
"engl": 0,
|
||||
"ger": 3
|
||||
},
|
||||
"cherry": {
|
||||
"engl": 3,
|
||||
"ger": 4
|
||||
},
|
||||
"chicken": {
|
||||
"engl": 1,
|
||||
"ger": 5
|
||||
},
|
||||
"parsley": {
|
||||
"engl": 3,
|
||||
"ger": 5
|
||||
},
|
||||
"chocolate": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"bacon": {
|
||||
"engl": 2,
|
||||
"ger": 1
|
||||
},
|
||||
"kale": {
|
||||
"engl": 2,
|
||||
"ger": 2
|
||||
},
|
||||
"sugar": {
|
||||
"engl": 2,
|
||||
"ger": 0
|
||||
},
|
||||
"brie": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"turkey": {
|
||||
"engl": 3,
|
||||
"ger": 3
|
||||
}
|
||||
}
|
||||
42
evaluation/results/vers3/threshold65/engl_base_syn.json
Normal file
42
evaluation/results/vers3/threshold65/engl_base_syn.json
Normal file
@@ -0,0 +1,42 @@
|
||||
{
|
||||
"carrot": {
|
||||
"engl": 0,
|
||||
"ger": 3
|
||||
},
|
||||
"cherry": {
|
||||
"engl": 3,
|
||||
"ger": 3
|
||||
},
|
||||
"chicken": {
|
||||
"engl": 1,
|
||||
"ger": 0
|
||||
},
|
||||
"parsley": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"chocolate": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"bacon": {
|
||||
"engl": 2,
|
||||
"ger": 0
|
||||
},
|
||||
"kale": {
|
||||
"engl": 2,
|
||||
"ger": 1
|
||||
},
|
||||
"sugar": {
|
||||
"engl": 2,
|
||||
"ger": 0
|
||||
},
|
||||
"brie": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"turkey": {
|
||||
"engl": 3,
|
||||
"ger": 1
|
||||
}
|
||||
}
|
||||
42
evaluation/results/vers3/threshold65/engl_no_syn.json
Normal file
42
evaluation/results/vers3/threshold65/engl_no_syn.json
Normal file
@@ -0,0 +1,42 @@
|
||||
{
|
||||
"carrot": {
|
||||
"engl": 0,
|
||||
"ger": 0
|
||||
},
|
||||
"cherry": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"chicken": {
|
||||
"engl": 1,
|
||||
"ger": 0
|
||||
},
|
||||
"parsley": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"chocolate": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"bacon": {
|
||||
"engl": 2,
|
||||
"ger": 0
|
||||
},
|
||||
"kale": {
|
||||
"engl": 2,
|
||||
"ger": 1
|
||||
},
|
||||
"sugar": {
|
||||
"engl": 2,
|
||||
"ger": 0
|
||||
},
|
||||
"brie": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"turkey": {
|
||||
"engl": 3,
|
||||
"ger": 1
|
||||
}
|
||||
}
|
||||
42
evaluation/results/vers3/threshold65/engl_sub_syn.json
Normal file
42
evaluation/results/vers3/threshold65/engl_sub_syn.json
Normal file
@@ -0,0 +1,42 @@
|
||||
{
|
||||
"carrot": {
|
||||
"engl": 0,
|
||||
"ger": 0
|
||||
},
|
||||
"cherry": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"chicken": {
|
||||
"engl": 1,
|
||||
"ger": 5
|
||||
},
|
||||
"parsley": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"chocolate": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"bacon": {
|
||||
"engl": 2,
|
||||
"ger": 0
|
||||
},
|
||||
"kale": {
|
||||
"engl": 2,
|
||||
"ger": 2
|
||||
},
|
||||
"sugar": {
|
||||
"engl": 2,
|
||||
"ger": 0
|
||||
},
|
||||
"brie": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"turkey": {
|
||||
"engl": 3,
|
||||
"ger": 3
|
||||
}
|
||||
}
|
||||
1293
evaluation/results/vers3/threshold65/evaluate65.txt
Normal file
1293
evaluation/results/vers3/threshold65/evaluate65.txt
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,94 @@
|
||||
cap at 30 set to True
|
||||
english ingredients with over 30 substitutes: 22
|
||||
english nones: 11
|
||||
average amount of substitutes found for english ingredients: 6.417429094236048
|
||||
number of ingredients in dataset: 4313
|
||||
number of nones: 2353
|
||||
ingredients with over 30 substitutes: 231
|
||||
ingredients with over 100 substitutes: 0
|
||||
ingredients with over 1000 substitutes: 0
|
||||
average number of substitutes: 4.939485277069325
|
||||
|
||||
cap at 30 set to False
|
||||
english ingredients with over 30 substitutes: 22
|
||||
english nones: 11
|
||||
average amount of substitutes found for english ingredients: 6.440988106129917
|
||||
number of ingredients in dataset: 4313
|
||||
number of nones: 2353
|
||||
ingredients with over 30 substitutes: 231
|
||||
ingredients with over 100 substitutes: 0
|
||||
ingredients with over 1000 substitutes: 0
|
||||
average number of substitutes: 5.935311847901692
|
||||
--------------------------------------------
|
||||
ground truth only:
|
||||
|
||||
cap at 30 set to True
|
||||
english ingredients with over 30 substitutes: 22
|
||||
english nones: 11
|
||||
average amount of substitutes found for english ingredients: 6.417429094236048
|
||||
number of ingredients in dataset: 40
|
||||
number of nones: 8
|
||||
ingredients with over 30 substitutes: 5
|
||||
ingredients with over 100 substitutes: 0
|
||||
ingredients with over 1000 substitutes: 0
|
||||
average number of substitutes: 11.95
|
||||
|
||||
cap at 30 set to False
|
||||
english ingredients with over 30 substitutes: 22
|
||||
english nones: 11
|
||||
average amount of substitutes found for english ingredients: 6.440988106129917
|
||||
number of ingredients in dataset: 40
|
||||
number of nones: 8
|
||||
ingredients with over 30 substitutes: 5
|
||||
ingredients with over 100 substitutes: 0
|
||||
ingredients with over 1000 substitutes: 0
|
||||
average number of substitutes: 13.4
|
||||
================================
|
||||
englisch:
|
||||
|
||||
cap at 30 set to True
|
||||
english ingredients with over 30 substitutes: 22
|
||||
english nones: 11
|
||||
average amount of substitutes found for english ingredients: 6.417429094236048
|
||||
number of ingredients in dataset: 4361
|
||||
number of nones: 0
|
||||
ingredients with over 30 substitutes: 22
|
||||
ingredients with over 100 substitutes: 0
|
||||
ingredients with over 1000 substitutes: 0
|
||||
average number of substitutes: 6.433616143086448
|
||||
|
||||
cap at 30 set to False
|
||||
english ingredients with over 30 substitutes: 22
|
||||
english nones: 11
|
||||
average amount of substitutes found for english ingredients: 6.440988106129917
|
||||
number of ingredients in dataset: 4361
|
||||
number of nones: 0
|
||||
ingredients with over 30 substitutes: 22
|
||||
ingredients with over 100 substitutes: 0
|
||||
ingredients with over 1000 substitutes: 0
|
||||
average number of substitutes: 6.457234579224949
|
||||
--------------------------------------------
|
||||
ground truth only:
|
||||
|
||||
cap at 30 set to True
|
||||
english ingredients with over 30 substitutes: 22
|
||||
english nones: 11
|
||||
average amount of substitutes found for english ingredients: 6.417429094236048
|
||||
number of ingredients in dataset: 10
|
||||
number of nones: 0
|
||||
ingredients with over 30 substitutes: 0
|
||||
ingredients with over 100 substitutes: 0
|
||||
ingredients with over 1000 substitutes: 0
|
||||
average number of substitutes: 2.7
|
||||
|
||||
cap at 30 set to False
|
||||
english ingredients with over 30 substitutes: 22
|
||||
english nones: 11
|
||||
average amount of substitutes found for english ingredients: 6.440988106129917
|
||||
number of ingredients in dataset: 10
|
||||
number of nones: 0
|
||||
ingredients with over 30 substitutes: 0
|
||||
ingredients with over 100 substitutes: 0
|
||||
ingredients with over 1000 substitutes: 0
|
||||
average number of substitutes: 2.7
|
||||
|
||||
42
evaluation/results/vers3/threshold85/engl_all_syn.json
Normal file
42
evaluation/results/vers3/threshold85/engl_all_syn.json
Normal file
@@ -0,0 +1,42 @@
|
||||
{
|
||||
"carrot": {
|
||||
"engl": 0,
|
||||
"ger": 1
|
||||
},
|
||||
"cherry": {
|
||||
"engl": 3,
|
||||
"ger": 4
|
||||
},
|
||||
"chicken": {
|
||||
"engl": 1,
|
||||
"ger": 5
|
||||
},
|
||||
"parsley": {
|
||||
"engl": 3,
|
||||
"ger": 3
|
||||
},
|
||||
"chocolate": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"bacon": {
|
||||
"engl": 2,
|
||||
"ger": 0
|
||||
},
|
||||
"kale": {
|
||||
"engl": 2,
|
||||
"ger": 0
|
||||
},
|
||||
"sugar": {
|
||||
"engl": 2,
|
||||
"ger": 0
|
||||
},
|
||||
"brie": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"turkey": {
|
||||
"engl": 3,
|
||||
"ger": 1
|
||||
}
|
||||
}
|
||||
42
evaluation/results/vers3/threshold85/engl_base_syn.json
Normal file
42
evaluation/results/vers3/threshold85/engl_base_syn.json
Normal file
@@ -0,0 +1,42 @@
|
||||
{
|
||||
"carrot": {
|
||||
"engl": 0,
|
||||
"ger": 0
|
||||
},
|
||||
"cherry": {
|
||||
"engl": 3,
|
||||
"ger": 2
|
||||
},
|
||||
"chicken": {
|
||||
"engl": 1,
|
||||
"ger": 0
|
||||
},
|
||||
"parsley": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"chocolate": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"bacon": {
|
||||
"engl": 2,
|
||||
"ger": 0
|
||||
},
|
||||
"kale": {
|
||||
"engl": 2,
|
||||
"ger": 0
|
||||
},
|
||||
"sugar": {
|
||||
"engl": 2,
|
||||
"ger": 0
|
||||
},
|
||||
"brie": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"turkey": {
|
||||
"engl": 3,
|
||||
"ger": 1
|
||||
}
|
||||
}
|
||||
42
evaluation/results/vers3/threshold85/engl_no_syn.json
Normal file
42
evaluation/results/vers3/threshold85/engl_no_syn.json
Normal file
@@ -0,0 +1,42 @@
|
||||
{
|
||||
"carrot": {
|
||||
"engl": 0,
|
||||
"ger": 0
|
||||
},
|
||||
"cherry": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"chicken": {
|
||||
"engl": 1,
|
||||
"ger": 0
|
||||
},
|
||||
"parsley": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"chocolate": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"bacon": {
|
||||
"engl": 2,
|
||||
"ger": 0
|
||||
},
|
||||
"kale": {
|
||||
"engl": 2,
|
||||
"ger": 0
|
||||
},
|
||||
"sugar": {
|
||||
"engl": 2,
|
||||
"ger": 0
|
||||
},
|
||||
"brie": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"turkey": {
|
||||
"engl": 3,
|
||||
"ger": 1
|
||||
}
|
||||
}
|
||||
42
evaluation/results/vers3/threshold85/engl_sub_syn.json
Normal file
42
evaluation/results/vers3/threshold85/engl_sub_syn.json
Normal file
@@ -0,0 +1,42 @@
|
||||
{
|
||||
"carrot": {
|
||||
"engl": 0,
|
||||
"ger": 0
|
||||
},
|
||||
"cherry": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"chicken": {
|
||||
"engl": 1,
|
||||
"ger": 0
|
||||
},
|
||||
"parsley": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"chocolate": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"bacon": {
|
||||
"engl": 2,
|
||||
"ger": 0
|
||||
},
|
||||
"kale": {
|
||||
"engl": 2,
|
||||
"ger": 0
|
||||
},
|
||||
"sugar": {
|
||||
"engl": 2,
|
||||
"ger": 0
|
||||
},
|
||||
"brie": {
|
||||
"engl": 3,
|
||||
"ger": 0
|
||||
},
|
||||
"turkey": {
|
||||
"engl": 3,
|
||||
"ger": 1
|
||||
}
|
||||
}
|
||||
1293
evaluation/results/vers3/threshold85/evaluate85.txt
Normal file
1293
evaluation/results/vers3/threshold85/evaluate85.txt
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,94 @@
|
||||
cap at 30 set to True
|
||||
english ingredients with over 30 substitutes: 22
|
||||
english nones: 11
|
||||
average amount of substitutes found for english ingredients: 6.417429094236048
|
||||
number of ingredients in dataset: 4313
|
||||
number of nones: 2996
|
||||
ingredients with over 30 substitutes: 100
|
||||
ingredients with over 100 substitutes: 0
|
||||
ingredients with over 1000 substitutes: 0
|
||||
average number of substitutes: 2.6978900996985855
|
||||
|
||||
cap at 30 set to False
|
||||
english ingredients with over 30 substitutes: 22
|
||||
english nones: 11
|
||||
average amount of substitutes found for english ingredients: 6.440988106129917
|
||||
number of ingredients in dataset: 4313
|
||||
number of nones: 2996
|
||||
ingredients with over 30 substitutes: 100
|
||||
ingredients with over 100 substitutes: 0
|
||||
ingredients with over 1000 substitutes: 0
|
||||
average number of substitutes: 2.9476002782286113
|
||||
--------------------------------------------
|
||||
ground truth only:
|
||||
|
||||
cap at 30 set to True
|
||||
english ingredients with over 30 substitutes: 22
|
||||
english nones: 11
|
||||
average amount of substitutes found for english ingredients: 6.417429094236048
|
||||
number of ingredients in dataset: 40
|
||||
number of nones: 14
|
||||
ingredients with over 30 substitutes: 2
|
||||
ingredients with over 100 substitutes: 0
|
||||
ingredients with over 1000 substitutes: 0
|
||||
average number of substitutes: 7.45
|
||||
|
||||
cap at 30 set to False
|
||||
english ingredients with over 30 substitutes: 22
|
||||
english nones: 11
|
||||
average amount of substitutes found for english ingredients: 6.440988106129917
|
||||
number of ingredients in dataset: 40
|
||||
number of nones: 14
|
||||
ingredients with over 30 substitutes: 2
|
||||
ingredients with over 100 substitutes: 0
|
||||
ingredients with over 1000 substitutes: 0
|
||||
average number of substitutes: 7.65
|
||||
================================
|
||||
englisch:
|
||||
|
||||
cap at 30 set to True
|
||||
english ingredients with over 30 substitutes: 22
|
||||
english nones: 11
|
||||
average amount of substitutes found for english ingredients: 6.417429094236048
|
||||
number of ingredients in dataset: 4361
|
||||
number of nones: 0
|
||||
ingredients with over 30 substitutes: 22
|
||||
ingredients with over 100 substitutes: 0
|
||||
ingredients with over 1000 substitutes: 0
|
||||
average number of substitutes: 6.433616143086448
|
||||
|
||||
cap at 30 set to False
|
||||
english ingredients with over 30 substitutes: 22
|
||||
english nones: 11
|
||||
average amount of substitutes found for english ingredients: 6.440988106129917
|
||||
number of ingredients in dataset: 4361
|
||||
number of nones: 0
|
||||
ingredients with over 30 substitutes: 22
|
||||
ingredients with over 100 substitutes: 0
|
||||
ingredients with over 1000 substitutes: 0
|
||||
average number of substitutes: 6.457234579224949
|
||||
--------------------------------------------
|
||||
ground truth only:
|
||||
|
||||
cap at 30 set to True
|
||||
english ingredients with over 30 substitutes: 22
|
||||
english nones: 11
|
||||
average amount of substitutes found for english ingredients: 6.417429094236048
|
||||
number of ingredients in dataset: 10
|
||||
number of nones: 0
|
||||
ingredients with over 30 substitutes: 0
|
||||
ingredients with over 100 substitutes: 0
|
||||
ingredients with over 1000 substitutes: 0
|
||||
average number of substitutes: 2.7
|
||||
|
||||
cap at 30 set to False
|
||||
english ingredients with over 30 substitutes: 22
|
||||
english nones: 11
|
||||
average amount of substitutes found for english ingredients: 6.440988106129917
|
||||
number of ingredients in dataset: 10
|
||||
number of nones: 0
|
||||
ingredients with over 30 substitutes: 0
|
||||
ingredients with over 100 substitutes: 0
|
||||
ingredients with over 1000 substitutes: 0
|
||||
average number of substitutes: 2.7
|
||||
|
||||
207
evaluation/stats_engl_substitutes_compare.py
Normal file
207
evaluation/stats_engl_substitutes_compare.py
Normal file
@@ -0,0 +1,207 @@
|
||||
from transformers import BertTokenizer
|
||||
import json
|
||||
|
||||
|
||||
def print_stats(model_substitutes_dict, cap_at_30, total_english_ingredients=4372):
    """Print substitute-count statistics for a substitutes dictionary.

    Reads the English reference pairs from
    ``evaluation/engl_data/substitute_pairs_foodbert_text.json`` and prints
    their statistics first, then prints statistics for the supplied dict.

    Args:
        model_substitutes_dict: mapping of ingredient -> list of substitutes.
        cap_at_30: when True, any ingredient with more than 30 substitutes
            contributes only 30 to the average (outliers are capped).
        total_english_ingredients: denominator used for the English averages
            and the "nones" count; defaults to the historical constant 4372.

    Side effects:
        Prints to stdout and opens the English reference JSON file.
    """
    print("\ncap at 30 set to " + str(cap_at_30))
    evaluation_path = "evaluation/"

    # English reference data: a JSON list of [ingredient, substitute] pairs.
    with open(evaluation_path + "engl_data/substitute_pairs_foodbert_text.json", "r") as whole_json_file:
        engl_list = json.load(whole_json_file)

    # Group the pair list into ingredient -> list-of-substitutes.
    engl_dict = {}
    for pair in engl_list:
        engl_dict.setdefault(pair[0], []).append(pair[1])

    substitute_sum = 0
    over30 = 0
    for substitutes in engl_dict.values():
        curr_nr = len(substitutes)
        if curr_nr > 30:
            over30 += 1
            # Capped mode counts outliers as exactly 30; uncapped counts all.
            substitute_sum += 30 if cap_at_30 else curr_nr
        else:
            substitute_sum += curr_nr
    print("english ingredients with over 30 substitutes: " + str(over30))
    # Ingredients with no substitute pair at all never appear in engl_dict.
    print("english nones: " + str(total_english_ingredients - len(engl_dict.keys())))
    print("average amount of substitutes found for english ingredients: "
          + str(substitute_sum / total_english_ingredients))

    substitute_sum = 0
    over100 = 0
    over1000 = 0
    over30 = 0
    nones = 0

    for substitutes in model_substitutes_dict.values():
        curr_nr = len(substitutes)
        if curr_nr == 0:
            nones += 1
        if curr_nr > 100:
            over100 += 1
        if curr_nr > 1000:
            over1000 += 1
        if curr_nr > 30:
            over30 += 1
            substitute_sum += 30 if cap_at_30 else curr_nr
        else:
            substitute_sum += curr_nr

    print("number of ingredients in dataset: " + str(len(model_substitutes_dict.keys())))
    print("number of nones: " + str(nones))
    print("ingredients with over 30 substitutes: " + str(over30))
    print("ingredients with over 100 substitutes: " + str(over100))
    print("ingredients with over 1000 substitutes: " + str(over1000))
    # Guard against an empty input dict (original code would raise
    # ZeroDivisionError here).
    ingredient_count = len(model_substitutes_dict.keys())
    if ingredient_count:
        print("average number of substitutes: " + str(substitute_sum / ingredient_count))
    else:
        print("average number of substitutes: 0 (empty input)")
|
||||
|
||||
|
||||
def main() -> None:
    """Merge model-found substitutes via the synonym table, then print
    statistics (full set, ground-truth-only subset, and the English data).

    Reads: the completed substitute pairs JSON, ``data/synonyms.json``,
    ``data/ground_truth.json`` and the English evaluation JSONs.
    Output goes to stdout via print_stats.
    """
    # Leftover tokenizer experiment, kept for reference.
    # with open("train_model/vocab/used_ingredients.json", "r") as used_ingredients_file:
    # used_ingredients = json.load(used_ingredients_file)
    # tokenizer = BertTokenizer(vocab_file='train_model/vocab/vocab.txt', do_lower_case=False, model_max_length=512,
    # never_split=used_ingredients)
    #
    # sent = ["Die Paprika schneiden. Dann die Stücke kochen."]
    #
    # batch_encoding = tokenizer.batch_encode_plus(sent, add_special_tokens=True, max_length=512, truncation=True)
    #
    # # Get the input IDs and attention mask in tensor format
    # input_ids = batch_encoding['input_ids']
    # attn_mask = batch_encoding['attention_mask']
    #
    # print(input_ids)
    # print(attn_mask)

    evaluation_path = "evaluation/"
    synonyms_path = "synonyms.json"
    data_path = "data/"

    engl_data_path = evaluation_path + "engl_data/"

    found_substitutes_path = "final_Versions/models/vers2/eval/complete_substitute_pairs_50.json"
    # model_name = "final_Versions/models/vers3/output/"

    # ingredient -> list of model-found substitutes
    with open(found_substitutes_path, "r") as whole_json_file:
        model_substitutes_dict = json.load(whole_json_file)

    # base ingredient -> list of its synonyms
    with open(data_path + synonyms_path, "r") as whole_json_file:
        synonyms_dict = json.load(whole_json_file)

    # Broad category entries whose synonym lists must NOT be collapsed into a
    # single base word (e.g. "Huhn" covers many distinct cuts).
    category_subs = ["Paprika", "Apfel", "Gouda", "Huhn", "Gans", "Kaninchen", "Kalbfleisch", "Schweinefleisch", "Ente", "Lamm",
                     "Pute", "Wildfleisch", "Rindfleisch", "Speck", "Fisch", "Kohl", "Blattsalat", "Schokolade", "Kuvertüre", "Kuchenglasur",
                     "Honig", "Sirup", "Joghurt", "Essig", "Traubensaft", "Geflügelfleisch", "Wein", "Suppenfleisch"]

    # Small hand-made fixtures used while developing the merge logic, kept
    # for reference.
    # synonyms_dict = {"Zartbitterschokolade": ["Schokolade_Zartbitter"],
    # "Hähnchenfilet": ["Filet_Hähnchen", "Hühnerfilet"],
    # "Huhn": ["Hähnchenfilet", "Filet_Hähnchen", "Hühnchenschenkel", "Hühnerbeine"],
    # "Kuvertüre_Zartbitter": ["Zartbitterkuvertüre"]}
    #
    # model_substitutes_dict = {"Zartbitterschokolade": ["Schokolade_Zartbitter", "Kuvertüre_Zartbitter", "Zartbitterkuvertüre", "Nutella"],
    # "Schokolade_Zartbitter": ["Kuvertüre_Zartbitter", "Weiße_Schokolade", "Zartbitterschokolade"],
    # "Huhn": ["Hähnchenfilet", "Schweinelende"],
    # "Dill": ["Petersilie"]}

    final_dict = {}

    new_syn_dict = {}
    # Invert the synonym table: map each synonym to its base word (category
    # entries are skipped so their members stay distinct).
    for ingred in synonyms_dict.keys():
        if ingred not in category_subs:
            for syn in synonyms_dict[ingred]:
                new_syn_dict[syn] = ingred

    # Seed result sets for every ingredient that is itself a base word.
    for ingred in model_substitutes_dict.keys():
        if ingred not in new_syn_dict.keys():
            final_dict[ingred] = set()

    # Merge each ingredient's substitutes into its base word's set, mapping
    # every substitute to its base word as well.
    for ingred in model_substitutes_dict.keys():
        curr_set = set()
        for sub in model_substitutes_dict[ingred]:
            if sub in new_syn_dict:
                curr_set.add(new_syn_dict[sub])
            else:
                curr_set.add(sub)
        if ingred not in new_syn_dict:
            final_dict[ingred] |= curr_set
        else:
            # NOTE(review): raises KeyError if the base word never appears in
            # model_substitutes_dict itself (it was only seeded for base
            # words present as keys) — confirm the data guarantees this.
            test = new_syn_dict[ingred]
            final_dict[test] |= curr_set
    # print(final_dict)
    # An ingredient must not list itself as its own substitute.
    for ingred in final_dict.keys():
        if ingred in final_dict[ingred]:
            final_dict[ingred].remove(ingred)

    # Convert sets back to lists for JSON compatibility.
    new_final_dict = {}
    for ingred in final_dict.keys():
        new_final_dict[ingred] = list(final_dict[ingred])

    # NOTE(review): this reload immediately overwrites the synonym-merged
    # dict built above with the raw file contents, discarding all the merge
    # work — looks like leftover debug code; confirm whether the stats are
    # meant to run on the merged or the raw data.
    with open(found_substitutes_path, "r") as whole_json_file:
        new_final_dict = json.load(whole_json_file)

    print_stats(new_final_dict, cap_at_30=True)
    print_stats(new_final_dict, cap_at_30=False)

    print("--------------------------------------------\nground truth only: ")

    with open("data/ground_truth.json", "r") as whole_json_file:
        ground_truth = json.load(whole_json_file)

    # Restrict the stats to ingredients that have a ground-truth entry.
    ground_truth_only = {}
    for ingred in new_final_dict.keys():
        if ingred in ground_truth.keys():
            ground_truth_only[ingred] = new_final_dict[ingred]

    print_stats(ground_truth_only, cap_at_30=True)
    print_stats(ground_truth_only, cap_at_30=False)

    print("================================\nenglisch:")
    with open(engl_data_path + "substitute_pairs_foodbert_text.json", "r") as whole_json_file:
        engl_list = json.load(whole_json_file)
    with open(engl_data_path + "engl_ground_truth.json", "r") as whole_json_file:
        engl_ground_truth = json.load(whole_json_file)

    # Group the English [ingredient, substitute] pairs into a dict.
    engl_dict = {}
    for foo in engl_list:
        if foo[0] in engl_dict.keys():
            engl_dict[foo[0]].append(foo[1])
        else:
            engl_dict[foo[0]] = [foo[1]]
    print_stats(engl_dict, cap_at_30=True)
    print_stats(engl_dict, cap_at_30=False)

    print("--------------------------------------------\nground truth only: ")

    # Same ground-truth restriction for the English data.
    ground_truth_only = {}
    for ingred in engl_dict.keys():
        if ingred in engl_ground_truth.keys():
            ground_truth_only[ingred] = engl_dict[ingred]

    print_stats(ground_truth_only, cap_at_30=True)
    print_stats(ground_truth_only, cap_at_30=False)
|
||||
|
||||
# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user