"""Evaluate model-proposed ingredient substitutes against ground-truth substitute lists.

Three evaluation modes differ in how ingredient synonyms are credited:

- ``no_synonyms``:        synonyms credited only on the substitute side.
- ``with_synonyms``:      synonyms credited for base ingredients AND substitutes.
- ``with_base_synonyms``: synonyms credited only on the base-ingredient side.

``engl_compare`` additionally compares the German model results against an
English (FoodBERT) baseline and writes JSON summaries of the comparison.
"""
import json
import statistics

# Input/output locations, relative to the working directory.
data_path = "data/"
occurances_path = "mult_ingredients_nice.json"  # (sic) original name kept for compatibility
ground_truth_path = "ground_truth.json"
engl_data_path = "evaluation/engl_data/"
evaluation_path = "evaluation/"
synonyms_path = "synonyms.json"
found_substitutes_path = "final_Versions/models/vers2/eval/complete_substitute_pairs_50.json"

# Hand-curated German ground truth used by engl_compare() for the German side.
german_ground_truth = {
    "Karotte": ["Pastinake", "Steckrübe", "Staudensellerie", "Kürbis", "Süßkartoffel", "Rettich",
                "Radieschen", "Kartoffel", "Paprika_rot", "Butternusskürbis", "Petersilienwurzel"],
    "Kirsche": ["Aprikose", "Pflaume", "Nektarine", "Himbeeren", "Weintrauben", "Erdbeere",
                "Johannisbeeren", "Brombeeren", "Beeren_gemischte", "Pfirsich", "Cranberries",
                "Cranberries_getrocknet", "Blaubeeren", "Maraschino", "Beeren", "Trockenpflaumen"],
    "Huhn": ["Truthahn", "Kaninchen", "Austernpilze", "Kalbfleisch", "Fisch", "Tofu", "Rindfleisch",
             "Tofu_fester", "Schweinefleisch", "Seitan", "Ente", "Lamm", "Pilze", "Shrimps",
             "Wachtel", "Gans", "Wildfleisch"],
    "Petersilie": ["Kerbel", "Koriander", "Estragon", "Basilikum", "Oregano", "Liebstöckel", "Dill",
                   "Koriandergrün", "Rosmarin", "Kapern", "Thymian", "Schnittlauch", "Minze",
                   "Basilikum_getrockneter", "Oregano_getrocknet", "Thymian_getrocknet"],
    "Schokolade": ["Nutella", "Kakaopulver_Instant", "Zucker", "Marmelade", "Marshmallow", "Kakao",
                   "Süßigkeiten", "Erdnussbutter"],
    "Frühstücksspeck": ["Pancetta", "Schinken_Prosciutto", "Speck", "Schinken_rohen",
                        "Parmaschinken", "Schinken", "Salami", "Chorizo", "Wurst_Krakauer",
                        "Schweineschwarte", "Schinkenwürfel", "Croûtons", "Speckwürfel",
                        "Kochschinken", "Corned_Beef", "Wurst_Mortadella"],
    "Grünkohl": ["Spinat", "Chinakohl", "Lauch", "Endiviensalat", "Mangold", "Wirsing", "Kohl",
                 "Blumenkohl", "Brunnenkresse", "Rucola", "Blattspinat", "Kopfsalat", "Römersalat",
                 "Babyspinat"],
    "Zucker": ["Honig", "Stevia", "Süßstoff", "Stevia_flüssig", "Süßstoff_flüssigen", "Reissirup",
               "Ahornsirup", "Kondensmilch_gezuckerte", "Agavendicksaft", "Schokolade", "Vanille",
               "Melasse", "Zuckerrübensirup", "Sirup"],
    "Brie": ["Camembert", "Gorgonzola", "Schmelzkäse", "Cheddarkäse", "Ziegenkäse",
             "Doppelrahmfrischkäse", "Blauschimmelkäse", "Roquefort", "Gouda", "Käse_Fontina",
             "Käse_Provolone", "Feta_Käse", "Scheiblettenkäse"],
    "Truthahn": ["Huhn", "Kaninchen", "Ente", "Kochschinken", "Fasan", "Gans", "Rindfleisch",
                 "Lammfleisch", "Schweinefleisch", "Roastbeef", "Kalbfleisch", "Geflügelfleisch",
                 "Hähnchenfilet", "Hühnerkeule", "Wachtel", "schweinekotelett", "Wildfleisch"]
}


def _load_eval_inputs(ground_truth_dict, found_substitutes_dict, get_occurrences, synonyms):
    """Load whichever evaluation inputs were not passed in directly.

    Returns a 4-tuple ``(occurrences_dict, ground_truth_dict, synonyms_dict,
    model_substitutes_dict)``.  ``occurrences_dict`` is None when
    ``get_occurrences`` is False; ``synonyms_dict`` is {} when ``synonyms`` is
    False.  Files are read as UTF-8 because they contain German umlauts.
    """
    occurrences_dict = None
    if get_occurrences:
        with open(data_path + occurances_path, "r", encoding="utf-8") as whole_json_file:
            occurrences_dict = json.load(whole_json_file)
    if not ground_truth_dict:
        with open(data_path + ground_truth_path, "r", encoding="utf-8") as whole_json_file:
            ground_truth_dict = json.load(whole_json_file)
    if synonyms:
        with open(data_path + synonyms_path, "r", encoding="utf-8") as whole_json_file:
            synonyms_dict = json.load(whole_json_file)
    else:
        synonyms_dict = {}
    if not found_substitutes_dict:
        with open(found_substitutes_path, "r", encoding="utf-8") as whole_json_file:
            model_substitutes_dict = json.load(whole_json_file)
    else:
        model_substitutes_dict = found_substitutes_dict
    return occurrences_dict, ground_truth_dict, synonyms_dict, model_substitutes_dict


def _match_ground_truth(true_substitutes, found_substitutes, synonyms_dict,
                        check_sub_synonyms=True):
    """Collect which ground-truth substitutes appear in the model predictions.

    A ground-truth substitute also counts as found when one of its synonyms
    appears (if ``check_sub_synonyms``).  Matched entries (or the matching
    synonym) are removed from ``found_substitutes`` IN PLACE, so whatever
    remains afterwards are the incorrect predictions.  Each ground-truth
    substitute is credited at most once.  Returns the list of correctly found
    ground-truth substitutes.
    """
    found = []
    for subst in true_substitutes:
        # Direct match.
        if subst in found_substitutes and subst not in found:
            found.append(subst)
            found_substitutes.remove(subst)
        # Synonym match: credit the canonical substitute, not the synonym.
        if check_sub_synonyms and subst in synonyms_dict:
            for synon in synonyms_dict[subst]:
                if synon in found_substitutes and synon not in found and subst not in found:
                    found.append(subst)
                    found_substitutes.remove(synon)
    return found


def _print_metrics(average_precision, average_recall, num_ingredients,
                   correct_counts, total_counts):
    """Print macro-averaged precision/recall and median prediction counts.

    NOTE(review): the original divided by a hard-coded 40 (presumably the size
    of the default ground truth); dividing by the actual number of base
    ingredients is correct for ground truths of any size, e.g. the translated
    ones used by engl_compare().
    """
    print("average precision: " + str(average_precision / num_ingredients))
    print("average recall: " + str(average_recall / num_ingredients))
    print("median number of correctly found subs: " + str(statistics.median(correct_counts)))
    print("median number of found subs overall: " + str(statistics.median(total_counts)))


def no_synonyms(ground_truth_dict=None, found_substitutes_dict=None,
                get_occurrences=True, synonyms=True):
    """Evaluate predictions crediting synonyms only on the substitute side.

    Predictions that are synonyms of the base ingredient itself are discarded
    before matching.  Prints aggregate metrics and returns a dict mapping each
    base ingredient to the list of correctly found substitutes.

    :param ground_truth_dict: base ingredient -> true substitutes; loaded from
        ``ground_truth.json`` when falsy.
    :param found_substitutes_dict: base ingredient -> predicted substitutes;
        loaded from ``found_substitutes_path`` when falsy.
    :param get_occurrences: whether to load the occurrence-count file.
    :param synonyms: whether to load the synonym dictionary.
    """
    _, ground_truth_dict, synonyms_dict, model_substitutes_dict = _load_eval_inputs(
        ground_truth_dict, found_substitutes_dict, get_occurrences, synonyms)

    found_ground_ingr = {}
    average_precision = 0.0
    average_recall = 0.0
    number_correct_subs_found_overall = []
    total_number_subs_found_overall = []

    for base_ingred in ground_truth_dict:
        # Drop predictions that are merely synonyms of the base ingredient.
        base_syns = synonyms_dict.get(base_ingred, [])
        found_substitutes = [s for s in model_substitutes_dict[base_ingred] if s not in base_syns]

        found = _match_ground_truth(ground_truth_dict[base_ingred],
                                    found_substitutes, synonyms_dict)
        found_ground_ingr[base_ingred] = found

        # Only ingredients with at least one hit contribute to the aggregates
        # (also avoids 0/0 when nothing was predicted at all).
        if len(found) > 0:
            average_precision += len(found) / (len(found) + len(found_substitutes))
            average_recall += len(found) / len(ground_truth_dict[base_ingred])
            number_correct_subs_found_overall.append(len(found))
            total_number_subs_found_overall.append(len(found) + len(found_substitutes))

    _print_metrics(average_precision, average_recall, len(ground_truth_dict),
                   number_correct_subs_found_overall, total_number_subs_found_overall)
    return found_ground_ingr


def merge_lists(all_lists):
    """Interleave lists round-robin by index, keeping the first occurrence only.

    E.g. ``[[1, 2], [3]]`` -> ``[1, 3, 2]``.  Used to merge the ranked
    substitute lists of a base ingredient and its synonyms so that top-ranked
    items from every list come first.
    """
    max_len = max((len(curr_list) for curr_list in all_lists), default=0)
    output = []
    for index_counter in range(max_len):
        for curr_list in all_lists:
            if index_counter < len(curr_list) and curr_list[index_counter] not in output:
                output.append(curr_list[index_counter])
    return output


def with_synonyms(ground_truth_dict=None, found_substitutes_dict=None,
                  get_occurrences=True, synonyms=True):
    """Evaluate predictions crediting synonyms for base ingredients AND substitutes.

    For a base ingredient with synonyms, the predictions of all its synonyms
    are merged (round-robin) before matching; without synonyms, the prediction
    list is truncated to the top 30.  Prints aggregate metrics and returns a
    dict mapping each base ingredient to its correctly found substitutes.
    """
    _, ground_truth_dict, synonyms_dict, model_substitutes_dict = _load_eval_inputs(
        ground_truth_dict, found_substitutes_dict, get_occurrences, synonyms)

    found_ground_ingr = {}
    average_precision = 0.0
    average_recall = 0.0
    number_correct_subs_found_overall = []
    total_number_subs_found_overall = []

    for base_ingred in ground_truth_dict:
        base_synonyms = [base_ingred] + synonyms_dict.get(base_ingred, [])
        if base_ingred in synonyms_dict:
            # Merge the ranked predictions of the base word and all synonyms.
            all_substitutes = [model_substitutes_dict[synon].copy() for synon in base_synonyms]
            found_substitutes = merge_lists(all_substitutes)
        else:
            found_substitutes = model_substitutes_dict[base_ingred].copy()
            # NOTE(review): truncation applied only in the no-synonym branch,
            # matching the original control flow.
            if len(found_substitutes) > 30:
                found_substitutes = found_substitutes[:30]

        # Remove the base word and all its synonyms from the predictions.
        found_substitutes = [s for s in found_substitutes if s not in base_synonyms]

        found = _match_ground_truth(ground_truth_dict[base_ingred],
                                    found_substitutes, synonyms_dict)
        found_ground_ingr[base_ingred] = found

        if len(found) > 0:
            average_precision += len(found) / (len(found) + len(found_substitutes))
            average_recall += len(found) / len(ground_truth_dict[base_ingred])
            number_correct_subs_found_overall.append(len(found))
            total_number_subs_found_overall.append(len(found) + len(found_substitutes))

    _print_metrics(average_precision, average_recall, len(ground_truth_dict),
                   number_correct_subs_found_overall, total_number_subs_found_overall)
    return found_ground_ingr


def translate_engl_ground_truth(ground_truth, ger_transl):
    """Translate an English ground-truth dict to German via ``ger_transl``.

    Substitutes without a translation entry are dropped; base ingredients must
    all be translatable.
    """
    new_ground_truth = {}
    for base_ingr in ground_truth:
        new_ground_truth[ger_transl[base_ingr]] = [
            ger_transl[subst] for subst in ground_truth[base_ingr] if subst in ger_transl
        ]
    return new_ground_truth


def with_base_synonyms(ground_truth_dict=None, found_substitutes_dict=None,
                       get_occurrences=True, synonyms=True):
    """Evaluate crediting synonyms only on the base-ingredient side.

    Like :func:`with_synonyms` but substitutes must match the ground truth
    literally (no substitute-synonym credit).  Prints per-ingredient details
    instead of aggregate metrics and returns base ingredient -> found subs.
    """
    occurrences_dict, ground_truth_dict, synonyms_dict, model_substitutes_dict = \
        _load_eval_inputs(ground_truth_dict, found_substitutes_dict, get_occurrences, synonyms)

    found_ground_ingr = {}
    for base_ingred in ground_truth_dict:
        base_synonyms = [base_ingred] + synonyms_dict.get(base_ingred, [])
        occurrences = 0
        if base_ingred in synonyms_dict:
            all_substitutes = []
            for synon in base_synonyms:
                if get_occurrences:
                    occurrences += occurrences_dict[synon]
                all_substitutes.append(model_substitutes_dict[synon].copy())
            found_substitutes = merge_lists(all_substitutes)
        else:
            found_substitutes = model_substitutes_dict[base_ingred].copy()
            if len(found_substitutes) > 30:
                found_substitutes = found_substitutes[:30]

        found_substitutes = [s for s in found_substitutes if s not in base_synonyms]

        found = _match_ground_truth(ground_truth_dict[base_ingred], found_substitutes,
                                    synonyms_dict, check_sub_synonyms=False)
        found_ground_ingr[base_ingred] = found

        print(base_ingred + ": ")
        if get_occurrences:
            print("occurrences in dataset: " + str(occurrences))
        print("number of synonyms incl. original word: " + str(len(base_synonyms)))
        print("number of found substitutes: " + str(len(found)) + "/"
              + str(len(ground_truth_dict[base_ingred])))
        print("correctly found substitutes: " + str(len(found)) + "/"
              + str(len(found) + len(found_substitutes)))
        print("correctly found substitutes: " + str(found))
        print("incorrectly found substitutes: " + str(found_substitutes))
        print("-----------------------------\n")
    return found_ground_ingr


def _count_replacements(engl_ground_truth, ger_transl, engl_result, german_result):
    """Per English ingredient, count ground-truth hits of each approach.

    Returns ``{ingredient: {"engl": n, "ger": m}}`` where ``n``/``m`` are the
    numbers of ground-truth substitutes present in the English/German results.
    """
    counts = {}
    for ingred in engl_ground_truth:
        counts[ingred] = {"engl": 0, "ger": 0}
        if ingred in engl_result:
            for sub in engl_ground_truth[ingred]:
                if sub in engl_result[ingred]:
                    counts[ingred]["engl"] += 1
            if ger_transl[ingred] in german_result:
                for sub in german_ground_truth[ger_transl[ingred]]:
                    if sub in german_result[ger_transl[ingred]]:
                        counts[ingred]["ger"] += 1
    return counts


def _dump_results(file_name, results):
    """Write a result dict as pretty-printed UTF-8 JSON to the comparison folder."""
    out_path = evaluation_path + "engl_comparison_results/" + file_name
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=4)


def engl_compare():
    """Compare German model substitutes against the English FoodBERT baseline.

    Runs the comparison in four modes (no synonyms / substitute synonyms /
    both / base synonyms), prints details, and writes one JSON summary per
    mode into ``evaluation/engl_comparison_results/``.
    """
    with open(engl_data_path + "translation.json", "r", encoding="utf-8") as whole_json_file:
        ger_transl = json.load(whole_json_file)
    with open(found_substitutes_path, "r", encoding="utf-8") as whole_json_file:
        model_substitutes_dict = json.load(whole_json_file)
    with open(engl_data_path + "substitute_pairs_foodbert_text.json", "r",
              encoding="utf-8") as whole_json_file:
        engl_list = json.load(whole_json_file)
    with open(engl_data_path + "engl_ground_truth.json", "r", encoding="utf-8") as whole_json_file:
        engl_ground_truth = json.load(whole_json_file)

    # Group the (base, substitute) pairs into base -> [substitutes].
    engl_dict = {}
    for pair in engl_list:
        engl_dict.setdefault(pair[0], []).append(pair[1])

    translated_ground_truth = translate_engl_ground_truth(engl_ground_truth, ger_transl)

    # --- without any synonyms -------------------------------------------------
    print("Engl compare without any synonyms:")
    engl_replacements = {}
    for ingred in engl_ground_truth:
        found = []
        incorr = []
        found_ger = []
        incorr_ger = []
        engl_replacements[ingred] = {"engl": 0, "ger": 0}
        if ingred in engl_dict:
            for sub in engl_ground_truth[ingred]:
                if sub in engl_dict[ingred]:
                    engl_replacements[ingred]["engl"] += 1
                    found.append(sub)
            if ger_transl[ingred] in model_substitutes_dict:
                for sub in german_ground_truth[ger_transl[ingred]]:
                    if sub in model_substitutes_dict[ger_transl[ingred]]:
                        engl_replacements[ingred]["ger"] += 1
                        found_ger.append(sub)
            for found_sub in engl_dict[ingred]:
                if found_sub not in engl_ground_truth[ingred]:
                    incorr.append(found_sub)
            for found_sub in model_substitutes_dict[ger_transl[ingred]]:
                if found_sub not in translated_ground_truth[ger_transl[ingred]]:
                    incorr_ger.append(found_sub)
            print(ger_transl[ingred] + ": ")
            print("number of found substitutes: " + str(len(found_ger)) + "/"
                  + str(len(translated_ground_truth[ger_transl[ingred]])))
            print("correctly found substitutes: " + str(len(found_ger)) + "/"
                  + str(len(found_ger) + len(incorr_ger)))
            print("correctly found substitutes: " + str(found_ger))
            print("incorrectly found substitutes: " + str(incorr_ger))
            print("-----------------------------\n")
            print(ingred + ": ")
            print("number of found substitutes: " + str(len(found)) + "/"
                  + str(len(engl_ground_truth[ingred])))
            print("correctly found substitutes: " + str(len(found)) + "/"
                  + str(len(found) + len(incorr)))
            print("correctly found substitutes: " + str(found))
            print("incorrectly found substitutes: " + str(incorr))
            print("-----------------------------\n")
    _dump_results("engl_no_syn.json", engl_replacements)

    # --- with synonyms of substitutes only ------------------------------------
    print("Engl compare with synonyms of substitutes only:")
    new_german_result = no_synonyms(ground_truth_dict=translated_ground_truth,
                                    get_occurrences=False)
    new_engl_result = no_synonyms(ground_truth_dict=engl_ground_truth,
                                  found_substitutes_dict=engl_dict,
                                  get_occurrences=False, synonyms=False)
    _dump_results("engl_sub_syn.json",
                  _count_replacements(engl_ground_truth, ger_transl,
                                      new_engl_result, new_german_result))

    # --- with synonyms for substitutes and base words -------------------------
    print("Engl compare with synonyms of both:")
    new_german_result = with_synonyms(ground_truth_dict=translated_ground_truth,
                                      get_occurrences=False)
    new_engl_result = with_synonyms(ground_truth_dict=engl_ground_truth,
                                    found_substitutes_dict=engl_dict,
                                    get_occurrences=False, synonyms=False)
    _dump_results("engl_all_syn.json",
                  _count_replacements(engl_ground_truth, ger_transl,
                                      new_engl_result, new_german_result))

    # --- with synonyms for base words only ------------------------------------
    print("Engl compare with synonyms of base words only:")
    new_german_result = with_base_synonyms(ground_truth_dict=translated_ground_truth,
                                           get_occurrences=False)
    new_engl_result = with_base_synonyms(ground_truth_dict=engl_ground_truth,
                                         found_substitutes_dict=engl_dict,
                                         get_occurrences=False, synonyms=False)
    _dump_results("engl_base_syn.json",
                  _count_replacements(engl_ground_truth, ger_transl,
                                      new_engl_result, new_german_result))


def main():
    """Run the two standard evaluations (engl_compare() is opt-in)."""
    # engl_compare()  # compare English and German results
    print("--------------------------------------------------------")
    print("--------------------------------------------------------")
    print("--------------------------------------------------------\n")
    # Results with synonyms credited only on the substitute side.
    no_synonyms()
    print("--------------------------------------------------------")
    print("--------------------------------------------------------")
    print("--------------------------------------------------------\n")
    # Results with synonyms credited for base ingredients and substitutes.
    with_synonyms()


if __name__ == "__main__":
    main()