import json

# Presumably the total size of the English ingredient vocabulary used in the
# foodbert evaluation: the English averages and "nones" are computed against
# this fixed denominator, not against the ingredients actually seen.
# TODO(review): confirm this count against the English data set.
ENGL_VOCAB_SIZE = 4372


def group_substitute_pairs(pairs):
    """Group (ingredient, substitute) pairs into ingredient -> [substitutes].

    pairs is an iterable of 2-element sequences; insertion order of the
    substitutes is preserved per ingredient.
    """
    grouped = {}
    for pair in pairs:
        grouped.setdefault(pair[0], []).append(pair[1])
    return grouped


def count_substitutes(substitutes_dict, cap_at_30):
    """Tally substitute-count statistics for one ingredient->substitutes dict.

    Returns a tuple (substitute_sum, over30, over100, over1000, nones):
    substitute_sum is the total number of substitutes, with each ingredient's
    contribution capped at 30 when cap_at_30 is true; overN counts ingredients
    with more than N substitutes; nones counts ingredients with none at all.
    """
    substitute_sum = over30 = over100 = over1000 = nones = 0
    for substitutes in substitutes_dict.values():
        curr_nr = len(substitutes)
        if curr_nr == 0:
            nones += 1
        if curr_nr > 100:
            over100 += 1
        if curr_nr > 1000:
            over1000 += 1
        if curr_nr > 30:
            over30 += 1
            substitute_sum += 30 if cap_at_30 else curr_nr
        else:
            substitute_sum += curr_nr
    return substitute_sum, over30, over100, over1000, nones


def print_stats(model_substitutes_dict, cap_at_30):
    """Print substitute statistics for the given model output, preceded by the
    reference statistics of the English foodbert substitute pairs.

    model_substitutes_dict maps ingredient -> list of substitute names.
    cap_at_30 caps each ingredient's contribution to the totals at 30.
    Raises ZeroDivisionError if model_substitutes_dict is empty (unchanged
    from the original behaviour).
    """
    print("\ncap at 30 set to " + str(cap_at_30))

    # English reference numbers, recomputed on every call so the comparison
    # is always printed right next to the model numbers.
    evaluation_path = "evaluation/"
    with open(evaluation_path + "engl_data/substitute_pairs_foodbert_text.json", "r") as whole_json_file:
        engl_dict = group_substitute_pairs(json.load(whole_json_file))
    substitute_sum, over30, _, _, _ = count_substitutes(engl_dict, cap_at_30)
    print("english ingredients with over 30 substitutes: " + str(over30))
    print("english nones: " + str(ENGL_VOCAB_SIZE - len(engl_dict)))
    print("average amount of substitutes found for english ingredients: "
          + str(substitute_sum / ENGL_VOCAB_SIZE))

    # Model numbers.
    substitute_sum, over30, over100, over1000, nones = count_substitutes(
        model_substitutes_dict, cap_at_30)
    print("number of ingredients in dataset: " + str(len(model_substitutes_dict)))
    print("number of nones: " + str(nones))
    print("ingredients with over 30 substitutes: " + str(over30))
    print("ingredients with over 100 substitutes: " + str(over100))
    print("ingredients with over 1000 substitutes: " + str(over1000))
    print("average number of substitutes: "
          + str(substitute_sum / len(model_substitutes_dict)))


def main():
    """Merge synonym variants in the model's substitute output, then print
    statistics for the full dict, its ground-truth subset, and the English
    reference data (each with and without the cap at 30)."""
    evaluation_path = "evaluation/"
    synonyms_path = "synonyms.json"
    data_path = "data/"
    engl_data_path = evaluation_path + "engl_data/"
    found_substitutes_path = "final_Versions/models/vers2/eval/complete_substitute_pairs_50.json"

    with open(found_substitutes_path, "r") as whole_json_file:
        model_substitutes_dict = json.load(whole_json_file)
    with open(data_path + synonyms_path, "r") as whole_json_file:
        synonyms_dict = json.load(whole_json_file)

    # Head words whose synonym lists are category members rather than
    # interchangeable spellings -- these must not be merged.
    category_subs = ["Paprika", "Apfel", "Gouda", "Huhn", "Gans", "Kaninchen",
                     "Kalbfleisch", "Schweinefleisch", "Ente", "Lamm", "Pute",
                     "Wildfleisch", "Rindfleisch", "Speck", "Fisch", "Kohl",
                     "Blattsalat", "Schokolade", "Kuvertüre", "Kuchenglasur",
                     "Honig", "Sirup", "Joghurt", "Essig", "Traubensaft",
                     "Geflügelfleisch", "Wein", "Suppenfleisch"]

    # Invert the synonym table: variant spelling -> base word.
    new_syn_dict = {}
    for ingred, variants in synonyms_dict.items():
        if ingred not in category_subs:
            for syn in variants:
                new_syn_dict[syn] = ingred

    # Merge the model output under base words, normalising the substitutes
    # through the same synonym table.
    final_dict = {}
    for ingred, substitutes in model_substitutes_dict.items():
        base = new_syn_dict.get(ingred, ingred)
        normalised = {new_syn_dict.get(sub, sub) for sub in substitutes}
        # BUG FIX: the original indexed final_dict[base] directly, which
        # raised KeyError when a synonym's base word never occurred as its
        # own key in the model output; setdefault creates the entry on
        # first use instead.
        final_dict.setdefault(base, set()).update(normalised)

    # An ingredient must not list itself as its own substitute.
    for ingred, substitutes in final_dict.items():
        substitutes.discard(ingred)

    new_final_dict = {ingred: list(substitutes)
                      for ingred, substitutes in final_dict.items()}

    # NOTE(review): the merged dict built above is immediately overwritten by
    # reloading the raw model output, so the synonym merge does not affect the
    # printed statistics. Preserved as-is from the original -- presumably a
    # temporary debugging switch; confirm which variant should be evaluated.
    with open(found_substitutes_path, "r") as whole_json_file:
        new_final_dict = json.load(whole_json_file)

    print_stats(new_final_dict, cap_at_30=True)
    print_stats(new_final_dict, cap_at_30=False)

    print("--------------------------------------------\nground truth only: ")
    with open("data/ground_truth.json", "r") as whole_json_file:
        ground_truth = json.load(whole_json_file)
    ground_truth_only = {ingred: subs for ingred, subs in new_final_dict.items()
                         if ingred in ground_truth}
    print_stats(ground_truth_only, cap_at_30=True)
    print_stats(ground_truth_only, cap_at_30=False)

    print("================================\nenglisch:")
    with open(engl_data_path + "substitute_pairs_foodbert_text.json", "r") as whole_json_file:
        engl_dict = group_substitute_pairs(json.load(whole_json_file))
    with open(engl_data_path + "engl_ground_truth.json", "r") as whole_json_file:
        engl_ground_truth = json.load(whole_json_file)
    print_stats(engl_dict, cap_at_30=True)
    print_stats(engl_dict, cap_at_30=False)

    print("--------------------------------------------\nground truth only: ")
    ground_truth_only = {ingred: subs for ingred, subs in engl_dict.items()
                         if ingred in engl_ground_truth}
    print_stats(ground_truth_only, cap_at_30=True)
    print_stats(ground_truth_only, cap_at_30=False)


if __name__ == "__main__":
    main()