added to README files, added full dataset versions to data

This commit is contained in:
2021-04-15 20:19:09 +02:00
parent cf40ad15fb
commit 1ea0677029
9 changed files with 61 additions and 543 deletions

View File

@@ -0,0 +1,18 @@
Some parameters (model version, etc.) need to be adjusted in all scripts.
## Generate Substitute Recommendations
**generate_substitutes.py** is used to generate the substitute recommendations for each model using various scoring thresholds. Model version and scoring threshold need to be specified.
## Prepare Data for Evaluation
**find_ground_truth_ingredients.py** was used to find "rare" and "frequent" ingredients for the ground truth.
Ingredients for which no substitute recommendations are found need to be added to the substitute JSON file. This is done using **add_unused_ingredients.py**.
## Evaluation
An intermediate evaluation was done using **stats_engl_substitutes_compare.py** to gain insight into the various versions of the substitute recommendations. However, this script is not used for the final evaluation.
The ingredient substitute recommendations made using each FoodBERT version can be evaluated using **final_eval.py**.
The model version to be used has to be set in the first line of main().
Stats for the dataset and the ground truth can be found using **dataset_stats.py** and **ground_truth_stats.py**, respectively.

View File

@@ -1,523 +0,0 @@
import json
import statistics
# --- file locations used by the evaluation functions below ---
data_path = "data/"
# NOTE(review): "occurances" is a typo for "occurrences"; kept as-is since the
# name is used by every loader in this file.
occurances_path = "mult_ingredients_nice.json"
ground_truth_path = "ground_truth.json"
engl_data_path = "evaluation/engl_data/"
evaluation_path = "evaluation/"
synonyms_path = "synonyms.json"
# substitute recommendations produced by the selected FoodBERT model version
found_substitutes_path = "final_Versions/models/vers2/eval/complete_substitute_pairs_50.json"
# model_name = "Versions/vers3/"
# Manually curated German ground truth; used by engl_compare() for the
# English/German comparison (10 base ingredients).
german_ground_truth = {
    "Karotte": ["Pastinake", "Steckrübe", "Staudensellerie", "Kürbis", "Süßkartoffel", "Rettich", "Radieschen", "Kartoffel", "Paprika_rot", "Butternusskürbis", "Petersilienwurzel"],
    "Kirsche": ["Aprikose", "Pflaume", "Nektarine", "Himbeeren", "Weintrauben", "Erdbeere", "Johannisbeeren", "Brombeeren", "Beeren_gemischte", "Pfirsich", "Cranberries", "Cranberries_getrocknet", "Blaubeeren", "Maraschino", "Beeren", "Trockenpflaumen"],
    "Huhn": ["Truthahn", "Kaninchen", "Austernpilze", "Kalbfleisch", "Fisch", "Tofu", "Rindfleisch", "Tofu_fester", "Schweinefleisch", "Seitan", "Ente", "Lamm", "Pilze", "Shrimps", "Wachtel", "Gans", "Wildfleisch"],
    "Petersilie": ["Kerbel", "Koriander", "Estragon", "Basilikum", "Oregano", "Liebstöckel", "Dill", "Koriandergrün", "Rosmarin", "Kapern", "Thymian", "Schnittlauch", "Minze", "Basilikum_getrockneter", "Oregano_getrocknet", "Thymian_getrocknet"],
    "Schokolade": ["Nutella", "Kakaopulver_Instant", "Zucker", "Marmelade", "Marshmallow", "Kakao", "Süßigkeiten", "Erdnussbutter"],
    "Frühstücksspeck": ["Pancetta", "Schinken_Prosciutto", "Speck", "Schinken_rohen", "Parmaschinken", "Schinken", "Salami", "Chorizo", "Wurst_Krakauer", "Schweineschwarte", "Schinkenwürfel", "Croûtons", "Speckwürfel", "Kochschinken", "Corned_Beef", "Wurst_Mortadella"],
    "Grünkohl": ["Spinat", "Chinakohl", "Lauch", "Endiviensalat", "Mangold", "Wirsing", "Kohl", "Blumenkohl", "Brunnenkresse", "Rucola", "Blattspinat", "Kopfsalat", "Römersalat", "Babyspinat"],
    "Zucker": ["Honig", "Stevia", "Süßstoff", "Stevia_flüssig", "Süßstoff_flüssigen", "Reissirup", "Ahornsirup", "Kondensmilch_gezuckerte", "Agavendicksaft", "Schokolade", "Vanille", "Melasse", "Zuckerrübensirup", "Sirup"],
    "Brie": ["Camembert", "Gorgonzola", "Schmelzkäse", "Cheddarkäse", "Ziegenkäse", "Doppelrahmfrischkäse", "Blauschimmelkäse", "Roquefort", "Gouda", "Käse_Fontina", "Käse_Provolone", "Feta_Käse", "Scheiblettenkäse"],
    "Truthahn": ["Huhn", "Kaninchen", "Ente", "Kochschinken", "Fasan", "Gans", "Rindfleisch", "Lammfleisch", "Schweinefleisch", "Roastbeef", "Kalbfleisch", "Geflügelfleisch", "Hähnchenfilet", "Hühnerkeule", "Wachtel", "schweinekotelett", "Wildfleisch"]
}
def no_synonyms(ground_truth_dict=None, found_substitutes_dict=None, get_occurrences=True, synonyms=True):
if get_occurrences:
with open(data_path + occurances_path, "r") as whole_json_file:
occurrences_dict = json.load(whole_json_file)
if not ground_truth_dict:
with open(data_path+ground_truth_path, "r") as whole_json_file:
ground_truth_dict = json.load(whole_json_file)
if synonyms:
with open(data_path + synonyms_path, "r") as whole_json_file:
synonyms_dict = json.load(whole_json_file)
else:
synonyms_dict = {}
if not found_substitutes_dict:
with open(found_substitutes_path, "r") as whole_json_file:
model_substitutes_dict = json.load(whole_json_file)
else:
model_substitutes_dict = found_substitutes_dict
found_ground_ingr = {}
correctly_found = 0
incorrectly_found = 0
average_precision = 0.0
average_recall = 0.0
number_correct_subs_found_overall = []
total_number_subs_found_overall = []
# base ingredient without synonyms, substitutes with synonyms
for base_ingred in ground_truth_dict.keys():
if get_occurrences:
occurrences = occurrences_dict[base_ingred]
found_substitutes = model_substitutes_dict[base_ingred].copy()
# if len(found_substitutes) > 30:
# found_substitutes = found_substitutes[:30]
found = []
# remove synonyms of base ingredient
new_found_substitutes = []
for subst in found_substitutes:
if base_ingred in synonyms_dict.keys():
if subst not in synonyms_dict[base_ingred]:
new_found_substitutes.append(subst)
else:
new_found_substitutes.append(subst)
found_substitutes = new_found_substitutes
# check which substitutes were found
for subst in ground_truth_dict[base_ingred]:
# only add substitute if not already added
if subst in found_substitutes and subst not in found:
found.append(subst)
found_substitutes.remove(subst)
# check if synonyms of substitute were found
# check if ingredient has synonyms
if subst in synonyms_dict.keys():
for synon in synonyms_dict[subst]:
if synon in found_substitutes:
if synon not in found and subst not in found:
found.append(subst)
found_substitutes.remove(synon)
# if base_ingred == "Erdbeere":
print(base_ingred + ": " + str(found_substitutes))
found_ground_ingr[base_ingred] = found
# print(base_ingred + ": ")
# if get_occurrences:
# print("occurrences in dataset: " + str(occurrences))
# print("number of found substitutes: " + str(len(found)) + "/" + str(len(ground_truth_dict[base_ingred])))
# print("correctly found substitutes: " + str(len(found)) + "/" + str(len(found) + len(found_substitutes)))
# print("correctly found substitutes: " + str(found))
# print("incorrectly found substitutes: " + str(found_substitutes))
# print("-----------------------------\n")
if len(found) > 0:
average_precision += len(found)/(len(found) + len(found_substitutes))
# print(len(found))
average_recall += len(found)/len(ground_truth_dict[base_ingred])
correctly_found += len(found)
incorrectly_found += len(found_substitutes)
number_correct_subs_found_overall.append(len(found))
total_number_subs_found_overall.append(len(found) + len(found_substitutes))
print("average precision: " + str(average_precision/40))
print("average recall: " + str(average_recall/40))
print("median number of correctly found subs: " + str(statistics.median(number_correct_subs_found_overall)))
print("median number of found subs overall: " + str(statistics.median(total_number_subs_found_overall)))
return found_ground_ingr
def merge_lists(all_lists):
    """Round-robin merge several ranked lists into one deduplicated list.

    Takes the first element of every list, then the second of every list,
    and so on, skipping elements already added. This preserves the relative
    ranking so that the top recommendations of every list come first.

    Args:
        all_lists: a list of lists.

    Returns:
        A single merged list without duplicates (empty for empty input).
    """
    # removed the unused min_len bookkeeping of the original implementation
    max_len = max((len(lst) for lst in all_lists), default=0)
    output = []
    for index_counter in range(max_len):
        for curr_list in all_lists:
            if index_counter < len(curr_list) and curr_list[index_counter] not in output:
                output.append(curr_list[index_counter])
    return output
def with_synonyms(ground_truth_dict=None, found_substitutes_dict=None, get_occurrences=True, synonyms=True):
    """Evaluate substitute recommendations using synonyms on both sides.

    Synonyms are applied to the base ingredients (the recommendation lists of
    all base synonyms are merged round-robin via merge_lists) and to the
    substitutes (a found synonym of a ground truth substitute counts as a
    correct hit).

    Args:
        ground_truth_dict: mapping base ingredient -> list of valid
            substitutes. Loaded from ``data_path + ground_truth_path`` when None.
        found_substitutes_dict: mapping ingredient -> recommended substitutes.
            Loaded from ``found_substitutes_path`` when None.
        get_occurrences: also load and accumulate dataset occurrence counts.
        synonyms: whether to load and apply the synonym mapping.

    Returns:
        dict mapping each base ingredient to the list of correctly found
        ground truth substitutes.
    """
    if get_occurrences:
        with open(data_path + occurances_path, "r") as whole_json_file:
            occurrences_dict = json.load(whole_json_file)
    if not ground_truth_dict:
        with open(data_path + ground_truth_path, "r") as whole_json_file:
            ground_truth_dict = json.load(whole_json_file)
    if synonyms:
        with open(data_path + synonyms_path, "r") as whole_json_file:
            synonyms_dict = json.load(whole_json_file)
    else:
        synonyms_dict = {}
    if not found_substitutes_dict:
        with open(found_substitutes_path, "r") as whole_json_file:
            model_substitutes_dict = json.load(whole_json_file)
    else:
        model_substitutes_dict = found_substitutes_dict
    correctly_found = 0
    incorrectly_found = 0
    average_precision = 0.0
    average_recall = 0.0
    number_correct_subs_found_overall = []
    total_number_subs_found_overall = []
    found_ground_ingr = {}
    # base ingredient with synonyms, substitutes with synonyms
    for base_ingred in ground_truth_dict.keys():
        base_synonyms = [base_ingred]
        if get_occurrences:
            occurrences = 0
        if base_ingred in synonyms_dict.keys():
            # FIX: the original assigned to a local named `synonyms`, which
            # shadowed the boolean parameter of the same name
            base_synonyms = base_synonyms + synonyms_dict[base_ingred]
            all_substitutes = []
            # merge the recommendation lists of every base synonym
            for synon in base_synonyms:
                if get_occurrences:
                    occurrences += occurrences_dict[synon]
                all_substitutes.append(model_substitutes_dict[synon].copy())
            found_substitutes = merge_lists(all_substitutes)
        else:
            found_substitutes = model_substitutes_dict[base_ingred].copy()
            # only the single-list case is truncated to the top 30
            if len(found_substitutes) > 30:
                found_substitutes = found_substitutes[:30]
        found = []
        # remove all base synonyms from found substitutes
        found_substitutes = [subst for subst in found_substitutes if subst not in base_synonyms]
        # check which substitutes were found
        for subst in ground_truth_dict[base_ingred]:
            # only add substitute if not already added
            if subst in found_substitutes and subst not in found:
                found.append(subst)
                found_substitutes.remove(subst)
            # a found synonym of the substitute counts as a correct hit too
            if subst in synonyms_dict.keys():
                for synon in synonyms_dict[subst]:
                    if synon in found_substitutes:
                        if synon not in found and subst not in found:
                            found.append(subst)
                            found_substitutes.remove(synon)
        found_ground_ingr[base_ingred] = found
        if len(found) > 0:
            average_precision += len(found) / (len(found) + len(found_substitutes))
            average_recall += len(found) / len(ground_truth_dict[base_ingred])
        correctly_found += len(found)
        incorrectly_found += len(found_substitutes)
        number_correct_subs_found_overall.append(len(found))
        total_number_subs_found_overall.append(len(found) + len(found_substitutes))
    # BUG FIX: averages were divided by a hard-coded 40, which is wrong for
    # the smaller ground truths passed in by engl_compare(); normalize by the
    # actual number of evaluated ingredients instead.
    num_ingredients = max(len(ground_truth_dict), 1)
    print("average precision: " + str(average_precision / num_ingredients))
    print("average recall: " + str(average_recall / num_ingredients))
    # statistics.median raises StatisticsError on empty data, so guard it
    if number_correct_subs_found_overall:
        print("median number of correctly found subs: " + str(statistics.median(number_correct_subs_found_overall)))
        print("median number of found subs overall: " + str(statistics.median(total_number_subs_found_overall)))
    return found_ground_ingr
def translate_engl_ground_truth(ground_truth, ger_transl):
    """Translate an English ground truth dict into German.

    Base ingredients are mapped through ger_transl; substitutes that have no
    translation entry are silently dropped.

    Args:
        ground_truth: mapping English base ingredient -> list of substitutes.
        ger_transl: mapping English ingredient name -> German ingredient name.

    Returns:
        dict mapping German base ingredient -> list of translated substitutes.
    """
    translated = {}
    for base_ingr, substitutes in ground_truth.items():
        translated[ger_transl[base_ingr]] = [
            ger_transl[subst] for subst in substitutes if subst in ger_transl
        ]
    return translated
def with_base_synonyms(ground_truth_dict=None, found_substitutes_dict=None, get_occurrences=True, synonyms=True):
    """Evaluate recommendations using synonyms of the base ingredients only.

    Unlike with_synonyms(), synonyms of the *substitutes* are deliberately
    ignored: a recommendation only counts if it literally matches a ground
    truth substitute. Per-ingredient results are printed.

    Args:
        ground_truth_dict: mapping base ingredient -> list of valid
            substitutes. Loaded from ``data_path + ground_truth_path`` when None.
        found_substitutes_dict: mapping ingredient -> recommended substitutes.
            Loaded from ``found_substitutes_path`` when None.
        get_occurrences: also load and accumulate dataset occurrence counts.
        synonyms: whether to load and apply the synonym mapping.

    Returns:
        dict mapping each base ingredient to the list of correctly found
        ground truth substitutes.
    """
    if get_occurrences:
        with open(data_path + occurances_path, "r") as whole_json_file:
            occurrences_dict = json.load(whole_json_file)
    if not ground_truth_dict:
        with open(data_path + ground_truth_path, "r") as whole_json_file:
            ground_truth_dict = json.load(whole_json_file)
    if synonyms:
        with open(data_path + synonyms_path, "r") as whole_json_file:
            synonyms_dict = json.load(whole_json_file)
    else:
        synonyms_dict = {}
    if not found_substitutes_dict:
        with open(found_substitutes_path, "r") as whole_json_file:
            model_substitutes_dict = json.load(whole_json_file)
    else:
        model_substitutes_dict = found_substitutes_dict
    found_ground_ingr = {}
    # base ingredient with synonyms, substitutes WITHOUT synonyms
    for base_ingred in ground_truth_dict.keys():
        base_synonyms = [base_ingred]
        if get_occurrences:
            occurrences = 0
        if base_ingred in synonyms_dict.keys():
            # removed the parameter-shadowing local `synonyms` of the original
            base_synonyms = base_synonyms + synonyms_dict[base_ingred]
            all_substitutes = []
            # merge the recommendation lists of every base synonym
            for synon in base_synonyms:
                if get_occurrences:
                    occurrences += occurrences_dict[synon]
                all_substitutes.append(model_substitutes_dict[synon].copy())
            found_substitutes = merge_lists(all_substitutes)
        else:
            found_substitutes = model_substitutes_dict[base_ingred].copy()
            # only the single-list case is truncated to the top 30
            if len(found_substitutes) > 30:
                found_substitutes = found_substitutes[:30]
        found = []
        # remove all base synonyms from found substitutes
        found_substitutes = [subst for subst in found_substitutes if subst not in base_synonyms]
        # check which substitutes were found (literal matches only; the
        # substitute-synonym matching of the sibling functions was removed
        # here on purpose and is deleted instead of kept as dead code)
        for subst in ground_truth_dict[base_ingred]:
            # only add substitute if not already added
            if subst in found_substitutes and subst not in found:
                found.append(subst)
                found_substitutes.remove(subst)
        found_ground_ingr[base_ingred] = found
        print(base_ingred + ": ")
        if get_occurrences:
            print("occurrences in dataset: " + str(occurrences))
        print("number of synonyms incl. original word: " + str(len(base_synonyms)))
        print("number of found substitutes: " + str(len(found)) + "/" + str(len(ground_truth_dict[base_ingred])))
        print("correctly found substitutes: " + str(len(found)) + "/" + str(len(found) + len(found_substitutes)))
        print("correctly found substitutes: " + str(found))
        print("incorrectly found substitutes: " + str(found_substitutes))
        print("-----------------------------\n")
    return found_ground_ingr
def engl_compare():
    """Compare German FoodBERT substitutes against the English FoodBERT results.

    Runs the comparison four times — without synonyms, with substitute
    synonyms only, with synonyms on both sides, and with base-ingredient
    synonyms only — and dumps per-ingredient hit counts for both languages
    as JSON files into evaluation/engl_comparison_results/.
    """
    # with open(data_path + occurances_path, "r") as whole_json_file:
    #     occurrences_dict = json.load(whole_json_file)
    with open(engl_data_path + "translation.json", "r") as whole_json_file:
        ger_transl = json.load(whole_json_file)
    # with open(data_path + synonyms_path, "r") as whole_json_file:
    #     synonyms_dict = json.load(whole_json_file)
    with open(found_substitutes_path, "r") as whole_json_file:
        model_substitutes_dict = json.load(whole_json_file)
    with open(engl_data_path + "substitute_pairs_foodbert_text.json", "r") as whole_json_file:
        engl_list = json.load(whole_json_file)
    with open(engl_data_path + "engl_ground_truth.json", "r") as whole_json_file:
        engl_ground_truth = json.load(whole_json_file)
    # turn the English [base, substitute] pair list into a dict of lists
    engl_dict = {}
    for foo in engl_list:
        if foo[0] in engl_dict.keys():
            engl_dict[foo[0]].append(foo[1])
        else:
            engl_dict[foo[0]] = [foo[1]]
    translated_ground_truth = translate_engl_ground_truth(engl_ground_truth, ger_transl)
    # without any synonyms
    print("Engl compare without any synonyms:")
    # per-ingredient hit counters for both languages
    engl_replacements = {}
    # ger_replacements = {}
    for ingred in engl_ground_truth.keys():
        found = []
        incorr = []
        found_ger = []
        incorr_ger = []
        engl_replacements[ingred] = {}
        engl_replacements[ingred]["engl"] = 0
        engl_replacements[ingred]["ger"] = 0
        # ger_replacements[ingred] = 0
        if ingred in engl_dict.keys():
            # count English hits against the English ground truth
            for sub in engl_ground_truth[ingred]:
                if sub in engl_dict[ingred]:
                    engl_replacements[ingred]["engl"] += 1
                    found.append(sub)
            # count German hits against the manual German ground truth
            if ger_transl[ingred] in model_substitutes_dict.keys():
                for sub in german_ground_truth[ger_transl[ingred]]:
                    if sub in model_substitutes_dict[ger_transl[ingred]]:
                        engl_replacements[ingred]["ger"] += 1
                        found_ger.append(sub)
                        # ger_replacements[ingred] += 1
            for found_sub in engl_dict[ingred]:
                if found_sub not in engl_ground_truth[ingred]:
                    incorr.append(found_sub)
            # NOTE(review): unlike the counting above, this access is NOT
            # guarded by `ger_transl[ingred] in model_substitutes_dict` and
            # would raise KeyError for a missing German key — presumably the
            # data guarantees the key exists; verify against the input files.
            for found_sub in model_substitutes_dict[ger_transl[ingred]]:
                if found_sub not in translated_ground_truth[ger_transl[ingred]]:
                    incorr_ger.append(found_sub)
            print(ger_transl[ingred] + ": ")
            print("number of found substitutes: " + str(len(found_ger)) + "/" + str(len(translated_ground_truth[ger_transl[ingred]])))
            print("correctly found substitutes: " + str(len(found_ger)) + "/" + str(len(found_ger) + len(incorr_ger)))
            print("correctly found substitutes: " + str(found_ger))
            print("incorrectly found substitutes: " + str(incorr_ger))
            print("-----------------------------\n")
            print(ingred + ": ")
            print("number of found substitutes: " + str(len(found)) + "/" + str(len(engl_ground_truth[ingred])))
            print("correctly found substitutes: " + str(len(found)) + "/" + str(len(found) + len(incorr)))
            print("correctly found substitutes: " + str(found))
            print("incorrectly found substitutes: " + str(incorr))
            print("-----------------------------\n")
    with open(evaluation_path + "engl_comparison_results/engl_no_syn.json", 'w') as f:
        json.dump(engl_replacements, f, ensure_ascii=False, indent=4)
    # with synonyms of substitutes
    print("Engl compare with synonyms of substitutes only:")
    # german
    new_german_result = no_synonyms(ground_truth_dict=translated_ground_truth, get_occurrences=False)
    #engl
    new_engl_result = no_synonyms(ground_truth_dict=engl_ground_truth, found_substitutes_dict=engl_dict, get_occurrences=False, synonyms=False)
    engl_replacements = {}
    for ingred in engl_ground_truth.keys():
        engl_replacements[ingred] = {}
        engl_replacements[ingred]["engl"] = 0
        engl_replacements[ingred]["ger"] = 0
        if ingred in new_engl_result.keys():
            for sub in engl_ground_truth[ingred]:
                if sub in new_engl_result[ingred]:
                    engl_replacements[ingred]["engl"] += 1
        if ger_transl[ingred] in new_german_result.keys():
            for sub in german_ground_truth[ger_transl[ingred]]:
                if sub in new_german_result[ger_transl[ingred]]:
                    engl_replacements[ingred]["ger"] += 1
    with open(evaluation_path + "engl_comparison_results/engl_sub_syn.json", 'w') as f:
        json.dump(engl_replacements, f, ensure_ascii=False, indent=4)
    # with synonyms for substitutes and base words
    print("Engl compare with synonyms of both:")
    # german
    new_german_result = with_synonyms(ground_truth_dict=translated_ground_truth, get_occurrences=False)
    # engl
    new_engl_result = with_synonyms(ground_truth_dict=engl_ground_truth, found_substitutes_dict=engl_dict, get_occurrences=False, synonyms=False)
    engl_replacements = {}
    for ingred in engl_ground_truth.keys():
        engl_replacements[ingred] = {}
        engl_replacements[ingred]["engl"] = 0
        engl_replacements[ingred]["ger"] = 0
        if ingred in new_engl_result.keys():
            for sub in engl_ground_truth[ingred]:
                if sub in new_engl_result[ingred]:
                    engl_replacements[ingred]["engl"] += 1
        if ger_transl[ingred] in new_german_result.keys():
            for sub in german_ground_truth[ger_transl[ingred]]:
                if sub in new_german_result[ger_transl[ingred]]:
                    engl_replacements[ingred]["ger"] += 1
    with open(evaluation_path + "engl_comparison_results/engl_all_syn.json", 'w') as f:
        json.dump(engl_replacements, f, ensure_ascii=False, indent=4)
    # with synonyms for base words
    print("Engl compare with synonyms of base words only:")
    # german
    new_german_result = with_base_synonyms(ground_truth_dict=translated_ground_truth, get_occurrences=False)
    # engl
    new_engl_result = with_base_synonyms(ground_truth_dict=engl_ground_truth, found_substitutes_dict=engl_dict,
                                         get_occurrences=False, synonyms=False)
    engl_replacements = {}
    for ingred in engl_ground_truth.keys():
        engl_replacements[ingred] = {}
        engl_replacements[ingred]["engl"] = 0
        engl_replacements[ingred]["ger"] = 0
        if ingred in new_engl_result.keys():
            for sub in engl_ground_truth[ingred]:
                if sub in new_engl_result[ingred]:
                    engl_replacements[ingred]["engl"] += 1
        if ger_transl[ingred] in new_german_result.keys():
            for sub in german_ground_truth[ger_transl[ingred]]:
                if sub in new_german_result[ger_transl[ingred]]:
                    engl_replacements[ingred]["ger"] += 1
    with open(evaluation_path + "engl_comparison_results/engl_base_syn.json", 'w') as f:
        json.dump(engl_replacements, f, ensure_ascii=False, indent=4)
    print("test")
def main():
    """Run the final evaluation for the configured model version.

    Evaluates once with synonyms applied only to substitutes and once with
    synonyms applied to both substitutes and base ingredients. The
    English/German comparison is kept available but disabled by default.
    """
    # compare english and german results
    # engl_compare()
    print("--------------------------------------------------------")
    print("--------------------------------------------------------")
    print("--------------------------------------------------------\n")
    # get results, synonyms only used in substitutes
    no_synonyms()
    print("--------------------------------------------------------")
    print("--------------------------------------------------------")
    print("--------------------------------------------------------\n")
    # get results, synonyms used in substitutes and base ingredients
    with_synonyms()


# FIX: guard the entry point so that importing this module no longer
# triggers the full evaluation as an import side effect.
if __name__ == "__main__":
    main()