# MasterarbeitCode/evaluation/ground_truth_stats.py

import json
import statistics


def _load_json(path):
    """Parse and return the contents of the JSON file at ``path``."""
    # Decode explicitly as UTF-8 (the JSON interchange encoding) so keys that
    # contain umlauts, e.g. "Spätzle", do not depend on the platform's default
    # locale encoding.
    with open(path, "r", encoding="utf-8") as json_file:
        return json.load(json_file)


def main(
    ground_truth_path="data/ground_truth.json",
    ingredients_path="data/mult_ingredients_nice.json",
    synonyms_path="data/synonyms.json",
    *,
    show_origin_totals=False,
):
    """Print per-ingredient occurrences and substitute-count statistics.

    For every key ("base") of the ground truth file, the occurrence value of
    the base and of each of its synonyms is added up and printed. Afterwards
    the mean, median, sample standard deviation, minimum and maximum of the
    number of ground truth entries per base are printed.

    Args:
        ground_truth_path: JSON file mapping each base to a sized collection
            (presumably its substitutes -- confirm against the data).
            The file was previously switched by hand to
            "evaluation/engl_data/engl_ground_truth.json".
        ingredients_path: JSON file mapping ingredient names to values that
            are combined with ``+`` (presumably integer occurrence counts).
            The file was previously switched by hand to
            "data/cleaned_steps_occurrance.json".
        synonyms_path: JSON file mapping a base to an iterable of synonym
            names; bases without an entry have no synonyms.
        show_origin_totals: When true, additionally print the summed entry
            counts of the hard-coded German bases and of all other bases.

    Raises:
        KeyError: If a base or one of its synonyms has no entry in the
            ingredients file.
        OSError: If one of the files cannot be opened.
    """
    ground_truth_dict = _load_json(ground_truth_path)
    ingredients_occurrences = _load_json(ingredients_path)
    synonyms_dict = _load_json(synonyms_path)

    german_words = {
        "Spätzle",
        "Schwarzbrot",
        "Schupfnudeln",
        "Bratwürste_Nürnberger",
        "Vanillinzucker",
        "Bier",
        "Semmelknödel",
        "Rote_Bete",
        "Eisbeine",
        "Spargel_weiß",
    }
    german_total = 0
    other_total = 0
    all_counts = []

    for base, entries in ground_truth_dict.items():
        entry_count = len(entries)
        all_counts.append(entry_count)

        curr_occurrences = ingredients_occurrences[base]
        for syn in synonyms_dict.get(base, ()):
            # Plain ``+`` instead of ``+=`` so a value taken from the loaded
            # dictionary is never modified in place.
            curr_occurrences = curr_occurrences + ingredients_occurrences[syn]
        print(base + " occurrences: " + str(curr_occurrences))

        if base in german_words:
            german_total += entry_count
        else:
            other_total += entry_count

    if not all_counts:
        # statistics.mean() would raise StatisticsError on empty data.
        print("No ground truth entries found.")
        return

    print("Average: " + str(statistics.mean(all_counts)))
    print("Median: " + str(statistics.median(all_counts)))
    if len(all_counts) > 1:
        print("Standard deviation: " + str(statistics.stdev(all_counts)))
    else:
        # The sample standard deviation is undefined for a single value and
        # statistics.stdev() would raise StatisticsError.
        print("Standard deviation: n/a")
    print("Min: " + str(min(all_counts)))
    print("Max: " + str(max(all_counts)))

    if show_origin_totals:
        print("german total: " + str(german_total))
        print("other total: " + str(other_total))


# Run the report only when executed as a script, not when the module is imported.
if __name__ == "__main__":
    main()