initial commit of project

2021-04-11 19:51:12 +02:00
commit a21a8186d9
110 changed files with 16326178 additions and 0 deletions
--- a/evaluation/dataset_stats.py
+++ b/evaluation/dataset_stats.py
@@ -0,0 +1,110 @@
+import json
+import statistics
+
+
+def dataset(full_dataset_path):
+    """Print summary statistics for a recipe dataset stored as JSON.
+
+    The file must contain a JSON object whose values (one per recipe; the
+    keys are treated as recipe URLs) provide the entries 'ingredients',
+    'image', 'comments' and 'instructions'.  A running recipe counter is
+    printed while the data is scanned, followed by the recipe count and the
+    totals, averages and medians of ingredients, comments and instructions
+    per recipe.
+
+    Args:
+        full_dataset_path: Path of the JSON dataset file.
+
+    Raises:
+        KeyError: If a recipe lacks one of the entries listed above.
+    """
+    # Image URL that is treated as "this recipe has no picture".
+    no_picture_url = "https://img.chefkoch-cdn.de/img/default/layout/recipe-nopicture.jpg"
+
+    # JSON text is UTF-8 by specification; do not depend on the locale.
+    with open(full_dataset_path, "r", encoding="utf-8") as whole_json_file:
+        full_dataset = json.load(whole_json_file)
+
+    ingredient_lengths = []
+    comment_lengths = []
+    instruction_lengths = []
+    pic_counter = 0
+    for counter, recipe in enumerate(full_dataset.values(), start=1):
+        ingredient_lengths.append(len(recipe['ingredients']))
+        if recipe['image'] != no_picture_url:
+            pic_counter += 1
+        # An empty or null 'comments' entry counts as zero comments.
+        comments = recipe['comments']
+        comment_lengths.append(len(comments) if comments else 0)
+        instruction_lengths.append(len(recipe['instructions']))
+        print(counter)  # progress indicator
+
+    recipe_count = len(full_dataset)
+    print("number of recipes: " + str(recipe_count))
+    if not recipe_count:
+        # Averages and medians are undefined without any recipe.
+        return
+
+    ingredient_counter = sum(ingredient_lengths)
+    comment_counter = sum(comment_lengths)
+    instruction_counter = sum(instruction_lengths)
+    no_comments = comment_lengths.count(0)
+
+    print("\n")
+    print("average ingredient count: " + str(ingredient_counter / recipe_count))
+    print("median ingredient count: " + str(statistics.median(ingredient_lengths)))
+    print("\n")
+    print("number of recipes with picture: " + str(pic_counter))
+    print("\n")
+    print("number of comments: " + str(comment_counter))
+    print("number of recipes withOUT comments: " + str(no_comments))
+    print("average amount of comments: " + str(comment_counter / recipe_count))
+    print("median comment count: " + str(statistics.median(comment_lengths)))
+    print("\n")
+    print("total instruction count: " + str(instruction_counter))
+    print("average instruction count: " + str(instruction_counter / recipe_count))
+    print("median instruction count: " + str(statistics.median(instruction_lengths)))
+
+
+def ingredients_before(full_dataset_path):
+    """Print the number of distinct ingredient entries in a JSON dataset.
+
+    The file must contain a JSON object whose values each provide an
+    'ingredients' sequence.  A running recipe counter is printed while the
+    data is scanned; the final line is the count of distinct entries.
+
+    Args:
+        full_dataset_path: Path of the JSON dataset file.
+    """
+    # JSON text is UTF-8 by specification; do not depend on the locale.
+    with open(full_dataset_path, "r", encoding="utf-8") as whole_json_file:
+        full_dataset = json.load(whole_json_file)
+
+    # Hashable entries (strings, numbers) are de-duplicated with a set, which
+    # avoids a linear scan per entry.  Unhashable entries (JSON arrays or
+    # objects) fall back to an equality scan so the count stays the same as
+    # with a plain list-based de-duplication.
+    unique_hashable = set()
+    unique_unhashable = []
+    for counter, recipe in enumerate(full_dataset.values(), start=1):
+        print(counter)  # progress indicator
+        for ingred in recipe['ingredients']:
+            try:
+                unique_hashable.add(ingred)
+            except TypeError:
+                if ingred not in unique_unhashable:
+                    unique_unhashable.append(ingred)
+    print(str(len(unique_hashable) + len(unique_unhashable)))
+
+def _print_average_and_median(counts):
+    """Print the mean and median of *counts*, or 'n/a' when it is empty."""
+    if not counts:
+        print("average: n/a")
+        print("median: n/a")
+        return
+    print("average: " + str(sum(counts) / len(counts)))
+    print("median: " + str(statistics.median(counts)))
+
+
+def ingredient_stats(ingredients_list_path="data/mult_ingredients_nice.json",
+                     ingredients_instructions_path="data/cleaned_steps_occurrance.json",
+                     rare_threshold=5):
+    """Print statistics over two JSON files mapping ingredients to numbers.
+
+    Both files must contain a JSON object whose values are numbers.  For
+    each file the average and median of the values are printed.  For the
+    second file the number of values below ``rare_threshold`` and the whole
+    mapping sorted by ascending value are printed as well.
+
+    Args:
+        ingredients_list_path: JSON file reported under
+            "in ingredient lists".
+        ingredients_instructions_path: JSON file reported under
+            "in instructions".
+        rare_threshold: Values strictly below this number are counted for
+            the "nones" line.
+    """
+    # JSON text is UTF-8 by specification; do not depend on the locale.
+    with open(ingredients_list_path, "r", encoding="utf-8") as whole_json_file:
+        ingred_lists = json.load(whole_json_file)
+
+    with open(ingredients_instructions_path, "r", encoding="utf-8") as whole_json_file:
+        ingred_instruct = json.load(whole_json_file)
+
+    print("in ingredient lists: ")
+    _print_average_and_median(list(ingred_lists.values()))
+
+    print("in instructions: ")
+    instruct_counts = list(ingred_instruct.values())
+    _print_average_and_median(instruct_counts)
+    # NOTE(review): the label says "nones" but the check is "< rare_threshold"
+    # (5 by default), not "== 0" -- confirm which one is intended.
+    none_counts = sum(1 for count in instruct_counts if count < rare_threshold)
+    print("nones: " + str(none_counts))
+
+    # Mapping ordered from smallest to largest value for manual inspection.
+    sorted_instruct = dict(sorted(ingred_instruct.items(), key=lambda item: item[1]))
+    print(sorted_instruct)
+
+
+def main():
+    """Run the currently selected statistics report.
+
+    Only ``ingredient_stats`` is active.  The two commented-out calls are
+    alternative reports that can be switched on by hand; the path variables
+    are kept for them.
+    """
+    before_dataset_path = "data/dataset_fin.json"
+    full_dataset_path = "Versions/vers3/full_dataset.json"
+    # dataset(full_dataset_path)
+    # ingredients_before(before_dataset_path)
+    ingredient_stats()
+
+
+# Guard the entry point so importing this module does not run the report.
+if __name__ == "__main__":
+    main()