initial commit of project

This commit is contained in:
2021-04-11 19:51:12 +02:00
commit a21a8186d9
110 changed files with 16326178 additions and 0 deletions

110
evaluation/dataset_stats.py Normal file
View File

@@ -0,0 +1,110 @@
import json
import statistics
def dataset(full_dataset_path):
    """Print summary statistics for the recipe dataset at *full_dataset_path*.

    The file must contain a JSON object whose values are recipe records with
    the keys ``'ingredients'``, ``'image'``, ``'comments'`` and
    ``'instructions'``. For ingredients, comments and instructions the
    reported figure is ``len()`` of the stored value (NOTE(review): whether
    ``'instructions'`` is a list of steps or a single string is not visible
    here -- confirm against the data).

    While scanning, a running recipe counter is printed (one line per
    recipe), followed by totals, averages and medians. For an empty dataset
    only the recipe count and a short notice are printed.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
        KeyError: if a recipe record lacks one of the expected keys.
    """
    # Recipes carrying this image URL are counted as having no picture.
    no_picture_url = "https://img.chefkoch-cdn.de/img/default/layout/recipe-nopicture.jpg"

    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(full_dataset_path, "r", encoding="utf-8") as whole_json_file:
        full_dataset = json.load(whole_json_file)

    ingredient_lengths = []
    comment_lengths = []
    instruction_lengths = []
    pic_counter = 0
    no_comments = 0

    for counter, recipe in enumerate(full_dataset.values(), start=1):
        ingredient_lengths.append(len(recipe['ingredients']))
        if recipe['image'] != no_picture_url:
            pic_counter += 1
        comments = recipe['comments']
        if comments:
            comment_lengths.append(len(comments))
        else:
            # Covers both an empty container and a missing (null) value.
            comment_lengths.append(0)
            no_comments += 1
        instruction_lengths.append(len(recipe['instructions']))
        print(counter)  # progress indicator

    recipe_count = len(full_dataset)
    print("number of recipes: " + str(recipe_count))
    if not full_dataset:
        # Averages and medians are undefined without any recipe.
        print("dataset is empty; no statistics to report")
        return

    ingredient_counter = sum(ingredient_lengths)
    comment_counter = sum(comment_lengths)
    instruction_counter = sum(instruction_lengths)

    print("\n")
    print("average ingredient count: " + str(ingredient_counter / recipe_count))
    print("median ingredient count: " + str(statistics.median(ingredient_lengths)))
    print("\n")
    print("number of recipes with picture: " + str(pic_counter))
    print("\n")
    print("number of comments: " + str(comment_counter))
    print("number of recipes withOUT comments: " + str(no_comments))
    print("average amount of comments: " + str(comment_counter / recipe_count))
    print("median comment count: " + str(statistics.median(comment_lengths)))
    print("\n")
    print("total instruction count: " + str(instruction_counter))
    print("average instruction count: " + str(instruction_counter / recipe_count))
    print("median instruction count: " + str(statistics.median(instruction_lengths)))
def ingredients_before(full_dataset_path):
    """Print the number of distinct ingredient entries across all recipes.

    The file at *full_dataset_path* must contain a JSON object whose values
    are recipe records with an ``'ingredients'`` iterable. A running recipe
    counter is printed while scanning (one line per recipe); the final line
    is the count of distinct ingredient entries.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
        KeyError: if a recipe record has no ``'ingredients'`` key.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(full_dataset_path, "r", encoding="utf-8") as whole_json_file:
        full_dataset = json.load(whole_json_file)

    # Hashable entries (strings, numbers, null) are deduplicated in O(1) via
    # a set. Entries that are JSON arrays/objects are unhashable and fall back
    # to an equality scan, which keeps the count identical to a plain
    # "not in list" check while avoiding its quadratic cost in the common case.
    seen_hashable = set()
    seen_unhashable = []
    for counter, recipe in enumerate(full_dataset.values(), start=1):
        print(counter)  # progress indicator
        for ingred in recipe['ingredients']:
            try:
                seen_hashable.add(ingred)
            except TypeError:
                if ingred not in seen_unhashable:
                    seen_unhashable.append(ingred)
    print(str(len(seen_hashable) + len(seen_unhashable)))
def _print_average_and_median(counts):
    """Print the mean and median of *counts*, or ``n/a`` for an empty list."""
    if not counts:
        # Neither statistic is defined without data.
        print("average: n/a")
        print("median: n/a")
        return
    print("average: " + str(sum(counts) / len(counts)))
    print("median: " + str(statistics.median(counts)))


def ingredient_stats(
    ingredients_list_path="data/mult_ingredients_nice.json",
    ingredients_instructions_path="data/cleaned_steps_occurrance.json",
    rare_threshold=5,
):
    """Print occurrence statistics for ingredients from two JSON count files.

    Both files must contain a JSON object mapping a key to a numeric value
    (presumably ingredient name -> occurrence count; verify against the
    data files).

    Args:
        ingredients_list_path: JSON file reported under
            "in ingredient lists".
        ingredients_instructions_path: JSON file reported under
            "in instructions".
        rare_threshold: entries of the instructions file whose value is
            strictly below this number are counted and reported on the
            "nones" line.

    Prints, for each file, the average and median of the values. For the
    instructions file it additionally prints the below-threshold count and
    the whole mapping sorted by ascending value.

    Raises:
        OSError: if either file cannot be opened.
        json.JSONDecodeError: if either file is not valid JSON.
    """
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(ingredients_list_path, "r", encoding="utf-8") as whole_json_file:
        ingred_lists = json.load(whole_json_file)
    with open(ingredients_instructions_path, "r", encoding="utf-8") as whole_json_file:
        ingred_instruct = json.load(whole_json_file)

    print("in ingredient lists: ")
    _print_average_and_median(list(ingred_lists.values()))

    print("in instructions: ")
    instruct_counts = list(ingred_instruct.values())
    _print_average_and_median(instruct_counts)
    # NOTE(review): the output label says "nones", but what is counted is
    # values below the threshold, not only zero/missing ones. Label kept
    # unchanged so existing output stays comparable.
    none_counts = sum(1 for count in instruct_counts if count < rare_threshold)
    print("nones: " + str(none_counts))

    sorted_instruct = dict(sorted(ingred_instruct.items(), key=lambda item: item[1]))
    print(sorted_instruct)
def main():
    """Run the currently selected report: ingredient occurrence statistics.

    The two other reports defined in this file are left disabled; uncomment
    the corresponding call below to run one of them instead.
    """
    # dataset("Versions/vers3/full_dataset.json")
    # ingredients_before("data/dataset_fin.json")
    ingredient_stats()


# Only run the report when executed as a script, so that importing this
# module (e.g. to reuse one of the functions) does not trigger file I/O.
if __name__ == "__main__":
    main()