import json
import statistics


def dataset(full_dataset_path):
    """Load the recipe dataset at *full_dataset_path* and print summary
    statistics: recipe count, ingredient/comment/instruction totals,
    averages and medians, and how many recipes have a real picture.

    Returns None; all results go to stdout.
    """
    with open(full_dataset_path, "r") as whole_json_file:
        full_dataset = json.load(whole_json_file)

    # Guard: the original raised ZeroDivisionError on an empty dataset.
    if not full_dataset:
        print("number of recipes: 0")
        return

    ingredient_counter = 0
    ingredient_lengths = []
    pic_counter = 0
    comment_counter = 0
    no_comments = 0
    comment_lengths = []
    instruction_counter = 0
    instruction_lengths = []

    # This exact URL is the site's placeholder "no picture" image.
    no_picture_url = (
        "https://img.chefkoch-cdn.de/img/default/layout/recipe-nopicture.jpg"
    )

    for counter, recipe in enumerate(full_dataset.values(), start=1):
        ingredient_counter += len(recipe['ingredients'])
        ingredient_lengths.append(len(recipe['ingredients']))
        if recipe['image'] != no_picture_url:
            pic_counter += 1
        if recipe['comments']:
            comment_lengths.append(len(recipe['comments']))
            comment_counter += len(recipe['comments'])
        else:
            # Record 0 so the comment median covers comment-less recipes too.
            comment_lengths.append(0)
            no_comments += 1
        instruction_counter += len(recipe['instructions'])
        instruction_lengths.append(len(recipe['instructions']))
        print(counter)  # progress indicator, as in the original

    n_recipes = len(full_dataset)
    print("number of recipes: " + str(n_recipes))
    print("\n")
    print("average ingredient count: " + str(ingredient_counter / n_recipes))
    print("median ingredient count: " + str(statistics.median(ingredient_lengths)))
    print("\n")
    print("number of recipes with picture: " + str(pic_counter))
    print("\n")
    print("number of comments: " + str(comment_counter))
    print("number of recipes withOUT comments: " + str(no_comments))
    print("average amount of comments: " + str(comment_counter / n_recipes))
    print("median comment count: " + str(statistics.median(comment_lengths)))
    print("\n")
    print("total instruction count: " + str(instruction_counter))
    print("average instruction count: " + str(instruction_counter / n_recipes))
    print("median instruction count: " + str(statistics.median(instruction_lengths)))


def ingredients_before(full_dataset_path):
    """Count the distinct ingredient strings across all recipes in the
    dataset at *full_dataset_path* and print the total.

    Returns None; the count goes to stdout.
    """
    with open(full_dataset_path, "r") as whole_json_file:
        full_dataset = json.load(whole_json_file)

    # Set membership is O(1); the original scanned a list per ingredient (O(n^2)).
    all_ingredients = set()
    for counter, recipe in enumerate(full_dataset.values(), start=1):
        print(counter)  # progress indicator, as in the original
        all_ingredients.update(recipe['ingredients'])
    print(str(len(all_ingredients)))


def ingredient_stats(
    ingredients_list_path="data/mult_ingredients_nice.json",
    ingredients_instructions_path="data/cleaned_steps_occurrance.json",
):
    """Print occurrence statistics for ingredients as they appear in
    ingredient lists vs. in instruction steps.

    The paths were hard-coded in the original; they are now parameters
    with the same defaults, so existing callers are unaffected.

    Returns None; all results go to stdout.
    """
    with open(ingredients_list_path, "r") as whole_json_file:
        ingred_lists = json.load(whole_json_file)
    with open(ingredients_instructions_path, "r") as whole_json_file:
        ingred_instruct = json.load(whole_json_file)

    print("in ingredient lists: ")
    ingred_counts = list(ingred_lists.values())
    print("average: " + str(sum(ingred_counts) / len(ingred_counts)))
    print("median: " + str(statistics.median(ingred_counts)))

    print("in instructions: ")
    instruct_counts = list(ingred_instruct.values())
    # NOTE(review): despite the printed label "nones", this counts
    # ingredients occurring FEWER THAN 5 times, not zero times — the
    # original behavior and output string are kept for parity.
    none_counts = sum(1 for c in instruct_counts if c < 5)
    print("average: " + str(sum(instruct_counts) / len(instruct_counts)))
    print("median: " + str(statistics.median(instruct_counts)))
    print("nones: " + str(none_counts))

    # Dump the instruction-occurrence map sorted ascending by count.
    sorted_instruct = dict(sorted(ingred_instruct.items(), key=lambda item: item[1]))
    print(sorted_instruct)


def main():
    """Script entry point: run one of the analysis passes.

    The commented calls are the alternative passes; uncomment to run them.
    """
    before_dataset_path = "data/dataset_fin.json"
    full_dataset_path = "Versions/vers3/full_dataset.json"
    # dataset(full_dataset_path)
    # ingredients_before(before_dataset_path)
    ingredient_stats()


# Guard the entry point so importing this module does not immediately
# run ingredient_stats() (the original called main() unconditionally,
# which crashed on import when the data files were absent).
if __name__ == "__main__":
    main()