110 lines
4.1 KiB
Python
110 lines
4.1 KiB
Python
import json
|
|
import statistics
|
|
|
|
|
|
def dataset(full_dataset_path):
    """Print summary statistics for the recipe dataset at *full_dataset_path*.

    The file must hold a JSON object that maps each recipe URL to a record
    with the keys ``ingredients``, ``image``, ``comments`` and
    ``instructions``.  For every recipe a running counter is printed as a
    progress indicator; afterwards the totals, averages and medians of the
    ingredient, comment and instruction counts are printed, together with
    the number of recipes that have a real picture.

    An empty dataset only reports ``number of recipes: 0`` because averages
    and medians are undefined without data.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
        KeyError: if a recipe record lacks one of the expected keys.
    """
    # Image URL that marks a recipe as having no picture of its own.
    no_picture_url = (
        "https://img.chefkoch-cdn.de/img/default/layout/recipe-nopicture.jpg"
    )

    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default encoding.
    with open(full_dataset_path, "r", encoding="utf-8") as whole_json_file:
        full_dataset = json.load(whole_json_file)

    ingredient_lengths = []
    comment_lengths = []
    instruction_lengths = []
    pic_counter = 0

    for counter, recipe in enumerate(full_dataset.values(), start=1):
        ingredient_lengths.append(len(recipe['ingredients']))
        if recipe['image'] != no_picture_url:
            pic_counter += 1
        # An empty (or null) comment entry counts as zero comments.
        comments = recipe['comments']
        comment_lengths.append(len(comments) if comments else 0)
        instruction_lengths.append(len(recipe['instructions']))
        print(counter)

    recipe_count = len(full_dataset)
    print("number of recipes: " + str(recipe_count))
    if recipe_count == 0:
        # Nothing to average; avoid ZeroDivisionError / StatisticsError.
        return

    ingredient_counter = sum(ingredient_lengths)
    comment_counter = sum(comment_lengths)
    instruction_counter = sum(instruction_lengths)
    # A recipe is "without comments" exactly when its recorded length is 0.
    no_comments = comment_lengths.count(0)

    print("\n")
    print("average ingredient count: " + str(ingredient_counter / recipe_count))
    print("median ingredient count: " + str(statistics.median(ingredient_lengths)))
    print("\n")
    print("number of recipes with picture: " + str(pic_counter))
    print("\n")
    print("number of comments: " + str(comment_counter))
    print("number of recipes withOUT comments: " + str(no_comments))
    print("average amount of comments: " + str(comment_counter / recipe_count))
    print("median comment count: " + str(statistics.median(comment_lengths)))
    print("\n")
    print("total instruction count: " + str(instruction_counter))
    print("average instruction count: " + str(instruction_counter / recipe_count))
    print("median instruction count: " + str(statistics.median(instruction_lengths)))
|
|
|
|
|
|
def ingredients_before(full_dataset_path):
    """Print the number of distinct ingredients in the dataset file.

    The file must hold a JSON object whose values are records with an
    ``ingredients`` key.  A running counter is printed for every recipe as
    a progress indicator, followed by the count of distinct ingredient
    entries across all recipes.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
        KeyError: if a recipe record has no ``ingredients`` key.
    """
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default encoding.
    with open(full_dataset_path, "r", encoding="utf-8") as whole_json_file:
        full_dataset = json.load(whole_json_file)

    # Hashable entries (strings, numbers, ...) are de-duplicated in a set,
    # which avoids a linear scan per ingredient.  Unhashable JSON values
    # (lists / objects) fall back to an equality scan; the two groups can
    # never compare equal to each other, so the total stays exact.
    seen_hashable = set()
    seen_unhashable = []

    for counter, recipe in enumerate(full_dataset.values(), start=1):
        print(counter)
        for ingred in recipe['ingredients']:
            try:
                seen_hashable.add(ingred)
            except TypeError:
                if ingred not in seen_unhashable:
                    seen_unhashable.append(ingred)

    print(str(len(seen_hashable) + len(seen_unhashable)))
|
|
|
|
def _print_average_and_median(counts):
    """Print the mean and median of *counts*, or ``n/a`` when it is empty."""
    if not counts:
        # Mean and median are undefined without data; report that instead
        # of raising ZeroDivisionError / statistics.StatisticsError.
        print("average: n/a")
        print("median: n/a")
        return
    print("average: " + str(sum(counts) / len(counts)))
    print("median: " + str(statistics.median(counts)))


def ingredient_stats(
    ingredients_list_path="data/mult_ingredients_nice.json",
    ingredients_instructions_path="data/cleaned_steps_occurrance.json",
    rare_threshold=5,
):
    """Print occurrence statistics for ingredients in lists and instructions.

    Both files must hold a JSON object mapping a name to a numeric count.
    For each file the average and median of the counts are printed.  For
    the instructions file the number of entries whose count is below
    *rare_threshold* is printed as well (labelled ``nones``), followed by
    the complete mapping sorted by ascending count.

    Args:
        ingredients_list_path: JSON file with counts from ingredient lists.
        ingredients_instructions_path: JSON file with counts from the
            instruction texts.
        rare_threshold: counts strictly below this value are tallied in
            the ``nones`` line.

    Raises:
        OSError: if either file cannot be opened.
        json.JSONDecodeError: if either file is not valid JSON.
    """
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default encoding.
    with open(ingredients_list_path, "r", encoding="utf-8") as whole_json_file:
        ingred_lists = json.load(whole_json_file)

    with open(ingredients_instructions_path, "r", encoding="utf-8") as whole_json_file:
        ingred_instruct = json.load(whole_json_file)

    print("in ingredient lists: ")
    _print_average_and_median(list(ingred_lists.values()))

    print("in instructions: ")
    instruct_counts = list(ingred_instruct.values())
    _print_average_and_median(instruct_counts)

    none_counts = sum(1 for count in instruct_counts if count < rare_threshold)
    print("nones: " + str(none_counts))

    # Rebuild the mapping in ascending order of count for display.
    sorted_instruct = dict(sorted(ingred_instruct.items(), key=lambda item: item[1]))
    print(sorted_instruct)
|
|
|
|
|
|
def main():
    """Run the currently selected analysis.

    Only ``ingredient_stats`` is active.  The two dataset paths below are
    the inputs of the analyses that are switched off at the moment.
    """
    full_dataset_path = "Versions/vers3/full_dataset.json"
    before_dataset_path = "data/dataset_fin.json"

    # To run one of the other reports, re-enable the matching call:
    #   dataset(full_dataset_path)
    #   ingredients_before(before_dataset_path)
    ingredient_stats()
|
|
|
|
# Script entry point: main() is invoked unconditionally at module level.
# NOTE(review): this means the analysis also runs when the module is
# imported; consider an `if __name__ == "__main__":` guard if this file is
# ever used as a library.
main()