import json
import statistics


def dataset(full_dataset_path):
    """Load the recipe dataset at *full_dataset_path* and print summary
    statistics: recipe count, ingredient/comment/instruction totals,
    averages and medians, and how many recipes have a real picture.

    Returns None; all results go to stdout.
    """
    with open(full_dataset_path, "r") as whole_json_file:
        full_dataset = json.load(whole_json_file)

    # Guard: the original raised ZeroDivisionError on an empty dataset.
    if not full_dataset:
        print("number of recipes: 0")
        return

    ingredient_counter = 0
    ingredient_lengths = []
    pic_counter = 0
    comment_counter = 0
    no_comments = 0
    comment_lengths = []
    instruction_counter = 0
    instruction_lengths = []

    # This exact URL is the site's placeholder "no picture" image.
    no_picture_url = (
        "https://img.chefkoch-cdn.de/img/default/layout/recipe-nopicture.jpg"
    )

    for counter, recipe in enumerate(full_dataset.values(), start=1):
        ingredient_counter += len(recipe['ingredients'])
        ingredient_lengths.append(len(recipe['ingredients']))
        if recipe['image'] != no_picture_url:
            pic_counter += 1
        if recipe['comments']:
            comment_lengths.append(len(recipe['comments']))
            comment_counter += len(recipe['comments'])
        else:
            # Record 0 so the comment median covers comment-less recipes too.
            comment_lengths.append(0)
            no_comments += 1
        instruction_counter += len(recipe['instructions'])
        instruction_lengths.append(len(recipe['instructions']))
        print(counter)  # progress indicator, as in the original

    n_recipes = len(full_dataset)
    print("number of recipes: " + str(n_recipes))
    print("\n")
    print("average ingredient count: " + str(ingredient_counter / n_recipes))
    print("median ingredient count: " + str(statistics.median(ingredient_lengths)))
    print("\n")
    print("number of recipes with picture: " + str(pic_counter))
    print("\n")
    print("number of comments: " + str(comment_counter))
    print("number of recipes withOUT comments: " + str(no_comments))
    print("average amount of comments: " + str(comment_counter / n_recipes))
    print("median comment count: " + str(statistics.median(comment_lengths)))
    print("\n")
    print("total instruction count: " + str(instruction_counter))
    print("average instruction count: " + str(instruction_counter / n_recipes))
    print("median instruction count: " + str(statistics.median(instruction_lengths)))


def ingredients_before(full_dataset_path):
    """Count the distinct ingredient strings across all recipes in the
    dataset at *full_dataset_path* and print the total.

    Returns None; the count goes to stdout.
    """
    with open(full_dataset_path, "r") as whole_json_file:
        full_dataset = json.load(whole_json_file)

    # Set membership is O(1); the original scanned a list per ingredient (O(n^2)).
    all_ingredients = set()
    for counter, recipe in enumerate(full_dataset.values(), start=1):
        print(counter)  # progress indicator, as in the original
        all_ingredients.update(recipe['ingredients'])
    print(str(len(all_ingredients)))


def ingredient_stats(
    ingredients_list_path="data/mult_ingredients_nice.json",
    ingredients_instructions_path="data/cleaned_steps_occurrance.json",
):
    """Print occurrence statistics for ingredients as they appear in
    ingredient lists vs. in instruction steps.

    The paths were hard-coded in the original; they are now parameters
    with the same defaults, so existing callers are unaffected.

    Returns None; all results go to stdout.
    """
    with open(ingredients_list_path, "r") as whole_json_file:
        ingred_lists = json.load(whole_json_file)
    with open(ingredients_instructions_path, "r") as whole_json_file:
        ingred_instruct = json.load(whole_json_file)

    print("in ingredient lists: ")
    ingred_counts = list(ingred_lists.values())
    print("average: " + str(sum(ingred_counts) / len(ingred_counts)))
    print("median: " + str(statistics.median(ingred_counts)))

    print("in instructions: ")
    instruct_counts = list(ingred_instruct.values())
    # NOTE(review): despite the printed label "nones", this counts
    # ingredients occurring FEWER THAN 5 times, not zero times — the
    # original behavior and output string are kept for parity.
    none_counts = sum(1 for c in instruct_counts if c < 5)
    print("average: " + str(sum(instruct_counts) / len(instruct_counts)))
    print("median: " + str(statistics.median(instruct_counts)))
    print("nones: " + str(none_counts))

    # Dump the instruction-occurrence map sorted ascending by count.
    sorted_instruct = dict(sorted(ingred_instruct.items(), key=lambda item: item[1]))
    print(sorted_instruct)


def main():
    """Script entry point: run one of the analysis passes.

    The commented calls are the alternative passes; uncomment to run them.
    """
    before_dataset_path = "data/dataset_fin.json"
    full_dataset_path = "Versions/vers3/full_dataset.json"
    # dataset(full_dataset_path)
    # ingredients_before(before_dataset_path)
    ingredient_stats()


# Guard the entry point so importing this module does not immediately
# run ingredient_stats() (the original called main() unconditionally,
# which crashed on import when the data files were absent).
if __name__ == "__main__":
    main()