import json
import re
import gc
import spacy
from clean_dataset.lists import regex_list, cheese, milk, meat, sausage, fish, pasta, bread, rolls
from colorama import Fore

nlp = spacy.load('de_core_news_lg')

data_path = "data/"
home_path = "prep_dataset/"

# Rules used by replace_categories, tried in this order. Each rule is
# (trigger substrings, ingredient list, regex for the category word incl. its
# trailing character, trailing character to re-append after the replacement).
_CATEGORY_RULES = (
    (("Käse ", "Käsescheibe ", "Käsescheiben "), cheese, r"Käse(scheibe|scheiben)? ", " "),
    (("Käse.", "Käsescheibe.", "Käsescheiben."), cheese, r"Käse(scheibe|scheiben)?\.", "."),
    (("Milch ",), milk, r"Milch ", " "),
    (("Milch.",), milk, r"Milch\.", "."),
    (("Fleisch ",), meat, r"Fleisch ", " "),
    (("Fleisch.",), meat, r"Fleisch\.", "."),
    (("Wurst ", "Würste "), sausage, r"(Wurst|Würste) ", " "),
    (("Wurst.", "Würste."), sausage, r"(Wurst|Würste)\.", "."),
    (("Fisch ",), fish, r"Fisch ", " "),
    (("Fisch.",), fish, r"Fisch\.", "."),
    (("Nudeln ",), pasta, r"Nudeln ", " "),
    (("Nudeln.",), pasta, r"Nudeln\.", "."),
    (("Brot ",), bread, r"Brot ", " "),
    (("Brot.",), bread, r"Brot\.", "."),
    (("Semmel ", "Semmeln ", "Brötchen "), rolls, r"(Semmel(n)?|Brötchen) ", " "),
    (("Semmel.", "Semmeln.", "Brötchen."), rolls, r"(Semmel(n)?|Brötchen)\.", "."),
)


def make_list_str(given_list, reg):
    """Render *given_list* as a compact JSON array string.

    Every element is converted with ``str()``, has all matches of the compiled
    pattern *reg* replaced by a single space, and is then JSON-quoted.

    Returns the string ``"null"`` when *given_list* is ``None`` and ``"[]"``
    for an empty list.
    """
    # The previous guard was inverted (`if given_list: return "null"`), so every
    # non-empty list was rendered as "null".
    if given_list is None:
        return "null"
    return '[' + ','.join(
        json.dumps(reg.sub(r' ', str(element)), ensure_ascii=False)
        for element in given_list
    ) + ']'


def sep_sentences_file(input_file='dataset_sep_sentences.json', output_file='sep_sentences.json'):
    """Write a JSON object mapping each recipe key to its 'instructions' only.

    Reads ``data_path + input_file`` and streams the result to
    ``data_path + output_file`` one recipe at a time. Prints a running recipe
    counter and finally the total number of instruction entries.
    """
    print("Making steps file")
    with open(data_path + input_file, "r") as whole_sep_json_file:
        whole_dataset = json.load(whole_sep_json_file)

    sentence_count = 0
    with open(data_path + output_file, "w") as sep_file:
        sep_file.write('{')
        for recipe_count, recipe in enumerate(whole_dataset, start=1):
            if recipe_count != 1:
                sep_file.write(',')
            print(recipe_count)
            instructions = whole_dataset[recipe]['instructions']
            sentence_count += len(instructions)
            # json.dumps on the key escapes quotes/backslashes; for plain keys the
            # output is identical to the former manual '"' + recipe + '":'.
            sep_file.write(json.dumps(recipe, ensure_ascii=False) + ':'
                           + json.dumps(instructions, ensure_ascii=False, indent=4) + '\n')
        sep_file.write('}')

    print("sentences: " + str(sentence_count))


def sep_sentences(whole_dataset):
    """Split every recipe's instruction text into sentences and dump the dataset.

    For each recipe the 'instructions' value is run through the spaCy pipeline
    ``nlp``; the resulting sentences (whitespace runs collapsed to one space)
    replace the original value in *whole_dataset* (mutated in place). Each
    recipe is streamed to ``data_path + "dataset_sep_sentences.json"``.
    """
    print("Separate sentences in instructions")
    reg = re.compile(r'[\s]+')

    sentence_count = 0
    with open(data_path + "dataset_sep_sentences.json", "w") as dataset_sep_file:
        dataset_sep_file.write('{')
        for recipe_count, recipe in enumerate(whole_dataset, start=1):
            if recipe_count != 1:
                dataset_sep_file.write(',')
            print(recipe_count)

            dataset_sep_file.write(json.dumps(recipe, ensure_ascii=False) + ':')
            sentences = list(nlp(whole_dataset[recipe]['instructions']).sents)
            sentence_count += len(sentences)
            whole_dataset[recipe]['instructions'] = [reg.sub(r' ', str(sent)) for sent in sentences]
            json.dump(whole_dataset[recipe], dataset_sep_file, ensure_ascii=False, indent=4)

            # periodically force a collection while processing the dataset
            if recipe_count % 10000 == 0:
                gc.collect()
        dataset_sep_file.write('}')

    print("sentences: " + str(sentence_count))


def check_occurrances(step_file='sep_sentences.json', out_file='ingredient_occurrance.json'):
    """Count how often every ingredient name occurs in all steps combined.

    Ingredient names are the keys of ``data_path + "mult_ingredients_nice.json"``;
    steps come from ``data_path + step_file``. Counting is a plain substring
    count over the concatenation of all steps. The counts are written to
    ``data_path + out_file``; ingredients with fewer than 10 hits are printed.
    """
    print("Checking for occurrances of ingredients in steps")
    with open(data_path + "mult_ingredients_nice.json", "r") as ingredients_json_file:
        all_ingredients = json.load(ingredients_json_file)
    with open(data_path + step_file, "r") as steps_json_file:
        all_steps = json.load(steps_json_file)

    # join once instead of repeated string += (quadratic)
    all_steps_str = "".join(step for steps in all_steps.values() for step in steps)

    occurrances = {}
    small_amounts = []
    for count_ingr, ingr in enumerate(all_ingredients, start=1):
        print(count_ingr)
        counts = all_steps_str.count(ingr)
        if counts < 10:
            small_amounts.append((ingr, counts))
        occurrances[ingr] = counts

    print(small_amounts)
    print(len(small_amounts))
    with open(data_path + out_file, "w") as ingredient_json_file:
        json.dump(occurrances, ingredient_json_file, ensure_ascii=False, indent=4)


def _apply_regexes(step, indices, replacement):
    """Apply ``regex_list[i].sub(replacement, ...)`` to *step* for each index, in order."""
    for index in indices:
        step = regex_list[index].sub(replacement, step)
    return step


def replace_alt_words(step, ingredients):
    """Replace alternative spellings in *step* by the names used in *ingredients*.

    Which pattern lives at which ``regex_list`` index is taken from the
    original inline notes (quoted in the comments below); the patterns
    themselves are defined in ``clean_dataset.lists``.

    Returns the rewritten step.
    """
    # Replace ice cubes and crushed ice, but only if the recipe has no ice cream
    ice_types = ["Eis_Schokolade", "Eis_Vanillegeschmack", "Eis_Vanille", "Eis"]
    has_ice_cream = any(ice in ingredients for ice in ice_types)
    if "Crushed_Ice" in ingredients and not has_ice_cream:
        # replace ice, Eis with Crushed_Ice
        step = _apply_regexes(step, (28, 29), 'Crushed_Ice')
    if "Eiswürfel" in ingredients and not has_ice_cream:
        # replace ice, Eis with Eiswürfel
        step = _apply_regexes(step, (29,), 'Eiswürfel')

    # Improve citrus zest names
    if "Zitronenabrieb" in ingredients:
        # replace Zitronenschale with Zitronenabrieb
        step = _apply_regexes(step, (30,), 'Zitronenabrieb')
    if "Limettenabrieb" in ingredients:
        # replace Limettenschale with Limettenabrieb
        step = _apply_regexes(step, (31,), 'Limettenabrieb')
    if "Orangenabrieb" in ingredients:
        # replace Orangenschale with Orangenabrieb
        step = _apply_regexes(step, (32,), 'Orangenabrieb')
    if "Mandarinenabrieb" in ingredients:
        # replace Mandarinenschale with Mandarinenabrieb
        step = _apply_regexes(step, (33,), 'Mandarinenabrieb')

    for ingred in ingredients:
        # change certain ingredient names
        if "Paprika" in ingred:
            # replace Paprikaschote(n)?
            step = _apply_regexes(step, (1,), 'Paprika')
        if "Lauch" in ingred:
            # replace Porree
            step = _apply_regexes(step, (2,), 'Lauch')
        if "Zucchini" in ingred:
            # replace Zucchino
            step = _apply_regexes(step, (3,), 'Zucchini')
        if "Sahne" in ingred and "saure" not in ingred:
            # replace Schlagsahne, Schlagobers, süße Sahne
            step = _apply_regexes(step, (6, 7, 8), 'Sahne')
        if "Zimt" in ingred:
            # replace Zimtpulver
            step = _apply_regexes(step, (9,), 'Zimt')
        if "Wasser" in ingred:
            # replace "Wasser ohne Kohlensäure", "Mineralwasser ohne Kohlensäure"
            step = _apply_regexes(step, (10, 11), 'Wasser')
        if "Sprudelwasser" in ingred and "Wasser" not in ingredients:
            # replace Wasser, Mineralwasser, Sodawasser
            step = _apply_regexes(step, (12, 13, 14), 'Sprudelwasser')
        if "Ingwer" in ingred:
            # replace Ingwerwurzel
            step = _apply_regexes(step, (15,), 'Ingwer')
        if "Quark" in ingred:
            # replace Topfen
            step = _apply_regexes(step, (16,), 'Quark')
        if "Karotte" in ingred:
            # replace Möhre
            step = _apply_regexes(step, (17,), 'Karotte')
        if 'Puderzucker' in ingred:
            # replace Staubzucker
            step = _apply_regexes(step, (18,), 'Puderzucker')
        if "Pfeffer" in ingred:
            # replace Pfefferkörner, Pfefferbeeren
            step = _apply_regexes(step, (19, 20), 'Pfeffer')
        if "Kartoffel" in ingred:
            # replace Erdäpfel, Erdapfel
            step = _apply_regexes(step, (21, 22), 'Kartoffel')
        if "Eigelb" in ingred:
            # replace (Eidotter|Dotter)
            step = _apply_regexes(step, (23,), 'Eigelb')
        if "Eier" in ingred:
            # replace Freilandeier, Vollei(er)?
            step = _apply_regexes(step, (24, 25), 'Eier')
        if "Eiweiß" in ingred:
            # replace Eiklar, Eischnee
            step = _apply_regexes(step, (26, 27), 'Eiweiß')

    return step


def try_replace(repl, ingred, step, caps=False, regex_short=True):
    """Replace words containing *repl* in *step* by the full ingredient name.

    Nothing is replaced unless ``regex_list[0]`` matches the start of *repl*
    (per the original note that pattern is '^[A-ZÄÜÖ]', i.e. a leading capital).

    Args:
        repl: word (part) to search for; inserted into the pattern unescaped.
        ingred: full ingredient name used as replacement.
        step: instruction sentence to rewrite.
        caps: if true, match case-insensitively.
        regex_short: if true, allow at most 3 letters after *repl*; otherwise
            any number of letters.

    Returns:
        ``(step, found)``; *found* is true when a replacement was attempted and
        *ingred* is contained in the resulting step.
    """
    # match all capitals '^[A-ZÄÜÖ]'
    res = re.match(regex_list[0], repl)
    found = False
    if res:
        # The word must be preceded by whitespace and followed by whitespace,
        # '.' or ','; that trailing character is captured and re-emitted.
        # NOTE(review): the group name was lost in the source this was
        # reconstructed from; "end" is only used inside this function.
        suffix = r'[a-zA-ZäÄöÖüÜß]{0,3}' if regex_short else r'[a-zA-ZäÄöÖüÜß]*'
        repl_whole = r'\s[a-zA-ZäÄöÖüÜß]*' + repl + suffix + r'(?P<end>[\s.,])'
        flags = re.IGNORECASE if caps else 0
        step = re.sub(repl_whole, " " + ingred + r"\g<end>", step, flags=flags)
        # NOTE(review): this check is assumed to sit inside `if res:`; the
        # original indentation was not recoverable - confirm.
        if ingred in step:
            found = True
    return step, found


def replace_categories(step, ingred):
    """Replace a generic category word in *step* by the concrete *ingred*.

    A rule from ``_CATEGORY_RULES`` applies when one of its trigger strings is
    in *step* and *ingred* is in the rule's ingredient list; the first such
    rule is used (e.g. "Das Brot_Roggen ..." instead of "Das Brot ...").

    Returns:
        ``(step, changed)``; *changed* is ``False`` when no rule applied or
        nothing was replaced.
    """
    # avoid Vollmilch_Joghurt problem
    if "Joghurt" in ingred or "joghurt" in ingred or "Schoko" in ingred or "schoko" in ingred:
        return step, False

    for triggers, word_list, reg, end in _CATEGORY_RULES:
        if any(trigger in step for trigger in triggers) and ingred in word_list:
            break
    else:
        return step, False

    res = False
    orig_step = step

    # "<word>-<category>" / "<word> <category>": replace the whole expression if
    # the preceding word (>= 4 letters) is part of the ingredient name.
    # The result is rebuilt from the match offsets of the unmodified string;
    # splicing into `step` while iterating would invalidate later offsets.
    long_reg = r"(?P<pre>[a-zA-ZäöüßÄÖÜ]{4,})(\-| )" + reg
    pieces = []
    last = 0
    for match in re.finditer(long_reg, step):
        if match.group("pre") in ingred:
            res = True
            pieces.append(step[last:match.start()])
            pieces.append(ingred + end)
            last = match.end()
    pieces.append(step[last:])
    step = "".join(pieces)

    # remaining stand-alone category words
    step = re.sub(r"( |^)" + reg, " " + ingred + end, step)

    if orig_step != step:
        res = True

    return step, res


def clean_steps(input_file='dataset_sep_sentences.json', output='dataset_clean_steps.json'):
    """Rewrite every instruction step so that it uses the full ingredient names.

    Loads ``data_path + input_file``, processes each recipe's 'instructions'
    against its 'ingredients' (longest names first) and writes the updated
    dataset to ``data_path + output``.
    """
    print("Cleaning recipe steps")
    with open(data_path + input_file, "r") as source_file:
        whole_dataset = json.load(source_file)

    # (marker contained in the ingredient part, generic word to look for instead);
    # checked in this order, first hit wins
    generic_words = (
        ("brühe", "brühe"),
        ("mehl", "Mehl"),
        ("öl", "Öl"),
        ("Maiskörner", "Mais"),
        ("honig", "Honig"),
        ("tomate", "Tomate"),
    )
    # (caps, regex_short) combinations tried one after another until one succeeds:
    # plain, case-insensitive, longer words ("Käsescheibe" not "Käse"),
    # longer words case-insensitive
    fallback_modes = ((False, True), (True, True), (False, False), (True, False))

    for rec_count, recipe in enumerate(whole_dataset, start=1):
        if rec_count % 1000 == 0:
            print(rec_count)
        entry = whole_dataset[recipe]
        ingredients = sorted(entry['ingredients'], key=len, reverse=True)

        new_steps = []
        for step in entry['instructions']:
            # replace certain words (Schlagober, Eis gestoßen)
            step = replace_alt_words(step, ingredients)

            for ingred in ingredients:
                for ingr_part in ingred.split("_"):
                    repl = ingr_part
                    # try replacing the part with the whole ingredient name
                    step, res = try_replace(repl, ingred, step)
                    if res:
                        continue

                    # make certain changes and try replacing again
                    generic = next((word for marker, word in generic_words if marker in ingr_part), None)
                    if generic is not None:
                        repl = generic
                    elif "Knoblauchzehe" in ingr_part:
                        # replace Knoblauchzehe(n)?
                        step = regex_list[4].sub('Knoblauchzehe', step)
                        # replace Knoblauch(\s)?(-)?(\s)?(z|Z)ehe(n)?
                        step = regex_list[5].sub('Knoblauchzehe', step)
                    elif "Knoblauch" in ingr_part:
                        # replace Knoblauchzehe(n)?
                        step = regex_list[4].sub('Knoblauch', step)
                        # replace Knoblauch(\s)?(-)?(\s)?(z|Z)ehe(n)?
                        step = regex_list[5].sub('Knoblauch', step)
                    else:
                        # replace category words with exact ingredient
                        # (e.g. "Das Brot_Roggen ..." instead of "Das Brot ...")
                        step, res = replace_categories(step, ingred)

                    for caps, regex_short in fallback_modes:
                        if res:
                            break
                        step, res = try_replace(repl, ingred, step, caps=caps, regex_short=regex_short)

            new_steps.append(step)

        entry['instructions'] = new_steps

    with open(data_path + output, "w") as target_file:
        json.dump(whole_dataset, target_file, ensure_ascii=False, indent=4)
    gc.collect()


def count_occurrances(amount, file_name='cleaned_steps_occurrance.json'):
    """Print every ingredient whose count is at most *amount*, then a summary line.

    The counts are read from the JSON object stored at ``data_path + file_name``
    (ingredient name -> count).
    """
    with open(data_path + file_name, "r") as counts_file:
        counts = json.load(counts_file)

    rare = [name for name, hits in counts.items() if hits <= amount]
    for name in rare:
        print(name)
    print("There are " + str(len(rare)) + " ingredients with up to " + str(amount) + " occurrances")


def check_word(word, ingr_list_file, input_file='dataset_sep_sentences.json', output='ingredients.txt'):
    """Report recipes that use a category word without a matching ingredient.

    Checks for words of certain categories (that are not in the category list
    yet) in the instructions. For the first step of a recipe that contains
    *word* while neither an entry of the category list nor the bare category
    word is among the recipe's ingredients, the recipe name, key, step and
    those ingredients that are keys of *ingr_list_file* are appended to
    *output*. The user can then see which words should be put into the list of
    replacement words.

    Args:
        word: category word including its trailing character, e.g. "Brot " or "Brot.".
        ingr_list_file: path of a JSON object whose keys are ingredient names.
        input_file: dataset file name, relative to ``data_path``.
        output: path of the text file that is appended to.
    """
    with open(ingr_list_file, "r") as json_file:
        whole_word_list = json.load(json_file).keys()

    # accepted word -> (category list, bare category word)
    categories = {
        "Käse ": (cheese, "Käse"),
        "Käse.": (cheese, "Käse"),
        "Käsescheibe": (cheese, "Käse"),
        "Milch ": (milk, "Milch"),
        "Milch.": (milk, "Milch"),
        "Fleisch ": (meat, "Fleisch"),
        "Fleisch.": (meat, "Fleisch"),
        "Wurst ": (sausage, "Wurst"),
        "Wurst.": (sausage, "Wurst"),
        "Würste ": (sausage, "Würste"),
        "Würste.": (sausage, "Würste"),
        "Fisch ": (fish, "Fisch"),
        "Fisch.": (fish, "Fisch"),
        "Nudeln ": (pasta, "Nudeln"),
        "Nudeln.": (pasta, "Nudeln"),
        "Brot ": (bread, "Brot"),
        "Brot.": (bread, "Brot"),
        "Semmel ": (rolls, "Semmel"),
        "Semmel.": (rolls, "Semmel"),
        "Semmeln ": (rolls, "Semmel"),
        "Semmeln.": (rolls, "Semmel"),
        "Brötchen ": (rolls, "Brötchen"),
        "Brötchen.": (rolls, "Brötchen"),
    }
    if word not in categories:
        print(Fore.LIGHTRED_EX + "This category can't be processed correctly!")
        return
    word_list, word_pure = categories[word]

    with open(data_path + input_file, "r") as steps_json_file:
        whole_dataset = json.load(steps_json_file)

    for recipe, data in whole_dataset.items():
        # becomes true once the recipe is known to be covered or has been reported
        handled = False
        for step in data['instructions']:
            if word not in step:
                continue
            recipe_ingredients = data['ingredients']
            if word_pure in recipe_ingredients or any(elem in recipe_ingredients for elem in word_list):
                handled = True
            if handled:
                continue

            handled = True
            with open(output, "a") as report:
                known = [ingr for ingr in recipe_ingredients if ingr in whole_word_list]
                if known:
                    report.write("\n" + data['name'] + "\n")
                    report.write(recipe + "\n")
                    report.write(step + "\n")
                    report.writelines(ingr + "\n" for ingr in known)

def main():
    """Run the pipeline: split sentences, clean the steps, export the steps only.

    The commented-out calls below are kept as a record of earlier manual runs.
    """
    print("started")

    # separate sentences
    cleaned_path = data_path + "dataset_cleaned_nice.json"
    with open(cleaned_path, "r") as cleaned_file:
        cleaned_dataset = json.load(cleaned_file)
    sep_sentences(cleaned_dataset)

    # replace ingredient mentions in the separated sentences
    clean_steps(input_file='dataset_sep_sentences.json', output='dataset_cleaned_steps.json')

    # write a file that only contains the steps of every recipe
    sep_sentences_file(input_file='dataset_cleaned_steps.json', output_file='cleaned_sep_sentences.json')

    # --- occurrence statistics ---
    # check_occurrances(step_file='cleaned_sep_sentences.json', out_file='cleaned_steps_occurrance.json')
    # count_occurrances(300)

    # --- category word checks ---
    # check_word("Käse ")
    # check_word("Käse.")
    # check_word("Käsescheibe")
    # check_word("Milch ")
    # check_word("Fleisch ", "data/meats.json")
    # check_word("Fleisch.", "data/meats.json")
    # check_word("Wurst ", "data/sausage.json")
    # check_word("Wurst.", "data/sausage.json")
    # check_word("Würste ", "data/sausage.json")
    # check_word("Würste.", "data/sausage.json")
    # check_word("Fisch ", "data/fish.json")
    # check_word("Fisch.", "data/fish.json")
    # check_word("Nudeln ", "data/pasta.json")
    # check_word("Nudeln.", "data/pasta.json")
    # check_word("Brot ", "data/bread.json")
    # check_word("Brot.", "data/bread.json")
    # check_word("Semmel ", "data/rolls.json")
    # check_word("Semmel.", "data/rolls.json")
    # check_word("Semmeln ", "data/rolls.json")
    # check_word("Semmeln.", "data/rolls.json")
    # check_word("Brötchen ", "data/rolls.json")
    # check_word("Brötchen.", "data/rolls.json")
    # check_word("test", "data/rolls.json")

    # output = replace_categories("Den Käse auf das Burger Brot legen", "Burgerbrötchen")
    # print(output)

    # --- test run ---
    # clean_steps(input='dataset_test.json', output='dataset_cleaned_steps_test.json')
    # sep_sentences_file(input_file='dataset_cleaned_steps_test.json', output_file='cleaned_sep_sentences_test.json')
    # check_occurrances(step_file='cleaned_sep_sentences_test.json', out_file='cleaned_steps_occurrance_test.json')

    # --- run with default file names ---
    # clean_steps()
    # sep_sentences_file(input_file='dataset_clean_steps.json', output_file='cleaned_sep_sentences.json')
    # check_occurrances(step_file='cleaned_sep_sentences.json', out_file='cleaned_steps_occurrance.json')

# Run the pipeline only when executed as a script, so that importing this
# module (e.g. to reuse a single function) does not start the whole run.
if __name__ == "__main__":
    main()