# MasterarbeitCode/clean_dataset/dataset_instructions_helpers.py

import json
import re
import gc
import spacy
from clean_dataset.lists import regex_list, cheese, milk, meat, sausage, fish, pasta, bread, rolls
from colorama import Fore

# spaCy pipeline 'de_core_news_lg', loaded once when the module is imported;
# sep_sentences() uses it to split instruction texts into sentences.
nlp = spacy.load('de_core_news_lg')
# Path prefix (relative to the working directory) that most functions in this
# file prepend to the JSON file names they read and write.
data_path = "data/"
# NOTE(review): not referenced in the visible part of this file - confirm
# whether it is still needed.
home_path = "prep_dataset/"


def make_list_str(given_list, reg):
    """Render a list as a JSON-style array string.

    Each element is converted with ``str``, every match of ``reg`` inside it is
    replaced by a single space, and the result is wrapped in double quotes.
    Elements are not JSON-escaped: a double quote or backslash inside an
    element ends up verbatim in the output.

    :param given_list: elements to render; may be ``None`` or empty.
    :param reg: compiled regular expression whose matches are replaced by one
        space in every element.
    :return: ``"null"`` for a missing or empty list, otherwise a string such
        as ``["a","b"]`` (no whitespace between the entries).
    """
    # The guard was inverted before (`if given_list`), which returned "null"
    # for every non-empty list and never rendered any element.
    if not given_list:
        return "null"
    quoted = ('"' + reg.sub(r' ', str(element)) + '"' for element in given_list)
    return '[' + ','.join(quoted) + ']'


# creates dataset of only recipe URLs and steps
def sep_sentences_file(input_file='dataset_sep_sentences.json', output_file='sep_sentences.json'):
    """Write a JSON file that maps each recipe key to its instruction list only.

    The input file is loaded completely; the output object is streamed entry by
    entry instead of being serialised in one call. The running recipe number is
    printed for every entry and the total number of instruction sentences is
    printed at the end.

    :param input_file: name of the JSON file below ``data_path`` to read; every
        value must provide an ``'instructions'`` entry.
    :param output_file: name of the JSON file below ``data_path`` to write.
    """
    print("Making steps file")
    with open(data_path + input_file, "r") as whole_sep_json_file:
        whole_dataset = json.load(whole_sep_json_file)

    sentence_count = 0
    with open(data_path + output_file, "w") as sep_file:
        sep_file.write('{')
        for recipe_count, (recipe, entry) in enumerate(whole_dataset.items(), start=1):
            if recipe_count != 1:
                sep_file.write(',')
            print(recipe_count)
            instructions = entry['instructions']
            sentence_count += len(instructions)
            # Serialise the key with json.dumps so that quotes or backslashes in
            # a recipe key are escaped; plain concatenation produced invalid
            # JSON for such keys.
            sep_file.write(json.dumps(recipe, ensure_ascii=False) + ':'
                           + json.dumps(instructions, ensure_ascii=False, indent=4) + '\n')

        sep_file.write('}')
        print("sentences: " + str(sentence_count))


def sep_sentences(whole_dataset):
    """Split every recipe's instruction text into sentences and write the dataset.

    For each recipe the ``'instructions'`` text is run through the module-level
    spaCy pipeline ``nlp``; each resulting sentence has its whitespace runs
    collapsed to a single space. The recipe entry is updated in place (its
    ``'instructions'`` value becomes the list of sentence strings) and then
    streamed to ``data_path + "dataset_sep_sentences.json"``.

    :param whole_dataset: dict mapping a recipe key to a dict that has an
        ``'instructions'`` text; the nested dicts are modified in place.
    """
    print("Separate sentences in instructions")
    reg = re.compile(r'[\s]+')
    sentence_count = 0
    with open(data_path + "dataset_sep_sentences.json", "w") as dataset_sep_file:
        dataset_sep_file.write('{')
        for recipe_count, (recipe, entry) in enumerate(whole_dataset.items(), start=1):
            if recipe_count != 1:
                dataset_sep_file.write(',')
            print(recipe_count)

            # json.dumps escapes quotes/backslashes in the key; the previous
            # manual '"' + key + '"' wrote invalid JSON for such keys.
            dataset_sep_file.write(json.dumps(recipe, ensure_ascii=False) + ':')
            sentences = list(nlp(entry['instructions']).sents)
            sentence_count += len(sentences)
            entry['instructions'] = [reg.sub(r' ', str(sentence)) for sentence in sentences]
            json.dump(entry, dataset_sep_file, ensure_ascii=False, indent=4)
            # Periodic explicit collection while processing the dataset.
            if recipe_count % 10000 == 0:
                gc.collect()

        dataset_sep_file.write('}')
        print("sentences: " + str(sentence_count))


def check_occurrances(step_file='sep_sentences.json', out_file='ingredient_occurrance.json'):
    """Count how often each known ingredient name occurs in all recipe steps.

    The ingredient names are the keys of ``data_path + "mult_ingredients_nice.json"``.
    All steps of all recipes are concatenated (without a separator, so a match
    may straddle two adjacent steps) and every name is counted as a
    non-overlapping substring of that text. Names with fewer than 10 hits are
    printed together with their number; all counts are written to ``out_file``.

    :param step_file: JSON file below ``data_path`` mapping a recipe key to its
        list of step strings.
    :param out_file: JSON file below ``data_path`` that receives the mapping
        ingredient name -> count.
    """
    print("Checking for occurrances of ingredients in steps")
    with open(data_path + "mult_ingredients_nice.json", "r") as ingredients_json_file:
        all_ingredients = json.load(ingredients_json_file)
    with open(data_path + step_file, "r") as steps_json_file:
        all_steps = json.load(steps_json_file)

    # One join instead of repeated `+=` on an ever-growing string.
    all_steps_str = "".join(step for steps in all_steps.values() for step in steps)

    occurrances = {}
    small_amounts = []
    for count_ingr, ingr in enumerate(all_ingredients, start=1):
        print(count_ingr)
        counts = all_steps_str.count(ingr)
        if counts < 10:
            small_amounts.append((ingr, counts))
        occurrances[ingr] = counts

    print(small_amounts)
    print(len(small_amounts))
    with open(data_path + out_file, "w") as ingredient_json_file:
        json.dump(occurrances, ingredient_json_file, ensure_ascii=False, indent=4)


def replace_alt_words(step, ingredients):
    """Normalise alternative ingredient words in one step.

    The substitutions use the pre-compiled patterns from ``regex_list`` and are
    applied in a fixed order: first the ice rules, then the citrus-zest rules
    (both keyed on exact membership in ``ingredients``), then, per ingredient,
    the rules keyed on a substring of the ingredient name. The notes next to the
    rules repeat the original author's description of what each pattern matches.

    :param step: instruction sentence to rewrite.
    :param ingredients: ingredient names of the recipe the step belongs to.
    :return: the rewritten step.
    """
    def has_ice_cream():
        # Ice-cream ingredients block the "Eis" -> ice replacements below.
        ice_cream_names = ("Eis_Schokolade", "Eis_Vanillegeschmack", "Eis_Vanille", "Eis")
        return any(name in ingredients for name in ice_cream_names)

    # replace ice cubes and crushed ice
    if "Crushed_Ice" in ingredients and not has_ice_cream():
        # replace ice, Eis with Crushed_Ice
        for index in (28, 29):
            step = regex_list[index].sub('Crushed_Ice', step)
    if "Eiswürfel" in ingredients and not has_ice_cream():
        # replace ice, Eis with Eiswürfel
        step = regex_list[29].sub('Eiswürfel', step)

    # improve citrus zest: "<fruit>schale" becomes the "<fruit>abrieb" ingredient
    zest_rules = (
        ("Zitronenabrieb", 30),
        ("Limettenabrieb", 31),
        ("Orangenabrieb", 32),
        ("Mandarinenabrieb", 33),
    )
    for zest, index in zest_rules:
        if zest in ingredients:
            step = regex_list[index].sub(zest, step)

    # (substring of the ingredient name == replacement text, regex_list indices)
    name_rules = (
        ("Paprika", (1,)),               # Paprikaschote(n)?
        ("Lauch", (2,)),                 # Porree
        ("Zucchini", (3,)),              # Zucchino
        ("Sahne", (6, 7, 8)),            # Schlagsahne, Schlagobers, süße Sahne
        ("Zimt", (9,)),                  # Zimtpulver
        ("Wasser", (10, 11)),            # (Mineral)wasser ohne Kohlensäure
        ("Sprudelwasser", (12, 13, 14)),  # Wasser, Mineralwasser, Sodawasser
        ("Ingwer", (15,)),               # Ingwerwurzel
        ("Quark", (16,)),                # Topfen
        ("Karotte", (17,)),              # Möhre
        ("Puderzucker", (18,)),          # Staubzucker
        ("Pfeffer", (19, 20)),           # Pfefferkörner, Pfefferbeeren
        ("Kartoffel", (21, 22)),         # Erdäpfel, Erdapfel
        ("Eigelb", (23,)),               # (Eidotter|Dotter)
        ("Eier", (24, 25)),              # Freilandeier, Vollei(er)?
        ("Eiweiß", (26, 27)),            # Eiklar, Eischnee
    )
    for name in ingredients:
        for keyword, indices in name_rules:
            if keyword not in name:
                continue
            # Sour cream must not be turned into plain cream.
            if keyword == "Sahne" and "saure" in name:
                continue
            # Only map plain water words to sparkling water when the recipe
            # has no separate "Wasser" ingredient.
            if keyword == "Sprudelwasser" and "Wasser" in ingredients:
                continue
            for index in indices:
                step = regex_list[index].sub(keyword, step)

    return step


def try_replace(repl, ingred, step, caps=False, regex_short=True):
    """Replace words containing ``repl`` in ``step`` with the full ingredient name.

    Nothing happens unless ``regex_list[0]`` matches at the start of ``repl``
    (per the original note that pattern is '^[A-ZÄÜÖ]', i.e. a leading capital
    letter). A word is replaced when it is preceded by whitespace, consists of
    optional letters, ``repl`` and a letter suffix, and is followed by
    whitespace, '.' or ','. The following character is kept.

    :param repl: text to look for inside words of ``step``; taken literally.
    :param ingred: ingredient name written in place of the matched word.
    :param step: instruction sentence to rewrite.
    :param caps: if true, match case-insensitively.
    :param regex_short: if true, allow at most three letters after ``repl``;
        otherwise any number of letters.
    :return: tuple ``(step, found)``; ``found`` is true when ``ingred`` occurs
        in the resulting step (also if it was already there), and always false
        when ``repl`` failed the ``regex_list[0]`` check.
    """
    # match all capitals '^[A-ZÄÜÖ]'
    if not re.match(regex_list[0], repl):
        return step, False

    letter = r'[a-zA-ZäÄöÖüÜß]'
    suffix = letter + (r'{0,3}' if regex_short else r'*')
    # Raw strings avoid invalid escape sequences such as '\s'; re.escape keeps
    # characters like '.', '(' or '+' in an ingredient part from being read as
    # regex syntax.
    repl_whole = r'\s' + letter + r'*' + re.escape(repl) + suffix + r'(?P<end>[\s.,])'
    flags = re.IGNORECASE if caps else 0

    # A callable replacement inserts `ingred` verbatim, so a backslash in the
    # name is not interpreted as a template escape.
    step = re.sub(repl_whole, lambda match: " " + ingred + match.group("end"), step, flags=flags)
    return step, ingred in step


def replace_categories(step, ingred):
    """Replace a generic category word in ``step`` with the concrete ingredient.

    Example from the caller: "Das Brot ..." becomes "Das Brot_Roggen ..." when
    ``ingred`` is a member of the ``bread`` list. Categories are tried in a
    fixed order (cheese, milk, meat, sausage, fish, pasta, bread, rolls); for
    each one the generic word followed by a space is tried before the generic
    word followed by a full stop. The first category whose generic word occurs
    in ``step`` and whose word list contains ``ingred`` is used.

    :param step: instruction sentence to rewrite.
    :param ingred: ingredient name of the recipe.
    :return: tuple ``(step, replaced)``; ``replaced`` is true when a
        substitution was made.
    """
    # avoid Vollmilch_Joghurt problem
    if any(marker in ingred for marker in ("Joghurt", "joghurt", "Schoko", "schoko")):
        return step, False

    # (members of the category, trigger words, regex for the generic word)
    categories = (
        # The suffix is optional so that the plain word "Käse" is replaced too;
        # without the "?" only "Käsescheibe(n)" could ever match although
        # "Käse " / "Käse." are trigger words.
        (cheese, ("Käse", "Käsescheibe", "Käsescheiben"), r"Käse(scheibe|scheiben)?"),
        (milk, ("Milch",), r"Milch"),
        (meat, ("Fleisch",), r"Fleisch"),
        (sausage, ("Wurst", "Würste"), r"(Wurst|Würste)"),
        (fish, ("Fisch",), r"Fisch"),
        (pasta, ("Nudeln",), r"Nudeln"),
        (bread, ("Brot",), r"Brot"),
        (rolls, ("Semmel", "Semmeln", "Brötchen"), r"(Semmel(n)?|Brötchen)"),
    )
    # (literal character after the generic word, its regex form)
    terminators = ((" ", r" "), (".", r"\."))

    reg = None
    end = None
    for word_list, triggers, generic in categories:
        if ingred not in word_list:
            continue
        for terminator, terminator_reg in terminators:
            if any(trigger + terminator in step for trigger in triggers):
                reg = generic + terminator_reg
                end = terminator
                break
        if reg is not None:
            break
    if reg is None:
        return step, False

    orig_step = step
    res = False

    def swap_prefixed(match):
        # "<prefix> Brot" / "<prefix>-Brot": replace the whole phrase when the
        # prefix is part of the ingredient name, otherwise leave it untouched.
        nonlocal res
        if match.group("pre") in ingred:
            res = True
            return ingred + end
        return match.group(0)

    # re.sub applies all replacements in one pass. Slicing the step by match
    # offsets while it was being modified corrupted the text as soon as there
    # was more than one match.
    long_reg = r"(?P<pre>[a-zA-ZäöüßÄÖÜ]{4,})(\-| )" + reg
    step = re.sub(long_reg, swap_prefixed, step)

    # Remaining stand-alone generic words (after a space or at the start);
    # the callable keeps backslashes in `ingred` literal.
    step = re.sub(r"( |^)" + reg, lambda match: " " + ingred + end, step)

    if orig_step != step:
        res = True

    return step, res


def clean_steps(input_file='dataset_sep_sentences.json', output='dataset_clean_steps.json'):
    """Rewrite every recipe step so that ingredients appear under their full name.

    For each step the alternative words are normalised first
    (``replace_alt_words``). Then, for every ingredient (longest name first) and
    every "_"-separated part of its name, ``try_replace`` is attempted; if that
    finds nothing, a fallback search word or a category replacement is chosen
    and ``try_replace`` is retried with progressively looser settings. The
    progress counter is printed every 1000 recipes.

    :param input_file: JSON file below ``data_path`` to read; each recipe needs
        ``'instructions'`` and ``'ingredients'`` entries.
    :param output: JSON file below ``data_path`` that receives the dataset with
        the rewritten instructions.
    """
    print("Cleaning recipe steps")
    with open(data_path + input_file, "r") as steps_json_file:
        whole_dataset = json.load(steps_json_file)

    # (substring of the ingredient part, generic word searched for instead);
    # the first matching row wins.
    fallback_words = (
        ("brühe", "brühe"),
        ("mehl", "Mehl"),
        ("öl", "Öl"),
        ("Maiskörner", "Mais"),
        ("honig", "Honig"),
        ("tomate", "Tomate"),
    )
    # Keyword arguments of the successive try_replace attempts, strictest
    # first: exact, ignore case, longer words, longer words ignoring case.
    retry_modes = (
        {},
        {"caps": True},
        {"caps": False, "regex_short": False},
        {"caps": True, "regex_short": False},
    )

    for rec_count, recipe in enumerate(whole_dataset, start=1):
        if rec_count % 1000 == 0:
            print(rec_count)
        entry = whole_dataset[recipe]
        steps = entry['instructions']
        # Longest names first.
        ingredients = sorted(entry['ingredients'], key=len, reverse=True)

        new_steps = []
        for step in steps:
            # replace certain words (Schlagober, Eis gestoßen)
            step = replace_alt_words(step, ingredients)

            for ingred in ingredients:
                for ingr_part in ingred.split("_"):
                    repl = ingr_part
                    # Try replacing ingr_part with the whole ingredient
                    step, res = try_replace(repl, ingred, step)
                    if res:
                        continue

                    # make certain changes and try replacing again
                    fallback = next((word for needle, word in fallback_words if needle in ingr_part), None)
                    if fallback is not None:
                        repl = fallback
                    elif "Knoblauch" in ingr_part:
                        # Normalise the clove spellings matched by regex_list[4]
                        # and regex_list[5] to the form used by the ingredient.
                        garlic = 'Knoblauchzehe' if "Knoblauchzehe" in ingr_part else 'Knoblauch'
                        step = regex_list[4].sub(garlic, step)
                        step = regex_list[5].sub(garlic, step)
                    else:
                        # replace category words with exact ingredient
                        # (e.g. "Das Brot_Roggen ..." instead of "Das Brot ...")
                        step, res = replace_categories(step, ingred)

                    for mode in retry_modes:
                        if res:
                            break
                        step, res = try_replace(repl, ingred, step, **mode)

            new_steps.append(step)

        # NOTE: removing double occurrences of an ingredient within a step is
        # not implemented.
        entry['instructions'] = new_steps

    with open(data_path + output, "w") as dataset_json_file:
        json.dump(whole_dataset, dataset_json_file, ensure_ascii=False, indent=4)
    gc.collect()


def count_occurrances(amount, file_name='cleaned_steps_occurrance.json'):
    """Print all ingredients whose stored count is at most ``amount``.

    :param amount: inclusive upper bound for the count.
    :param file_name: JSON file below ``data_path`` mapping ingredient name to
        count.
    """
    with open(data_path + file_name, "r") as json_file:
        counts_by_ingredient = json.load(json_file)

    rare_total = 0
    for name, hits in counts_by_ingredient.items():
        if hits <= amount:
            rare_total += 1
            print(name)
    print(f"There are {rare_total} ingredients with up to {amount} occurrances")


# checks for words of certain categories (that are not in list yet) in instructions
# if they occur, put the recipe's data and the word into file
# user can then see which words should be put into list of replacement words
def check_word(word, ingr_list_file, input_file='dataset_sep_sentences.json', output='ingredients.txt'):
    """Report recipes that use a category word without a known category ingredient.

    A recipe is reported (appended to ``output``) at the first step that
    contains ``word`` if none of its ingredients is in the category's word list
    or equal to the bare category word. The report consists of the recipe name,
    its key, the step and those recipe ingredients that are keys of
    ``ingr_list_file``; nothing is written if there are no such ingredients.

    :param word: category trigger, e.g. ``"Käse "`` or ``"Brot."``; an unknown
        trigger prints an error message and returns.
    :param ingr_list_file: path of a JSON object whose keys are the ingredient
        names to list in the report (used as given, without ``data_path``).
    :param input_file: JSON dataset below ``data_path``.
    :param output: path of the text file the report is appended to (used as
        given, without ``data_path``).
    """
    # get list of all cleaned ingredients > 20
    with open(ingr_list_file, "r") as json_file:
        whole_word_list = json.load(json_file).keys()

    # (accepted triggers, category word list, bare category word)
    category_table = (
        (("Käse ", "Käse.", "Käsescheibe"), cheese, "Käse"),
        (("Milch ", "Milch."), milk, "Milch"),
        (("Fleisch ", "Fleisch."), meat, "Fleisch"),
        (("Wurst ", "Wurst."), sausage, "Wurst"),
        (("Würste ", "Würste."), sausage, "Würste"),
        (("Fisch ", "Fisch."), fish, "Fisch"),
        (("Nudeln ", "Nudeln."), pasta, "Nudeln"),
        (("Brot ", "Brot."), bread, "Brot"),
        (("Semmel ", "Semmel.", "Semmeln ", "Semmeln."), rolls, "Semmel"),
        (("Brötchen ", "Brötchen."), rolls, "Brötchen"),
    )
    # check for category
    for triggers, word_list, word_pure in category_table:
        if word in triggers:
            break
    else:
        print(Fore.LIGHTRED_EX + "This category can't be processed correctly!")
        return

    with open(data_path + input_file, "r") as steps_json_file:
        whole_dataset = json.load(steps_json_file)

    for recipe, entry in whole_dataset.items():
        # Becomes true once the recipe is covered by the category or has been
        # handled, so every recipe is reported at most once.
        settled = False
        for step in entry['instructions']:
            if word not in step:
                continue
            recipe_ingredients = entry['ingredients']
            if word_pure in recipe_ingredients or any(member in recipe_ingredients for member in word_list):
                settled = True
            if settled:
                continue
            with open(output, "a") as report_file:
                settled = True
                listed = [ingr for ingr in recipe_ingredients if ingr in whole_word_list]
                if listed:
                    report_file.write("\n" + entry['name'] + "\n")
                    report_file.write(recipe + "\n")
                    report_file.write(step + "\n")
                    for ingr in listed:
                        report_file.write(ingr + "\n")

def main():
    """Run the instruction-cleaning pipeline on the files below ``data_path``.

    1. Load ``dataset_cleaned_nice.json`` and split the instructions into
       sentences (writes ``dataset_sep_sentences.json``).
    2. Replace ingredient words in the steps (writes
       ``dataset_cleaned_steps.json``).
    3. Extract only the steps per recipe (writes
       ``cleaned_sep_sentences.json``).

    The commented-out calls at the end are optional analyses and test runs kept
    for manual use.
    """
    print("started")
    # separate sentences
    with open(data_path + "dataset_cleaned_nice.json", "r") as whole_cleaned_json_file:
        whole_dataset_cleaned = json.load(whole_cleaned_json_file)
    sep_sentences(whole_dataset_cleaned)

    clean_steps(input_file='dataset_sep_sentences.json', output='dataset_cleaned_steps.json')

    sep_sentences_file(input_file='dataset_cleaned_steps.json', output_file='cleaned_sep_sentences.json')

    # --- ingredient statistics ---
    # check_occurrances(step_file='cleaned_sep_sentences.json', out_file='cleaned_steps_occurrance.json')
    # count_occurrances(300)

    # --- look for category words that are not covered by the word lists yet ---
    # check_word("Käse ")
    # check_word("Käse.")
    # check_word("Käsescheibe")
    # check_word("Milch ")
    # check_word("Fleisch ", "data/meats.json")
    # check_word("Fleisch.", "data/meats.json")
    # check_word("Wurst ", "data/sausage.json")
    # check_word("Wurst.", "data/sausage.json")
    # check_word("Würste ", "data/sausage.json")
    # check_word("Würste.", "data/sausage.json")
    # check_word("Fisch ", "data/fish.json")
    # check_word("Fisch.", "data/fish.json")
    # check_word("Nudeln ", "data/pasta.json")
    # check_word("Nudeln.", "data/pasta.json")
    # check_word("Brot ", "data/bread.json")
    # check_word("Brot.", "data/bread.json")
    # check_word("Semmel ", "data/rolls.json")
    # check_word("Semmel.", "data/rolls.json")
    # check_word("Semmeln ", "data/rolls.json")
    # check_word("Semmeln.", "data/rolls.json")
    # check_word("Brötchen ", "data/rolls.json")
    # check_word("Brötchen.", "data/rolls.json")
    # check_word("test", "data/rolls.json")

    # output = replace_categories("Den Käse auf das Burger Brot legen", "Burgerbrötchen")
    # print(output)

    # --- test run on a small dataset ---
    # clean_steps(input='dataset_test.json', output='dataset_cleaned_steps_test.json')
    # sep_sentences_file(input_file='dataset_cleaned_steps_test.json', output_file='cleaned_sep_sentences_test.json')
    # check_occurrances(step_file='cleaned_sep_sentences_test.json', out_file='cleaned_steps_occurrance_test.json')

    # --- run with the default file names ---
    # clean_steps()
    # sep_sentences_file(input_file='dataset_clean_steps.json', output_file='cleaned_sep_sentences.json')
    # check_occurrances(step_file='cleaned_sep_sentences.json', out_file='cleaned_steps_occurrance.json')


# Run the pipeline only when the file is executed as a script, so that
# importing the helper functions from this module has no side effects.
if __name__ == "__main__":
    main()