374 lines
15 KiB
Python
374 lines
15 KiB
Python
from pathlib import Path
|
|
|
|
from transformers import AutoConfig, BertTokenizer, AutoModelForMaskedLM
|
|
import json
|
|
import statistics
|
|
from sklearn.model_selection import train_test_split
|
|
|
|
|
|
def make_dataset(tokenizer, data_path="data/", out_path="data/complete_dataset.json"):
    """Greedily pack recipe sentences into <=512-token datapoints (variant 1).

    Reads data_path + 'cleaned_sep_sentences_not_empty.json' (mapping of
    recipe id -> list of sentence strings), concatenates consecutive sentences
    of a recipe — each followed by " [SEP]" — until the BERT token budget is
    reached, writes the resulting {recipe: [datapoint, ...]} mapping to
    out_path as JSON, and prints token/sentence statistics.

    Args:
        tokenizer: BERT-style tokenizer providing .tokenize().
        data_path: directory containing the cleaned sentence file.
        out_path: destination JSON file for the packed datapoints.
    """
    # Left over from earlier debugging (see the commented-out check in the
    # loop below); stays empty on the current code path.
    weird_step = ""

    with open(data_path + 'cleaned_sep_sentences_not_empty.json', "r") as whole_sep_json_file:
        steps_dataset = json.load(whole_sep_json_file)

    all_datapoints = {}        # recipe id -> list of packed datapoint strings
    tokens_per_step = []       # token count of every individual sentence
    tokens_per_recipe = []     # total packed-token count per recipe
    sentences_per_recipe = []  # raw sentence count per recipe
    recipe_nr = 1
    for recipe in steps_dataset.keys():
        print(recipe_nr)  # progress output, one line per recipe
        recipe_nr += 1
        recipe_list = []
        # curr_step = "[CLS]"
        curr_step = ""
        # One special token ([CLS]) is budgeted per datapoint.
        nr_class_tokens = 1
        curr_step_len = nr_class_tokens
        nr_sent = 0
        sentences_per_recipe.append(len(steps_dataset[recipe]))
        nr_recipe_tokens = 0
        entered = False

        for step in steps_dataset[recipe]:
            entered = False
            # Tokenize with the special markers so step_len includes [CLS]/[SEP].
            step_tok = tokenizer.tokenize("[CLS] " + step + " [SEP]")  # + " SEP"
            step_len = len(step_tok)
            # if step_len <= nr_class_tokens:
            #     weird_step = step
            tokens_per_step.append(step_len)
            # add sentence to datapoint
            if curr_step_len + (step_len - nr_class_tokens - 1) < 513:  # actually 512
                curr_step += " " + step + " [SEP]"
                # -2 since not adding CLS and SEP
                curr_step_len += step_len - nr_class_tokens
                nr_sent += 1
                entered = False
            # add sentence to next datapoint
            else:
                # Datapoint is full: strip its trailing " [SEP]" (6 characters
                # / 1 token), store it, and start a new datapoint with the
                # sentence that did not fit.
                # curr_step += " [SEP]"
                curr_step = curr_step[:-6]
                curr_step_len -= 1
                recipe_list.append(curr_step)
                nr_recipe_tokens += curr_step_len
                curr_step = step
                # curr_step = "[CLS] " + step
                curr_step_len = step_len
                nr_sent = 1
                entered = True
        # Flush the last (partial) datapoint of the recipe.
        # NOTE(review): when the final sentence overflowed (entered == True),
        # the freshly started datapoint holding it is never appended here, so
        # that sentence is silently dropped — confirm whether this is intended.
        if not entered:
            # curr_step += " [SEP]"
            curr_step = curr_step[:-6]
            curr_step_len -= 1
            recipe_list.append(curr_step)
            nr_recipe_tokens += curr_step_len

        tokens_per_recipe.append(nr_recipe_tokens)
        all_datapoints[recipe] = recipe_list

    with open(out_path, "w") as whole_dataset:
        json.dump(all_datapoints, whole_dataset, ensure_ascii=False, indent=4)

    # Number of tokens in a single step/sentence
    tokens_per_step.sort()
    print("Smallest amount of tokens in a step: " + str(tokens_per_step[0]))
    print("Largest amount of tokens in a step: " + str(tokens_per_step[len(tokens_per_step)-1]))
    print("Average amount of tokens in a step: " + str(statistics.mean(tokens_per_step)))
    print("Median amount of tokens in a step: " + str(statistics.median(tokens_per_step)))
    # print(tokens_per_step)

    # Number of tokens in a whole recipe
    tokens_per_recipe.sort()
    print("Smallest amount of tokens in a recipe: " + str(tokens_per_recipe[0]))
    print("Largest amount of tokens in a recipe: " + str(tokens_per_recipe[len(tokens_per_recipe)-1]))
    print("Average amount of tokens in a recipe: " + str(statistics.mean(tokens_per_recipe)))
    print("Median amount of tokens in a recipe: " + str(statistics.median(tokens_per_recipe)))
    # print(tokens_per_recipe)

    # Number of sentences in a whole recipe
    sentences_per_recipe.sort()
    print("Smallest number of sentences in a recipe: " + str(sentences_per_recipe[0]))
    print("Largest number of sentences in a recipe: " + str(sentences_per_recipe[len(sentences_per_recipe)-1]))
    print("Average number of sentences in a recipe: " + str(statistics.mean(sentences_per_recipe)))
    print("Median number of sentences in a recipe: " + str(statistics.median(sentences_per_recipe)))
    # print(sentences_per_recipe)

    print(weird_step)
|
|
|
|
|
|
def make_dataset2(tokenizer, data_path="data/", out_path="data/complete_dataset.json"):
    """Greedily pack recipe sentences into <=512-token datapoints (variant 2).

    Like make_dataset, but sentences inside a datapoint are joined with plain
    spaces (no explicit " [SEP]" markers) and both special tokens ([CLS] and
    [SEP]) are budgeted per datapoint. Writes {recipe: [datapoint, ...]} to
    out_path as JSON and prints token/sentence statistics.

    Args:
        tokenizer: BERT-style tokenizer providing .tokenize().
        data_path: directory containing the cleaned sentence file.
        out_path: destination JSON file for the packed datapoints.
    """
    with open(data_path + 'cleaned_sep_sentences_not_empty.json', "r") as whole_sep_json_file:
        steps_dataset = json.load(whole_sep_json_file)

    all_datapoints = {}        # recipe id -> list of packed datapoint strings
    tokens_per_step = []       # token count of every individual sentence
    tokens_per_recipe = []     # total packed-token count per recipe
    sentences_per_recipe = []  # raw sentence count per recipe
    recipe_nr = 1
    for recipe in steps_dataset.keys():
        print(recipe_nr)  # progress output, one line per recipe
        recipe_nr += 1
        recipe_list = []
        # curr_step = "[CLS]"
        curr_step = ""
        # Two special tokens ([CLS] and [SEP]) are budgeted per datapoint.
        nr_class_tokens = 2
        curr_step_len = nr_class_tokens
        nr_sent = 0
        sentences_per_recipe.append(len(steps_dataset[recipe]))
        nr_recipe_tokens = 0
        entered = False

        for step in steps_dataset[recipe]:
            entered = False
            # Tokenize with the special markers so step_len includes [CLS]/[SEP].
            step_tok = tokenizer.tokenize("[CLS] " + step + " [SEP]")  # + " SEP"
            step_len = len(step_tok)
            # if step_len <= nr_class_tokens:
            #     weird_step = step
            tokens_per_step.append(step_len)
            # add sentence to datapoint
            if curr_step_len + (step_len - nr_class_tokens) < 513:  # actually 512
                # NOTE(review): since curr_step starts as "", the first
                # datapoint of each recipe begins with a leading space.
                curr_step += " " + step
                # -2 since not adding CLS and SEP
                curr_step_len += step_len - nr_class_tokens
                nr_sent += 1
                entered = False
            # add sentence to next datapoint
            else:
                # Datapoint is full: store it and start a new one with the
                # sentence that did not fit.
                # curr_step += " [SEP]"
                recipe_list.append(curr_step)
                nr_recipe_tokens += curr_step_len
                curr_step = step
                # curr_step = "[CLS] " + step
                curr_step_len = step_len
                nr_sent = 1
                entered = True
        # Flush the last (partial) datapoint of the recipe.
        # NOTE(review): when the final sentence overflowed (entered == True),
        # the freshly started datapoint holding it is never appended here, so
        # that sentence is silently dropped — confirm whether this is intended.
        if not entered:
            # curr_step += " [SEP]"
            recipe_list.append(curr_step)
            nr_recipe_tokens += curr_step_len

        tokens_per_recipe.append(nr_recipe_tokens)
        all_datapoints[recipe] = recipe_list

    with open(out_path, "w") as whole_dataset:
        json.dump(all_datapoints, whole_dataset, ensure_ascii=False, indent=4)

    # Number of tokens in a single step/sentence
    tokens_per_step.sort()
    print("Smallest amount of tokens in a step: " + str(tokens_per_step[0]))
    print("Largest amount of tokens in a step: " + str(tokens_per_step[len(tokens_per_step)-1]))
    print("Average amount of tokens in a step: " + str(statistics.mean(tokens_per_step)))
    print("Median amount of tokens in a step: " + str(statistics.median(tokens_per_step)))
    # print(tokens_per_step)

    # Number of tokens in a whole recipe
    tokens_per_recipe.sort()
    print("Smallest amount of tokens in a recipe: " + str(tokens_per_recipe[0]))
    print("Largest amount of tokens in a recipe: " + str(tokens_per_recipe[len(tokens_per_recipe)-1]))
    print("Average amount of tokens in a recipe: " + str(statistics.mean(tokens_per_recipe)))
    print("Median amount of tokens in a recipe: " + str(statistics.median(tokens_per_recipe)))
    # print(tokens_per_recipe)

    # Number of sentences in a whole recipe
    sentences_per_recipe.sort()
    print("Smallest number of sentences in a recipe: " + str(sentences_per_recipe[0]))
    print("Largest number of sentences in a recipe: " + str(sentences_per_recipe[len(sentences_per_recipe)-1]))
    print("Average number of sentences in a recipe: " + str(statistics.mean(sentences_per_recipe)))
    print("Median number of sentences in a recipe: " + str(statistics.median(sentences_per_recipe)))
    # print(sentences_per_recipe)
|
|
|
|
|
|
def make_dataset3(tokenizer, data_path="data/", out_path="data/complete_dataset.json"):
    """Build one datapoint per sentence, merging short splitter fragments (variant 3).

    Sentences of <= 5 characters are treated as artifacts of the sentence
    splitter rather than real steps: if the previous sentence already ended in
    '.' or '!', the fragment is held back and prepended to the next full
    sentence; otherwise it is appended to the previously stored datapoint.
    Writes {recipe: [datapoint, ...]} to out_path as JSON and prints
    token/sentence statistics.

    Args:
        tokenizer: BERT-style tokenizer providing .tokenize().
        data_path: directory containing the cleaned sentence file.
        out_path: destination JSON file for the per-sentence datapoints.
    """
    with open(data_path + 'cleaned_sep_sentences_not_empty.json', "r") as whole_sep_json_file:
        steps_dataset = json.load(whole_sep_json_file)

    all_datapoints = {}        # recipe id -> list of datapoint strings
    tokens_per_step = []       # token count per stored datapoint
    tokens_per_recipe = []     # total token count per recipe
    sentences_per_recipe = []  # see NOTE(review) below: mixes two measures
    recipe_nr = 1

    for recipe in steps_dataset.keys():
        print(recipe_nr)  # progress output, one line per recipe
        recipe_nr += 1
        recipe_list = []
        nr_sent = 0
        # NOTE(review): the raw sentence count is appended here AND the merged
        # count (nr_sent) is appended after the loop, so sentences_per_recipe
        # receives two entries per recipe — the printed statistics mix both.
        sentences_per_recipe.append(len(steps_dataset[recipe]))
        nr_recipe_tokens = 0
        add_part = False    # True while a short fragment is pending in prev_sentence
        prev_sentence = ""

        for step in steps_dataset[recipe]:
            if len(step) <= 5:
                # Very short "sentence": a splitter fragment, not a real step.
                # Condition: previous sentence ended with '.'/'!' (optionally
                # followed by a space) and is not itself just a terminator.
                if ((len(prev_sentence) > 0 and (prev_sentence[-1] == "." or prev_sentence[-1] == "!")) or (len(prev_sentence) > 1 and (prev_sentence[-2:] == ". " or prev_sentence[-2:] == "! "))) and not (prev_sentence == "." or prev_sentence == "!" or prev_sentence == ". " or prev_sentence == "! "):
                    # Previous sentence ended properly -> hold this fragment to
                    # prepend it to the next full sentence.
                    if add_part:
                        # NOTE(review): this accumulation of consecutive
                        # fragments is immediately overwritten by the
                        # unconditional `prev_sentence = step` below — confirm
                        # the intended behavior for back-to-back fragments.
                        prev_sentence += " " + step
                    add_part = True
                else:
                    # Previous sentence is unfinished -> glue the fragment onto
                    # the last stored datapoint (or open the list with it).
                    if not len(recipe_list) == 0:
                        recipe_list[len(recipe_list)-1] += " " + step
                    else:
                        recipe_list.append(step)
                    add_part = False
                prev_sentence = step
            else:
                if add_part:
                    # Prefix the pending fragment to this full sentence.
                    curr_step = prev_sentence + " " + step
                    add_part = False
                else:
                    curr_step = step
                recipe_list.append(curr_step)
                # Token count includes the [CLS]/[SEP] special markers.
                step_tok = tokenizer.tokenize("[CLS] " + curr_step + " [SEP]")
                step_len = len(step_tok)
                nr_recipe_tokens += step_len
                prev_sentence = step
                tokens_per_step.append(step_len)
                nr_sent += 1
        # NOTE(review): a fragment still pending (add_part == True) when the
        # recipe ends is discarded — no flush happens here.
        sentences_per_recipe.append(nr_sent)
        all_datapoints[recipe] = recipe_list
        tokens_per_recipe.append(nr_recipe_tokens)

    with open(out_path, "w") as whole_dataset:
        json.dump(all_datapoints, whole_dataset, ensure_ascii=False, indent=4)

    # Number of tokens in a single step/sentence
    tokens_per_step.sort()
    print("Smallest amount of tokens in a step: " + str(tokens_per_step[0]))
    print("Largest amount of tokens in a step: " + str(tokens_per_step[len(tokens_per_step)-1]))
    print("Average amount of tokens in a step: " + str(statistics.mean(tokens_per_step)))
    print("Median amount of tokens in a step: " + str(statistics.median(tokens_per_step)))
    # print(tokens_per_step)

    # Number of tokens in a whole recipe
    tokens_per_recipe.sort()
    print("Smallest amount of tokens in a recipe: " + str(tokens_per_recipe[0]))
    print("Largest amount of tokens in a recipe: " + str(tokens_per_recipe[len(tokens_per_recipe)-1]))
    print("Average amount of tokens in a recipe: " + str(statistics.mean(tokens_per_recipe)))
    print("Median amount of tokens in a recipe: " + str(statistics.median(tokens_per_recipe)))
    # print(tokens_per_recipe)

    # Number of sentences in a whole recipe
    sentences_per_recipe.sort()
    print("Smallest number of sentences in a recipe: " + str(sentences_per_recipe[0]))
    print("Largest number of sentences in a recipe: " + str(sentences_per_recipe[len(sentences_per_recipe)-1]))
    print("Average number of sentences in a recipe: " + str(statistics.mean(sentences_per_recipe)))
    print("Median number of sentences in a recipe: " + str(statistics.median(sentences_per_recipe)))
    # print(sentences_per_recipe)
|
|
|
|
|
|
def check_dataset(tokenizer, file_path="data/complete_dataset.json"):
    """Re-tokenize every datapoint of a packed dataset file and print token statistics.

    Args:
        tokenizer: BERT-style tokenizer providing .tokenize().
        file_path: JSON file mapping recipe id -> list of datapoint strings.
    """
    with open(file_path, "r") as handle:
        dataset = json.load(handle)

    # Token count of each datapoint across all recipes.
    tokens_per_step = []

    for recipe_nr, datapoints in enumerate(dataset.values(), start=1):
        if recipe_nr % 10000 == 0:
            print(recipe_nr)  # progress output on large datasets
        for datapoint in datapoints:
            tokens_per_step.append(len(tokenizer.tokenize(datapoint)))

    # Number of tokens in a single step/sentence
    ordered = sorted(tokens_per_step)
    print("Smallest amount of tokens in a step: " + str(ordered[0]))
    print("Largest amount of tokens in a step: " + str(ordered[-1]))
    print("Average amount of tokens in a step: " + str(statistics.mean(ordered)))
    print("Median amount of tokens in a step: " + str(statistics.median(ordered)))
|
|
|
|
|
|
def format_dataset(input_path, out_path="data/model_datapoints.txt"):
    """Flatten a {recipe: [datapoint, ...]} JSON file into one datapoint per line.

    Recipe order and datapoint order are preserved; since each recipe's
    datapoints are newline-joined and the recipes are newline-joined in turn,
    the output is simply every datapoint on its own line.

    Args:
        input_path: source JSON file (recipe id -> list of datapoint strings).
        out_path: destination plain-text file.
    """
    with open(input_path, "r") as src:
        dataset = json.load(src)
    chunks = ["\n".join(datapoints) for datapoints in dataset.values()]
    with open(out_path, "w") as dst:
        dst.write("\n".join(chunks))
|
|
|
|
|
|
def extract_instructions_from_recipes(needed_recipes):
    """Flatten a list of recipes into a single flat list of instructions.

    Args:
        needed_recipes: iterable of recipes, each itself an iterable of
            instruction strings.

    Returns:
        list: all instructions in original order (recipe order, then
        instruction order within each recipe).
    """
    # Idiomatic flatten; replaces the manual nested append loop.
    return [instruction for recipe in needed_recipes for instruction in recipe]
|
|
|
|
|
|
|
|
def split_dataset(input_path, training_path, testing_path):
    """Split the packed dataset into train/test instruction files (99% / 1%).

    Recipes (not individual instructions) are shuffled and split, then each
    side is flattened to one instruction per line. The fixed random_state
    keeps the split reproducible across runs.

    Args:
        input_path: JSON file mapping recipe id -> list of datapoint strings.
        training_path: destination text file for training instructions.
        testing_path: destination text file for testing instructions.
    """
    with open(input_path, "r") as f:
        whole_dataset = json.load(f)

    # Each value is one recipe: a list of datapoint strings.
    all_recipes = list(whole_dataset.values())

    train_recipes, test_recipes = train_test_split(all_recipes, test_size=0.01, shuffle=True, random_state=42)

    train_instructions = extract_instructions_from_recipes(train_recipes)
    test_instructions = extract_instructions_from_recipes(test_recipes)

    print(f'Train Instructions: {len(train_instructions)}\n'
          f'Test Instructions: {len(test_instructions)}')

    with open(training_path, "w") as f:
        f.write('\n'.join(train_instructions))

    with open(testing_path, "w") as f:
        f.write('\n'.join(test_instructions))
|
|
|
|
def make_complete_dataset(whole_dataset_path, steps_path, out_path):
    """Attach the regrouped datapoints to the full recipe dataset.

    For every recipe in the full dataset, the entry's 'instructions' field is
    set (or overwritten) with that recipe's datapoint list from steps_path;
    the merged result is written to out_path. Raises KeyError if a recipe id
    is missing from the steps file.

    Args:
        whole_dataset_path: JSON file of recipe id -> recipe record (dict).
        steps_path: JSON file of recipe id -> list of datapoint strings.
        out_path: destination JSON file for the merged dataset.
    """
    with open(whole_dataset_path, "r") as handle:
        whole_dataset = json.load(handle)

    with open(steps_path, "r") as handle:
        all_steps = json.load(handle)

    for recipe_id, record in whole_dataset.items():
        record['instructions'] = all_steps[recipe_id]

    with open(out_path, "w") as handle:
        json.dump(whole_dataset, handle, ensure_ascii=False, indent=4)
|
|
|
|
def main():
    """Run the full dataset-building pipeline with the project tokenizer.

    Pipeline (paths are relative to the repository root):
      1. build datapoints from the cleaned sentences (make_dataset3),
      2. print token statistics for the result (check_dataset),
      3. dump the datapoints as a flat text file (format_dataset),
      4. attach the datapoints to the full recipe dataset (make_complete_dataset),
      5. split into training/testing instruction files (split_dataset).
    """
    vocab_path = "train_model/vocab/"
    data_path = "data/"
    cache_dir = None
    # Base German BERT configuration; the model below is loaded with it only
    # so its embeddings can be resized to the custom vocabulary.
    config = AutoConfig.from_pretrained("bert-base-german-cased", cache_dir=cache_dir)

    # Custom vocabulary plus ingredient tokens that must never be split.
    with open(vocab_path + "used_ingredients.json", "r") as used_ingredients_file:
        used_ingredients = json.load(used_ingredients_file)
    tokenizer = BertTokenizer(vocab_file=vocab_path + 'vocab.txt', do_lower_case=False, max_len=512, never_split=used_ingredients)

    model = AutoModelForMaskedLM.from_pretrained(
        "bert-base-german-cased",
        # Always False for this fixed model name; kept from a generic template.
        from_tf=bool(".ckpt" in "bert-base-german-cased"),
        config=config,
        cache_dir=cache_dir,
    )

    # NOTE(review): the resized model is neither used nor saved anywhere in
    # this script — presumably copied from a training script; confirm whether
    # loading it here can be dropped.
    model.resize_token_embeddings(len(tokenizer))

    revised_dataset_path = data_path + "complete_dataset.json"

    # combine sentences into datapoints
    ## ADAPT WHICH DATASET VERSION TO MAKE!
    make_dataset3(tokenizer, out_path=revised_dataset_path)

    # get statistics for datapoints
    check_dataset(tokenizer, file_path=revised_dataset_path)

    # make list of datapoints
    format_dataset(input_path=revised_dataset_path, out_path="data/model_datapoints.txt")

    # change dataset to have new datapoints as ingredients
    make_complete_dataset(whole_dataset_path=data_path+"dataset_cleaned_steps_not_empty.json", steps_path=revised_dataset_path, out_path=data_path+"full_dataset.json")

    split_dataset(revised_dataset_path, data_path + "training_data.txt", data_path + "testing_data.txt")
|
|
|
|
|
|
# Script entry point: run the whole dataset-building pipeline.
if __name__ == '__main__':
    main()
|