initial commit of project

This commit is contained in:
2021-04-11 19:51:12 +02:00
commit a21a8186d9
110 changed files with 16326178 additions and 0 deletions

View File

@@ -0,0 +1,373 @@
from pathlib import Path
from transformers import AutoConfig, BertTokenizer, AutoModelForMaskedLM
import json
import statistics
from sklearn.model_selection import train_test_split
def make_dataset(tokenizer, data_path="data/", out_path="data/complete_dataset.json"):
    """Greedily pack each recipe's sentences into "[SEP]"-joined datapoints under a 513-token budget.

    Variant 1: every sentence appended to a datapoint is followed by " [SEP]";
    when a datapoint is finished, its trailing " [SEP]" (6 characters) is
    sliced off again. Writes a mapping recipe-id -> list of datapoint strings
    to *out_path* and prints token/sentence statistics.

    Parameters:
        tokenizer: BERT-style tokenizer exposing ``tokenize(text) -> list[str]``.
        data_path: directory containing ``cleaned_sep_sentences_not_empty.json``
            (mapping recipe-id -> list of sentence strings).
        out_path: destination JSON file for the packed dataset.
    """
    weird_step = ""  # scratch slot for the commented-out suspicious-step probe below
    with open(data_path + 'cleaned_sep_sentences_not_empty.json', "r") as whole_sep_json_file:
        steps_dataset = json.load(whole_sep_json_file)
    all_datapoints = {}       # recipe id -> list of packed datapoint strings
    tokens_per_step = []      # token count of every individual sentence
    tokens_per_recipe = []    # total token count per recipe
    sentences_per_recipe = [] # raw sentence count per recipe
    recipe_nr = 1
    for recipe in steps_dataset.keys():
        print(recipe_nr)  # progress indicator
        recipe_nr += 1
        recipe_list = []
        # curr_step = "[CLS]"
        curr_step = ""
        nr_class_tokens = 1  # only [CLS] counted as per-datapoint overhead in this variant
        curr_step_len = nr_class_tokens
        nr_sent = 0
        sentences_per_recipe.append(len(steps_dataset[recipe]))
        nr_recipe_tokens = 0
        entered = False  # True iff the LAST sentence overflowed and started a fresh datapoint
        for step in steps_dataset[recipe]:
            entered = False
            step_tok = tokenizer.tokenize("[CLS] " + step + " [SEP]")  # + " SEP"
            step_len = len(step_tok)
            # if step_len <= nr_class_tokens:
            # weird_step = step
            tokens_per_step.append(step_len)
            # add sentence to datapoint
            if curr_step_len + (step_len - nr_class_tokens - 1) < 513:  # actually 512
                curr_step += " " + step + " [SEP]"
                # -2 since not adding CLS and SEP
                curr_step_len += step_len - nr_class_tokens
                nr_sent += 1
                entered = False
            # add sentence to next datapoint
            else:
                # curr_step += " [SEP]"
                curr_step = curr_step[:-6]  # drop the trailing " [SEP]" (6 chars)
                curr_step_len -= 1
                recipe_list.append(curr_step)
                nr_recipe_tokens += curr_step_len
                # NOTE(review): if curr_step was still empty (first sentence already
                # over budget) this appends an empty-string datapoint — confirm intended.
                curr_step = step
                # curr_step = "[CLS] " + step
                curr_step_len = step_len
                nr_sent = 1
                entered = True
        # Flush the still-open datapoint of this recipe.
        # NOTE(review): if the LAST sentence overflowed (entered == True), the fresh
        # datapoint holding it is never flushed and is silently dropped — verify.
        if not entered:
            # curr_step += " [SEP]"
            curr_step = curr_step[:-6]
            curr_step_len -= 1
            recipe_list.append(curr_step)
            nr_recipe_tokens += curr_step_len
        tokens_per_recipe.append(nr_recipe_tokens)
        all_datapoints[recipe] = recipe_list
    with open(out_path, "w") as whole_dataset:
        json.dump(all_datapoints, whole_dataset, ensure_ascii=False, indent=4)
    # Number of tokens in a single step/sentence
    tokens_per_step.sort()
    print("Smallest amount of tokens in a step: " + str(tokens_per_step[0]))
    print("Largest amount of tokens in a step: " + str(tokens_per_step[len(tokens_per_step)-1]))
    print("Average amount of tokens in a step: " + str(statistics.mean(tokens_per_step)))
    print("Median amount of tokens in a step: " + str(statistics.median(tokens_per_step)))
    # print(tokens_per_step)
    tokens_per_recipe.sort()
    print("Smallest amount of tokens in a recipe: " + str(tokens_per_recipe[0]))
    print("Largest amount of tokens in a recipe: " + str(tokens_per_recipe[len(tokens_per_recipe)-1]))
    print("Average amount of tokens in a recipe: " + str(statistics.mean(tokens_per_recipe)))
    print("Median amount of tokens in a recipe: " + str(statistics.median(tokens_per_recipe)))
    # print(tokens_per_recipe)
    sentences_per_recipe.sort()
    print("Smallest number of sentences in a recipe: " + str(sentences_per_recipe[0]))
    print("Largest number of sentences in a recipe: " + str(sentences_per_recipe[len(sentences_per_recipe)-1]))
    print("Average number of sentences in a recipe: " + str(statistics.mean(sentences_per_recipe)))
    print("Median number of sentences in a recipe: " + str(statistics.median(sentences_per_recipe)))
    # print(sentences_per_recipe)
    print(weird_step)
def make_dataset2(tokenizer, data_path="data/", out_path="data/complete_dataset.json"):
    """Greedily pack each recipe's sentences into plain space-joined datapoints under a 513-token budget.

    Variant 2: like :func:`make_dataset`, but sentences are joined with a
    plain space (no intermediate " [SEP]") and both [CLS] and [SEP] are
    counted as per-datapoint overhead (``nr_class_tokens = 2``). Writes a
    mapping recipe-id -> list of datapoint strings to *out_path* and prints
    token/sentence statistics.

    Parameters:
        tokenizer: BERT-style tokenizer exposing ``tokenize(text) -> list[str]``.
        data_path: directory containing ``cleaned_sep_sentences_not_empty.json``.
        out_path: destination JSON file for the packed dataset.
    """
    with open(data_path + 'cleaned_sep_sentences_not_empty.json', "r") as whole_sep_json_file:
        steps_dataset = json.load(whole_sep_json_file)
    all_datapoints = {}       # recipe id -> list of packed datapoint strings
    tokens_per_step = []      # token count of every individual sentence
    tokens_per_recipe = []    # total token count per recipe
    sentences_per_recipe = [] # raw sentence count per recipe
    recipe_nr = 1
    for recipe in steps_dataset.keys():
        print(recipe_nr)  # progress indicator
        recipe_nr += 1
        recipe_list = []
        # curr_step = "[CLS]"
        curr_step = ""
        nr_class_tokens = 2  # [CLS] and [SEP] are both counted as overhead here
        curr_step_len = nr_class_tokens
        nr_sent = 0
        sentences_per_recipe.append(len(steps_dataset[recipe]))
        nr_recipe_tokens = 0
        entered = False  # True iff the LAST sentence overflowed and started a fresh datapoint
        for step in steps_dataset[recipe]:
            entered = False
            step_tok = tokenizer.tokenize("[CLS] " + step + " [SEP]")  # + " SEP"
            step_len = len(step_tok)
            # if step_len <= nr_class_tokens:
            # weird_step = step
            tokens_per_step.append(step_len)
            # add sentence to datapoint
            if curr_step_len + (step_len - nr_class_tokens) < 513:  # actually 512
                curr_step += " " + step
                # -2 since not adding CLS and SEP
                curr_step_len += step_len - nr_class_tokens
                nr_sent += 1
                entered = False
            # add sentence to next datapoint
            else:
                # curr_step += " [SEP]"
                recipe_list.append(curr_step)
                nr_recipe_tokens += curr_step_len
                curr_step = step
                # curr_step = "[CLS] " + step
                curr_step_len = step_len
                nr_sent = 1
                entered = True
        # Flush the still-open datapoint of this recipe.
        # NOTE(review): if the LAST sentence overflowed (entered == True), the fresh
        # datapoint holding it is never flushed and is silently dropped — verify.
        if not entered:
            # curr_step += " [SEP]"
            recipe_list.append(curr_step)
            nr_recipe_tokens += curr_step_len
        tokens_per_recipe.append(nr_recipe_tokens)
        all_datapoints[recipe] = recipe_list
    with open(out_path, "w") as whole_dataset:
        json.dump(all_datapoints, whole_dataset, ensure_ascii=False, indent=4)
    # Number of tokens in a single step/sentence
    tokens_per_step.sort()
    print("Smallest amount of tokens in a step: " + str(tokens_per_step[0]))
    print("Largest amount of tokens in a step: " + str(tokens_per_step[len(tokens_per_step)-1]))
    print("Average amount of tokens in a step: " + str(statistics.mean(tokens_per_step)))
    print("Median amount of tokens in a step: " + str(statistics.median(tokens_per_step)))
    # print(tokens_per_step)
    tokens_per_recipe.sort()
    print("Smallest amount of tokens in a recipe: " + str(tokens_per_recipe[0]))
    print("Largest amount of tokens in a recipe: " + str(tokens_per_recipe[len(tokens_per_recipe)-1]))
    print("Average amount of tokens in a recipe: " + str(statistics.mean(tokens_per_recipe)))
    print("Median amount of tokens in a recipe: " + str(statistics.median(tokens_per_recipe)))
    # print(tokens_per_recipe)
    sentences_per_recipe.sort()
    print("Smallest number of sentences in a recipe: " + str(sentences_per_recipe[0]))
    print("Largest number of sentences in a recipe: " + str(sentences_per_recipe[len(sentences_per_recipe)-1]))
    print("Average number of sentences in a recipe: " + str(statistics.mean(sentences_per_recipe)))
    print("Median number of sentences in a recipe: " + str(statistics.median(sentences_per_recipe)))
    # print(sentences_per_recipe)
def make_dataset3(tokenizer, data_path="data/", out_path="data/complete_dataset.json"):
    """Re-segment recipe sentences, gluing very short fragments (<= 5 chars) onto a neighbour.

    Variant 3: every (merged) sentence becomes its own datapoint. Fragments of
    at most 5 characters — presumably sentence-splitter artefacts — are merged
    onto the FOLLOWING sentence when the previous sentence looked properly
    terminated ("." / "!"), otherwise onto the PREVIOUS datapoint. Writes a
    mapping recipe-id -> list of datapoint strings to *out_path* and prints
    token/sentence statistics.

    Parameters:
        tokenizer: BERT-style tokenizer exposing ``tokenize(text) -> list[str]``.
        data_path: directory containing ``cleaned_sep_sentences_not_empty.json``.
        out_path: destination JSON file for the re-segmented dataset.
    """
    with open(data_path + 'cleaned_sep_sentences_not_empty.json', "r") as whole_sep_json_file:
        steps_dataset = json.load(whole_sep_json_file)
    all_datapoints = {}       # recipe id -> list of datapoint strings
    tokens_per_step = []      # token count of every emitted datapoint
    tokens_per_recipe = []    # total token count per recipe
    # NOTE(review): sentences_per_recipe receives TWO entries per recipe below
    # (raw count AND merged count), so its statistics mix both — confirm intended.
    sentences_per_recipe = []
    recipe_nr = 1
    for recipe in steps_dataset.keys():
        print(recipe_nr)  # progress indicator
        recipe_nr += 1
        recipe_list = []
        nr_sent = 0
        sentences_per_recipe.append(len(steps_dataset[recipe]))
        nr_recipe_tokens = 0
        add_part = False   # True while a short fragment is pending to prepend to the next sentence
        prev_sentence = ""
        for step in steps_dataset[recipe]:
            if len(step) <= 5:
                # Previous sentence ended with "."/"!" (possibly followed by a space)
                # and is not itself just punctuation: the fragment starts something
                # new, so hold it for prepending to the next full sentence.
                if ((len(prev_sentence) > 0 and (prev_sentence[-1] == "." or prev_sentence[-1] == "!")) or (len(prev_sentence) > 1 and (prev_sentence[-2:] == ". " or prev_sentence[-2:] == "! "))) and not (prev_sentence == "." or prev_sentence == "!" or prev_sentence == ". " or prev_sentence == "! "):
                    if add_part:
                        prev_sentence += " " + step  # extend the already-pending fragment
                    add_part = True
                    # NOTE(review): when no fragment was pending, `step` is never stored
                    # in prev_sentence here, so the next full sentence gets the PREVIOUS
                    # sentence prepended instead of the fragment — verify against intent.
                else:
                    # Previous sentence looks unterminated: glue the fragment onto the
                    # most recent datapoint instead.
                    if not len(recipe_list) == 0:
                        recipe_list[len(recipe_list)-1] += " " + step
                    else:
                        recipe_list.append(step)
                    add_part = False
                    prev_sentence = step
            else:
                # Normal-length sentence: emit it (with any pending fragment) as a datapoint.
                if add_part:
                    curr_step = prev_sentence + " " + step
                    add_part = False
                else:
                    curr_step = step
                recipe_list.append(curr_step)
                step_tok = tokenizer.tokenize("[CLS] " + curr_step + " [SEP]")
                step_len = len(step_tok)
                nr_recipe_tokens += step_len
                prev_sentence = step
                tokens_per_step.append(step_len)
                nr_sent += 1
        # NOTE(review): a fragment still pending (add_part True) at recipe end is dropped — confirm.
        sentences_per_recipe.append(nr_sent)
        all_datapoints[recipe] = recipe_list
        tokens_per_recipe.append(nr_recipe_tokens)
    with open(out_path, "w") as whole_dataset:
        json.dump(all_datapoints, whole_dataset, ensure_ascii=False, indent=4)
    # Number of tokens in a single step/sentence
    tokens_per_step.sort()
    print("Smallest amount of tokens in a step: " + str(tokens_per_step[0]))
    print("Largest amount of tokens in a step: " + str(tokens_per_step[len(tokens_per_step)-1]))
    print("Average amount of tokens in a step: " + str(statistics.mean(tokens_per_step)))
    print("Median amount of tokens in a step: " + str(statistics.median(tokens_per_step)))
    # print(tokens_per_step)
    tokens_per_recipe.sort()
    print("Smallest amount of tokens in a recipe: " + str(tokens_per_recipe[0]))
    print("Largest amount of tokens in a recipe: " + str(tokens_per_recipe[len(tokens_per_recipe)-1]))
    print("Average amount of tokens in a recipe: " + str(statistics.mean(tokens_per_recipe)))
    print("Median amount of tokens in a recipe: " + str(statistics.median(tokens_per_recipe)))
    # print(tokens_per_recipe)
    sentences_per_recipe.sort()
    print("Smallest number of sentences in a recipe: " + str(sentences_per_recipe[0]))
    print("Largest number of sentences in a recipe: " + str(sentences_per_recipe[len(sentences_per_recipe)-1]))
    print("Average number of sentences in a recipe: " + str(statistics.mean(sentences_per_recipe)))
    print("Median number of sentences in a recipe: " + str(statistics.median(sentences_per_recipe)))
    # print(sentences_per_recipe)
def check_dataset(tokenizer, file_path="data/complete_dataset.json"):
    """Print min/max/mean/median token counts over all datapoints in a dataset JSON file.

    Parameters:
        tokenizer: BERT-style tokenizer exposing ``tokenize(text) -> list[str]``.
        file_path: JSON file mapping recipe-id -> list of datapoint strings.
    """
    with open(file_path, "r") as dataset_file:
        dataset = json.load(dataset_file)
    step_lengths = []
    # Walk every datapoint of every recipe, logging progress every 10k recipes.
    for idx, recipe_key in enumerate(dataset, start=1):
        if idx % 10000 == 0:
            print(idx)
        for datapoint in dataset[recipe_key]:
            step_lengths.append(len(tokenizer.tokenize(datapoint)))
    # Number of tokens in a single step/sentence
    step_lengths.sort()
    print("Smallest amount of tokens in a step: " + str(step_lengths[0]))
    print("Largest amount of tokens in a step: " + str(step_lengths[len(step_lengths) - 1]))
    print("Average amount of tokens in a step: " + str(statistics.mean(step_lengths)))
    print("Median amount of tokens in a step: " + str(statistics.median(step_lengths)))
def format_dataset(input_path, out_path="data/model_datapoints.txt"):
    """Flatten a recipe->datapoints JSON into one plain-text file, one datapoint per line.

    Parameters:
        input_path: JSON file mapping recipe-id -> list of datapoint strings.
        out_path: destination text file; recipes are written back to back.
    """
    with open(input_path, "r") as src_file:
        dataset = json.load(src_file)
    # One newline-joined chunk per recipe, then join the chunks with newlines too.
    recipe_chunks = ["\n".join(dataset[recipe_key]) for recipe_key in dataset.keys()]
    with open(out_path, "w") as dst_file:
        dst_file.write("\n".join(recipe_chunks))
def extract_instructions_from_recipes(needed_recipes):
    """Flatten a list of recipes into a single list of instructions.

    Idiom fix: replaces the manual nested append loop with a flattening
    comprehension; order and content are identical to the original.

    Parameters:
        needed_recipes: iterable of recipes, each an iterable of instruction strings.

    Returns:
        list: all instructions of all recipes, in recipe order.
    """
    return [instruction for recipe in needed_recipes for instruction in recipe]
def split_dataset(input_path, training_path, testing_path):
    """Split the packed dataset into training/testing instruction files (99% / 1%).

    Uses a fixed random_state so the shuffle is reproducible across runs.

    Parameters:
        input_path: JSON file mapping recipe-id -> list of instruction strings.
        training_path: destination text file for the training instructions.
        testing_path: destination text file for the testing instructions.
    """
    with open(input_path, "r") as dataset_file:
        whole_dataset = json.load(dataset_file)
    # Recipes in insertion order, one list of instructions per recipe.
    all_recipes = [whole_dataset[recipe_key] for recipe_key in whole_dataset.keys()]
    train_recipes, test_recipes = train_test_split(all_recipes, test_size=0.01, shuffle=True, random_state=42)
    train_instructions = extract_instructions_from_recipes(train_recipes)
    test_instructions = extract_instructions_from_recipes(test_recipes)
    print(f'Train Instructions: {len(train_instructions)}\n'
          f'Test Instructions: {len(test_instructions)}')
    with open(training_path, "w") as train_file:
        train_file.write('\n'.join(train_instructions))
    with open(testing_path, "w") as test_file:
        test_file.write('\n'.join(test_instructions))
def make_complete_dataset(whole_dataset_path, steps_path, out_path):
    """Attach the re-segmented instruction lists to the full recipe metadata and write the result.

    Parameters:
        whole_dataset_path: JSON file mapping recipe-id -> recipe metadata dict.
        steps_path: JSON file mapping recipe-id -> list of instruction strings.
        out_path: destination JSON file for the combined dataset.
    """
    with open(whole_dataset_path, "r") as metadata_file:
        combined = json.load(metadata_file)
    with open(steps_path, "r") as steps_file:
        steps_by_recipe = json.load(steps_file)
    # Mutate each metadata dict in place, overwriting its 'instructions' entry.
    for recipe_key, recipe_entry in combined.items():
        recipe_entry['instructions'] = steps_by_recipe[recipe_key]
    with open(out_path, "w") as result_file:
        json.dump(combined, result_file, ensure_ascii=False, indent=4)
def main():
    """Build the packed recipe dataset end-to-end.

    Loads the custom vocabulary, constructs a German-BERT tokenizer (with
    ingredient tokens protected from wordpiece splitting), re-segments the
    recipe sentences via variant 3, prints statistics, writes the flat
    datapoint file, attaches the new datapoints to the full dataset, and
    produces train/test instruction splits under ``data/``.
    """
    vocab_path = "train_model/vocab/"
    data_path = "data/"
    cache_dir = None
    config = AutoConfig.from_pretrained("bert-base-german-cased", cache_dir=cache_dir)
    # Ingredient tokens that the wordpiece tokenizer must never split.
    with open(vocab_path + "used_ingredients.json", "r") as used_ingredients_file:
        used_ingredients = json.load(used_ingredients_file)
    tokenizer = BertTokenizer(vocab_file=vocab_path + 'vocab.txt', do_lower_case=False, max_len=512, never_split=used_ingredients)
    model = AutoModelForMaskedLM.from_pretrained(
        "bert-base-german-cased",
        from_tf=bool(".ckpt" in "bert-base-german-cased"),  # always False for this hub model name
        config=config,
        cache_dir=cache_dir,
    )
    # Grow the embedding matrix to cover the extended vocabulary.
    # NOTE(review): the model is never used after this call — presumably a
    # leftover from the training script this was copied from; confirm.
    model.resize_token_embeddings(len(tokenizer))
    revised_dataset_path = data_path + "complete_dataset.json"
    # combine sentences into datapoints
    ## ADAPT WHICH DATASET VERSION TO MAKE!
    make_dataset3(tokenizer, out_path=revised_dataset_path)
    # get statistics for datapoints
    check_dataset(tokenizer, file_path=revised_dataset_path)
    # make list of datapoints
    format_dataset(input_path=revised_dataset_path, out_path="data/model_datapoints.txt")
    # change dataset to have new datapoints as ingredients
    make_complete_dataset(whole_dataset_path=data_path+"dataset_cleaned_steps_not_empty.json", steps_path=revised_dataset_path, out_path=data_path+"full_dataset.json")
    split_dataset(revised_dataset_path, data_path + "training_data.txt", data_path + "testing_data.txt")


if __name__ == '__main__':
    main()