initial commit of project

2021-04-11 23:28:41 +02:00
commit a21a8186d9
110 changed files with 16326178 additions and 0 deletions
@@ -0,0 +1,15 @@
+# Vocab
+To create the vocab.txt file, run **make_new_vocab.py**
+
+# Prep dataset
+**prep_dataset_training**: Formats and splits the dataset so it can be used for training. Choose which dataset version to build by changing the `make_dataset*` call in `main()`!
+
+# Train German FoodBERT
+**language_modeling**: modified Hugging Face language-modeling script used to pretrain German FoodBERT
+
+
+# Vocab Files
+- **bert-base-german-cased_tokenizer.json**: original bert-base-german-cased tokenizer file
+- **bert_vocab.txt**: original bert-base-german-cased vocab
+- **used_ingredients.json**: all ingredients in the dataset
+- **vocab.txt**: German FoodBERT vocabulary
@@ -0,0 +1,333 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Fine-tuning the library models for language modeling on a text file (GPT, GPT-2, BERT, RoBERTa).
+GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa are fine-tuned
+using a masked language modeling (MLM) loss.
+"""
+# https://github.com/huggingface/transformers/blob/v2.9.1/examples/language-modeling/run_language_modeling.py
+import time
+from pathlib import Path
+
+'''
+Modified Hugging Face code for pretraining FoodBERT.
+Example run parameters (may need adjustment, especially the paths):
+--output_dir=output --model_type=bert --model_name_or_path="bert-base-german-cased" --do_train
+--train_data_file="data/training_data.txt" --do_eval --eval_data_file="data/testing_data.txt"
+--mlm --line_by_line --per_device_train_batch_size=8 --gradient_accumulation_steps=2 --per_device_eval_batch_size=8
+--save_total_limit=5 --save_steps=10000 --logging_steps=10000 --evaluation_strategy=epoch
+'''
+
+import json
+import logging
+import math
+import os
+import pickle
+from dataclasses import dataclass, field
+from typing import Optional
+
+from transformers import BertTokenizer
+from transformers import (
+    CONFIG_MAPPING,
+    MODEL_WITH_LM_HEAD_MAPPING,
+    AutoConfig,
+    AutoModelForMaskedLM,
+    AutoTokenizer,
+    DataCollatorForLanguageModeling,
+    HfArgumentParser,
+    LineByLineTextDataset,
+    PreTrainedTokenizer,
+    TextDataset,
+    Trainer,
+    TrainingArguments,
+    set_seed, BertTokenizer,
+)
+
+
+# Module-level logger; its handlers/level are configured by logging.basicConfig() inside main().
+logger = logging.getLogger(__name__)
+
+
+# Config classes registered in MODEL_WITH_LM_HEAD_MAPPING and their model-type names;
+# MODEL_TYPES is interpolated into the --model_type help text of ModelArguments.
+MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys())
+MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
+
+class CachedLineByLineTextDataset(LineByLineTextDataset):
+    """
+    Adds caching functionality to LineByLineTextDataset
+    """
+
+    def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int, local_rank=-1):
+        assert os.path.isfile(file_path)
+        cached_file_path = Path(file_path.rsplit('.', 1)[0] + '_cache.pth')
+        if cached_file_path.exists():
+            with cached_file_path.open('rb') as f:
+                self.examples = pickle.load(f)
+                logger.info(
+                    f"Loading features from cached file {cached_file_path}")
+
+        else:
+            logger.info("Creating features from dataset file at %s", file_path)
+
+            with open(file_path, encoding="utf-8") as f:
+                lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]
+
+            batch_encoding = tokenizer.batch_encode_plus(lines, add_special_tokens=True, max_length=block_size, truncation=True)
+            self.examples = batch_encoding["input_ids"]
+            with cached_file_path.open('wb') as f:
+                pickle.dump(self.examples, f, protocol=pickle.HIGHEST_PROTOCOL)
+                logger.info(
+                    f"Saving features into cached file {cached_file_path}")
+
+
+@dataclass
+class ModelArguments:
+    """
+    Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
+    """
+
+    model_name_or_path: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "The model checkpoint for weights initialization. Leave None if you want to train a model from scratch."
+        },
+    )
+    model_type: Optional[str] = field(
+        default=None,
+        metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)},
+    )
+    config_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
+    )
+    tokenizer_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
+    )
+    cache_dir: Optional[str] = field(
+        default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
+    )
+
+
+@dataclass
+class DataTrainingArguments:
+    """
+    Arguments pertaining to what data we are going to input our model for training and eval.
+    """
+
+    train_data_file: Optional[str] = field(
+        default=None, metadata={"help": "The input training data file (a text file)."}
+    )
+    eval_data_file: Optional[str] = field(
+        default=None,
+        metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
+    )
+    line_by_line: bool = field(
+        default=False,
+        metadata={"help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."},
+    )
+
+    mlm: bool = field(
+        default=False, metadata={"help": "Train with masked-language modeling loss instead of language modeling."}
+    )
+    mlm_probability: float = field(
+        default=0.15, metadata={"help": "Ratio of tokens to mask for masked language modeling loss"}
+    )
+
+    block_size: int = field(
+        default=-1,
+        metadata={
+            "help": "Optional input sequence length after tokenization."
+            "The training dataset will be truncated in block of this size for training."
+            "Default to the model max input length for single sentence inputs (take into account special tokens)."
+        },
+    )
+    overwrite_cache: bool = field(
+        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
+    )
+
+
+def get_dataset(args: DataTrainingArguments, tokenizer: PreTrainedTokenizer, evaluate=False, local_rank=-1):
+    file_path = args.eval_data_file if evaluate else args.train_data_file
+    if args.line_by_line:
+        return CachedLineByLineTextDataset(
+            tokenizer=tokenizer, file_path=file_path, block_size=args.block_size, local_rank=local_rank
+        )
+    else:
+        return TextDataset(
+            tokenizer=tokenizer, file_path=file_path, block_size=args.block_size, local_rank=local_rank,
+        )
+
+
+def main():
+    """
+    Parse the command line, build the FoodBERT tokenizer and masked-LM model, then train and/or evaluate.
+
+    Returns:
+        dict: ``{"perplexity": ...}`` when evaluation ran in this process (``--do_eval`` and
+        local rank -1 or 0), otherwise an empty dict.
+
+    Raises:
+        ValueError: if ``--do_eval`` is given without ``--eval_data_file``, if the output directory
+            is non-empty without ``--overwrite_output_dir``, or if a BERT-like model is run
+            without ``--mlm``.
+    """
+    print("start time: " + time.strftime('%d.%m %H:%M'))
+
+    # See all possible arguments in src/transformers/training_args.py
+    # or by passing the --help flag to this script.
+    # We now keep distinct sets of args, for a cleaner separation of concerns.
+
+    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
+    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+
+    if data_args.eval_data_file is None and training_args.do_eval:
+        raise ValueError(
+            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
+            "or remove the --do_eval argument."
+        )
+
+    # Refuse to train into a non-empty output directory unless explicitly allowed.
+    if (
+        os.path.exists(training_args.output_dir)
+        and os.listdir(training_args.output_dir)
+        and training_args.do_train
+        and not training_args.overwrite_output_dir
+    ):
+        raise ValueError(
+            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
+        )
+
+    # Setup logging (INFO on rank -1/0, WARN on every other rank)
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
+    )
+    logger.warning(
+        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+        training_args.local_rank,
+        training_args.device,
+        training_args.n_gpu,
+        bool(training_args.local_rank != -1),
+        training_args.fp16,
+    )
+    logger.info("Training/evaluation parameters %s", training_args)
+
+    # Set seed
+    set_seed(training_args.seed)
+
+    # Load pretrained model and tokenizer
+    #
+    # Distributed training:
+    # The .from_pretrained methods guarantee that only one local process can concurrently
+    # download model & vocab.
+
+    # Config precedence: --config_name, then --model_name_or_path, then a fresh config for --model_type.
+    if model_args.config_name:
+        config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
+    elif model_args.model_name_or_path:
+        config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
+    else:
+        config = CONFIG_MAPPING[model_args.model_type]()
+        logger.warning("You are instantiating a new config instance from scratch.")
+
+    # NOTE: the tokenizer is always built from the hard-coded FoodBERT vocab files below (paths are
+    # relative to the current working directory); --tokenizer_name is not consulted.
+    # The ingredient list is passed as never_split.
+    with open("train_model/vocab/used_ingredients.json", "r") as used_ingredients_file:
+        used_ingredients = json.load(used_ingredients_file)
+    tokenizer = BertTokenizer(vocab_file='train_model/vocab/vocab.txt', do_lower_case=False, model_max_length=512, never_split=used_ingredients)
+
+    if model_args.model_name_or_path:
+        model = AutoModelForMaskedLM.from_pretrained(
+            model_args.model_name_or_path,
+            from_tf=bool(".ckpt" in model_args.model_name_or_path),
+            config=config,
+            cache_dir=model_args.cache_dir,
+        )
+    else:
+        logger.info("Training new model from scratch")
+        model = AutoModelForMaskedLM.from_config(config)
+
+    # Resize the token embeddings to the size of the tokenizer built above.
+    model.resize_token_embeddings(len(tokenizer))
+
+    if config.model_type in ["bert", "roberta", "distilbert", "camembert"] and not data_args.mlm:
+        raise ValueError(
+            "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the --mlm "
+            "flag (masked language modeling)."
+        )
+
+    if data_args.block_size <= 0:
+        data_args.block_size = tokenizer.model_max_length
+        # Our input block size will be the max possible for the model
+    else:
+        # Never exceed what the tokenizer/model supports.
+        data_args.block_size = min(data_args.block_size, tokenizer.model_max_length)
+
+    # Get datasets
+    train_dataset = (
+        get_dataset(data_args, tokenizer=tokenizer, local_rank=training_args.local_rank)
+        if training_args.do_train
+        else None
+    )
+    eval_dataset = (
+        get_dataset(data_args, tokenizer=tokenizer, local_rank=training_args.local_rank, evaluate=True)
+        if training_args.do_eval
+        else None
+    )
+    data_collator = DataCollatorForLanguageModeling(
+        tokenizer=tokenizer, mlm=data_args.mlm, mlm_probability=data_args.mlm_probability
+    )
+    # Make sure checkpoint recovery and continuous training work on GPU; probably we need to make sure to push all parameters to the GPU.
+    # Solves bug in Trainer https://github.com/huggingface/transformers/issues/4240
+    # put in if needed
+    model.to(training_args.device)
+
+
+    # Initialize our Trainer
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        data_collator=data_collator,
+        train_dataset=train_dataset,
+        eval_dataset=eval_dataset
+    )
+
+    # Training
+    if training_args.do_train:
+        # Only a local checkpoint directory is passed on as model_path; a hub model name yields None.
+        model_path = (
+            model_args.model_name_or_path
+            if model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path)
+            else None
+        )
+        trainer.train(model_path=model_path)
+        trainer.save_model()
+        # For convenience, we also re-save the tokenizer to the same directory,
+        # so that you can share your model easily on huggingface.co/models =)
+        if trainer.is_world_process_zero():
+            tokenizer.save_pretrained(training_args.output_dir)
+
+    # Evaluation
+    results = {}
+    if training_args.do_eval and training_args.local_rank in [-1, 0]:
+        logger.info("*** Evaluate ***")
+
+        eval_output = trainer.evaluate()
+
+        # Perplexity is exp() of the evaluation loss.
+        perplexity = math.exp(eval_output["eval_loss"])
+        result = {"perplexity": perplexity}
+
+        # The metrics are both logged and written to <output_dir>/eval_results_lm.txt.
+        output_eval_file = os.path.join(training_args.output_dir, "eval_results_lm.txt")
+        with open(output_eval_file, "w") as writer:
+            logger.info("***** Eval results *****")
+            for key in sorted(result.keys()):
+                logger.info("  %s = %s", key, str(result[key]))
+                writer.write("%s = %s\n" % (key, str(result[key])))
+
+        results.update(result)
+
+    print("end time: " + time.strftime('%d.%m %H:%M'))
+    return results
+
+
+def _mp_fn(index):
+    """Per-process entry point for xla_spawn (TPUs); ``index`` is ignored and ``main()`` is run."""
+    # For xla_spawn (TPUs)
+    main()
+
+
+# Run the training script only when executed directly, not when imported.
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,82 @@
+import json
+
+# Directory prefixes (including the trailing slash); the functions below build file paths by
+# string-concatenating a file name onto them. Both are relative to the current working directory.
+data_path = "data/"
+vocab_path = "train_model/vocab/"
+
+def make_vocab_from_tokenizer(base_vocab):
+    with open(vocab_path + "vocab.txt", "w") as vocab_file:
+        for word in base_vocab:
+            vocab_file.write(word + "\n")
+
+
+
+def check_words_in_vocab(tokenizer):
+    ingredient_path = "mult_ingredients_nice.json"
+    with open(data_path + ingredient_path, "r") as ingr_json_file:
+        ingredients = json.load(ingr_json_file)
+
+    new_words = []
+    new_word_count = 0
+
+    for ingr in ingredients.keys():
+        if ingr not in tokenizer["model"]["vocab"].keys():
+            new_words.append(ingr)
+            new_word_count += 1
+            # print(new_word_count)
+        else:
+            print(ingr)
+
+    with open(vocab_path + "used_ingredients.json", "w") as used_ingredients_file:
+        json.dump(list(ingredients.keys()), used_ingredients_file, ensure_ascii=False, indent=4)
+        # used_ingredients_file.write("\n".join(ingredients.keys()))
+    print(str(new_word_count) + " words to be added to vocab")
+    return new_words
+
+
+def add_words_to_vocab(new_words):
+    with open(vocab_path + "vocab.txt", "a") as vocab_file:
+        vocab_file.write("\n")
+        vocab_file.write("\n".join(new_words))
+
+
+def create_base_vocab(tokenizer):
+    with open(vocab_path + "vocab.txt", "a") as vocab_file:
+        vocab_file.write("\n".join(tokenizer["model"]["vocab"].keys()))
+
+
+def check_existing(tokenizer):
+    ingredient_path = "mult_ingredients_nice.json"
+    with open(data_path + ingredient_path, "r") as ingr_json_file:
+        ingredients = json.load(ingr_json_file)
+
+    new_words = []
+    new_word_count = 0
+    old_word_count = 0
+
+    for ingr in ingredients.keys():
+        if ingr not in tokenizer["model"]["vocab"].keys():
+            new_words.append(ingr)
+            new_word_count += 1
+            # print(new_word_count)
+        else:
+            print(ingr)
+            old_word_count += 1
+    print(old_word_count)
+
+
+def main():
+    tokenizer_path = "train_model/vocab/bert-base-german-cased_tokenizer.json"
+    with open(tokenizer_path, "r") as whole_json_file:
+        tokenizer = json.load(whole_json_file)
+    # check_existing(tokenizer)
+
+    # make_vocab_from_tokenizer(tokenizer["model"]["vocab"])
+
+    new_words = check_words_in_vocab(tokenizer)
+
+    create_base_vocab(tokenizer)
+
+    add_words_to_vocab(new_words)
+
+
+main()
@@ -0,0 +1,373 @@
+from pathlib import Path
+
+from transformers import AutoConfig, BertTokenizer, AutoModelForMaskedLM
+import json
+import statistics
+from sklearn.model_selection import train_test_split
+
+
+def make_dataset(tokenizer, data_path="data/", out_path="data/complete_dataset.json"):
+    """
+    Pack the steps of every recipe into datapoints joined by " [SEP]" and write them as JSON.
+
+    Reads ``cleaned_sep_sentences_not_empty.json`` from ``data_path`` (recipe -> list of steps),
+    greedily concatenates consecutive steps while the running token count stays below the limit,
+    writes ``{recipe: [datapoint, ...]}`` to ``out_path`` and prints token/sentence statistics.
+
+    Args:
+        tokenizer: object with a ``tokenize(str)`` method returning a token list.
+        data_path: directory prefix (with trailing slash) of the input file.
+        out_path: path of the JSON file to write.
+
+    NOTE(review): when the last step of a recipe takes the overflow branch, ``entered`` is True and
+    the datapoint started by that step is never appended -- confirm whether it is meant to be dropped.
+    NOTE(review): a datapoint started in the overflow branch has no " [SEP]" suffix, yet
+    ``curr_step[:-6]`` is applied to it if the next step overflows as well -- this looks like it
+    cuts six characters of real text; confirm.
+    """
+    # Never reassigned (the assignment below is commented out), so the final print emits an empty line.
+    weird_step = ""
+
+    with open(data_path + 'cleaned_sep_sentences_not_empty.json', "r") as whole_sep_json_file:
+        steps_dataset = json.load(whole_sep_json_file)
+
+    all_datapoints = {}
+    tokens_per_step = []
+    tokens_per_recipe = []
+    sentences_per_recipe = []
+    recipe_nr = 1
+    for recipe in steps_dataset.keys():
+        # Progress output: running recipe counter.
+        print(recipe_nr)
+        recipe_nr += 1
+        recipe_list = []
+        # curr_step = "[CLS]"
+        curr_step = ""
+        nr_class_tokens = 1
+        curr_step_len = nr_class_tokens
+        # nr_sent is maintained but never read.
+        nr_sent = 0
+        sentences_per_recipe.append(len(steps_dataset[recipe]))
+        nr_recipe_tokens = 0
+        entered = False
+
+        for step in steps_dataset[recipe]:
+            entered = False
+            # Token count of the step including the added "[CLS]" / "[SEP]" markers.
+            step_tok = tokenizer.tokenize("[CLS] " + step + " [SEP]") # + " SEP"
+            step_len = len(step_tok)
+            # if step_len <= nr_class_tokens:
+            #     weird_step = step
+            tokens_per_step.append(step_len)
+            # add sentence to datapoint
+            if curr_step_len + (step_len - nr_class_tokens - 1) < 513:  # actually 512
+                curr_step += " " + step + " [SEP]"
+                # the step's own [CLS] is not counted again
+                curr_step_len += step_len - nr_class_tokens
+                nr_sent += 1
+                entered = False
+            # add sentence to next datapoint
+            else:
+                # curr_step += " [SEP]"
+                # Strip the last six characters (the trailing " [SEP]") and close the datapoint.
+                curr_step = curr_step[:-6]
+                curr_step_len -= 1
+                recipe_list.append(curr_step)
+                nr_recipe_tokens += curr_step_len
+                # Start the next datapoint with the step that did not fit.
+                curr_step = step
+                # curr_step = "[CLS] " + step
+                curr_step_len = step_len
+                nr_sent = 1
+                entered = True
+        # Flush the open datapoint unless the last step went through the overflow branch.
+        if not entered:
+            # curr_step += " [SEP]"
+            curr_step = curr_step[:-6]
+            curr_step_len -= 1
+            recipe_list.append(curr_step)
+            nr_recipe_tokens += curr_step_len
+
+        tokens_per_recipe.append(nr_recipe_tokens)
+        all_datapoints[recipe] = recipe_list
+
+    with open(out_path, "w") as whole_dataset:
+        json.dump(all_datapoints, whole_dataset, ensure_ascii=False, indent=4)
+
+
+    # Number of tokens in a single step/sentence
+    tokens_per_step.sort()
+    print("Smallest amount of tokens in a step: " + str(tokens_per_step[0]))
+    print("Largest amount of tokens in a step: " + str(tokens_per_step[len(tokens_per_step)-1]))
+    print("Average amount of tokens in a step: " + str(statistics.mean(tokens_per_step)))
+    print("Median amount of tokens in a step: " + str(statistics.median(tokens_per_step)))
+    # print(tokens_per_step)
+
+    # Number of tokens per recipe (sum over its datapoints)
+    tokens_per_recipe.sort()
+    print("Smallest amount of tokens in a recipe: " + str(tokens_per_recipe[0]))
+    print("Largest amount of tokens in a recipe: " + str(tokens_per_recipe[len(tokens_per_recipe)-1]))
+    print("Average amount of tokens in a recipe: " + str(statistics.mean(tokens_per_recipe)))
+    print("Median amount of tokens in a recipe: " + str(statistics.median(tokens_per_recipe)))
+    # print(tokens_per_recipe)
+
+    # Number of input steps per recipe
+    sentences_per_recipe.sort()
+    print("Smallest number of sentences in a recipe: " + str(sentences_per_recipe[0]))
+    print("Largest number of sentences in a recipe: " + str(sentences_per_recipe[len(sentences_per_recipe)-1]))
+    print("Average number of sentences in a recipe: " + str(statistics.mean(sentences_per_recipe)))
+    print("Median number of sentences in a recipe: " + str(statistics.median(sentences_per_recipe)))
+    # print(sentences_per_recipe)
+
+    print(weird_step)
+
+
+def make_dataset2(tokenizer, data_path="data/", out_path="data/complete_dataset.json"):
+    """
+    Pack the steps of every recipe into space-joined datapoints (no "[SEP]" markers) and write them as JSON.
+
+    Reads ``cleaned_sep_sentences_not_empty.json`` from ``data_path`` (recipe -> list of steps),
+    greedily concatenates consecutive steps while the running token count stays below the limit,
+    writes ``{recipe: [datapoint, ...]}`` to ``out_path`` and prints token/sentence statistics.
+
+    Args:
+        tokenizer: object with a ``tokenize(str)`` method returning a token list.
+        data_path: directory prefix (with trailing slash) of the input file.
+        out_path: path of the JSON file to write.
+
+    NOTE(review): when the last step of a recipe takes the overflow branch, ``entered`` is True and
+    the datapoint started by that step is never appended -- confirm whether it is meant to be dropped.
+    """
+
+    with open(data_path + 'cleaned_sep_sentences_not_empty.json', "r") as whole_sep_json_file:
+        steps_dataset = json.load(whole_sep_json_file)
+
+    all_datapoints = {}
+    tokens_per_step = []
+    tokens_per_recipe = []
+    sentences_per_recipe = []
+    recipe_nr = 1
+    for recipe in steps_dataset.keys():
+        # Progress output: running recipe counter.
+        print(recipe_nr)
+        recipe_nr += 1
+        recipe_list = []
+        # curr_step = "[CLS]"
+        curr_step = ""
+        # Number of marker tokens ("[CLS]" and "[SEP]") added around each step for counting.
+        nr_class_tokens = 2
+        curr_step_len = nr_class_tokens
+        # nr_sent is maintained but never read.
+        nr_sent = 0
+        sentences_per_recipe.append(len(steps_dataset[recipe]))
+        nr_recipe_tokens = 0
+        entered = False
+
+        for step in steps_dataset[recipe]:
+            entered = False
+            # Token count of the step including the added "[CLS]" / "[SEP]" markers.
+            step_tok = tokenizer.tokenize("[CLS] " + step + " [SEP]") # + " SEP"
+            step_len = len(step_tok)
+            # if step_len <= nr_class_tokens:
+            #     weird_step = step
+            tokens_per_step.append(step_len)
+            # add sentence to datapoint
+            if curr_step_len + (step_len - nr_class_tokens) < 513:  # actually 512
+                # The first step of a datapoint therefore gets a leading space.
+                curr_step += " " + step
+                # -2 since not adding CLS and SEP
+                curr_step_len += step_len - nr_class_tokens
+                nr_sent += 1
+                entered = False
+            # add sentence to next datapoint
+            else:
+                # curr_step += " [SEP]"
+                recipe_list.append(curr_step)
+                nr_recipe_tokens += curr_step_len
+                # Start the next datapoint with the step that did not fit.
+                curr_step = step
+                # curr_step = "[CLS] " + step
+                curr_step_len = step_len
+                nr_sent = 1
+                entered = True
+        # Flush the open datapoint unless the last step went through the overflow branch.
+        if not entered:
+            # curr_step += " [SEP]"
+            recipe_list.append(curr_step)
+            nr_recipe_tokens += curr_step_len
+
+        tokens_per_recipe.append(nr_recipe_tokens)
+        all_datapoints[recipe] = recipe_list
+
+    with open(out_path, "w") as whole_dataset:
+        json.dump(all_datapoints, whole_dataset, ensure_ascii=False, indent=4)
+
+
+    # Number of tokens in a single step/sentence
+    tokens_per_step.sort()
+    print("Smallest amount of tokens in a step: " + str(tokens_per_step[0]))
+    print("Largest amount of tokens in a step: " + str(tokens_per_step[len(tokens_per_step)-1]))
+    print("Average amount of tokens in a step: " + str(statistics.mean(tokens_per_step)))
+    print("Median amount of tokens in a step: " + str(statistics.median(tokens_per_step)))
+    # print(tokens_per_step)
+
+    # Number of tokens per recipe (sum over its datapoints)
+    tokens_per_recipe.sort()
+    print("Smallest amount of tokens in a recipe: " + str(tokens_per_recipe[0]))
+    print("Largest amount of tokens in a recipe: " + str(tokens_per_recipe[len(tokens_per_recipe)-1]))
+    print("Average amount of tokens in a recipe: " + str(statistics.mean(tokens_per_recipe)))
+    print("Median amount of tokens in a recipe: " + str(statistics.median(tokens_per_recipe)))
+    # print(tokens_per_recipe)
+
+    # Number of input steps per recipe
+    sentences_per_recipe.sort()
+    print("Smallest number of sentences in a recipe: " + str(sentences_per_recipe[0]))
+    print("Largest number of sentences in a recipe: " + str(sentences_per_recipe[len(sentences_per_recipe)-1]))
+    print("Average number of sentences in a recipe: " + str(statistics.mean(sentences_per_recipe)))
+    print("Median number of sentences in a recipe: " + str(statistics.median(sentences_per_recipe)))
+    # print(sentences_per_recipe)
+
+
+def make_dataset3(tokenizer, data_path="data/", out_path="data/complete_dataset.json"):
+    """
+    Write one datapoint per recipe step, merging very short steps (<= 5 characters) into a neighbour.
+
+    Reads ``cleaned_sep_sentences_not_empty.json`` from ``data_path`` (recipe -> list of steps),
+    writes ``{recipe: [datapoint, ...]}`` to ``out_path`` and prints token/sentence statistics.
+    Token statistics are only collected for steps longer than 5 characters.
+
+    Args:
+        tokenizer: object with a ``tokenize(str)`` method returning a token list.
+        data_path: directory prefix (with trailing slash) of the input file.
+        out_path: path of the JSON file to write.
+
+    NOTE(review): ``sentences_per_recipe`` receives two entries per recipe (the raw step count at the
+    top of the loop and ``nr_sent`` at the bottom), so the printed sentence statistics mix both --
+    confirm which one is intended.
+    NOTE(review): when a short step is deferred for the first time (``add_part`` False -> True),
+    ``prev_sentence`` is left unchanged, so the next long step is prefixed with the PREVIOUS step's
+    text rather than with the short step -- confirm whether ``prev_sentence = step`` was intended.
+    NOTE(review): a short step still deferred when the recipe ends is not written anywhere.
+    """
+
+    with open(data_path + 'cleaned_sep_sentences_not_empty.json', "r") as whole_sep_json_file:
+        steps_dataset = json.load(whole_sep_json_file)
+
+    all_datapoints = {}
+    tokens_per_step = []
+    tokens_per_recipe = []
+    sentences_per_recipe = []
+    recipe_nr = 1
+
+    for recipe in steps_dataset.keys():
+        # Progress output: running recipe counter.
+        print(recipe_nr)
+        recipe_nr += 1
+        recipe_list = []
+        nr_sent = 0
+        sentences_per_recipe.append(len(steps_dataset[recipe]))
+        nr_recipe_tokens = 0
+        # True while a short fragment is waiting to be prepended to the next long step.
+        add_part = False
+        prev_sentence = ""
+
+        for step in steps_dataset[recipe]:
+            # Steps of at most 5 characters are treated as fragments, not as datapoints of their own.
+            if len(step) <= 5:
+                # Previous text ends with "." or "!" (optionally followed by one space) and is not
+                # just that punctuation: defer the fragment to the following step.
+                if ((len(prev_sentence) > 0 and (prev_sentence[-1] == "." or prev_sentence[-1] == "!")) or (len(prev_sentence) > 1 and (prev_sentence[-2:] == ". " or prev_sentence[-2:] == "! "))) and not (prev_sentence == "." or prev_sentence == "!" or prev_sentence == ". " or prev_sentence == "! "):
+                    if add_part:
+                        prev_sentence += " " + step
+                    add_part = True
+                else:
+                    # Otherwise glue the fragment onto the last datapoint (or start the list with it).
+                    if not len(recipe_list) == 0:
+                        recipe_list[len(recipe_list)-1] += " " + step
+                    else:
+                        recipe_list.append(step)
+                    add_part = False
+                    prev_sentence = step
+            else:
+                if add_part:
+                    curr_step = prev_sentence + " " + step
+                    add_part = False
+                else:
+                    curr_step = step
+                recipe_list.append(curr_step)
+                # Token count of the datapoint including the added "[CLS]" / "[SEP]" markers.
+                step_tok = tokenizer.tokenize("[CLS] " + curr_step + " [SEP]")
+                step_len = len(step_tok)
+                nr_recipe_tokens += step_len
+                prev_sentence = step
+                tokens_per_step.append(step_len)
+                nr_sent += 1
+        sentences_per_recipe.append(nr_sent)
+        all_datapoints[recipe] = recipe_list
+        tokens_per_recipe.append(nr_recipe_tokens)
+
+    with open(out_path, "w") as whole_dataset:
+        json.dump(all_datapoints, whole_dataset, ensure_ascii=False, indent=4)
+
+    # Number of tokens in a single step/sentence
+    tokens_per_step.sort()
+    print("Smallest amount of tokens in a step: " + str(tokens_per_step[0]))
+    print("Largest amount of tokens in a step: " + str(tokens_per_step[len(tokens_per_step)-1]))
+    print("Average amount of tokens in a step: " + str(statistics.mean(tokens_per_step)))
+    print("Median amount of tokens in a step: " + str(statistics.median(tokens_per_step)))
+    # print(tokens_per_step)
+
+    # Number of tokens per recipe
+    tokens_per_recipe.sort()
+    print("Smallest amount of tokens in a recipe: " + str(tokens_per_recipe[0]))
+    print("Largest amount of tokens in a recipe: " + str(tokens_per_recipe[len(tokens_per_recipe)-1]))
+    print("Average amount of tokens in a recipe: " + str(statistics.mean(tokens_per_recipe)))
+    print("Median amount of tokens in a recipe: " + str(statistics.median(tokens_per_recipe)))
+    # print(tokens_per_recipe)
+
+    # Number of sentences per recipe (see the NOTE(review) in the docstring)
+    sentences_per_recipe.sort()
+    print("Smallest number of sentences in a recipe: " + str(sentences_per_recipe[0]))
+    print("Largest number of sentences in a recipe: " + str(sentences_per_recipe[len(sentences_per_recipe)-1]))
+    print("Average number of sentences in a recipe: " + str(statistics.mean(sentences_per_recipe)))
+    print("Median number of sentences in a recipe: " + str(statistics.median(sentences_per_recipe)))
+    # print(sentences_per_recipe)
+
+
+def check_dataset(tokenizer, file_path="data/complete_dataset.json"):
+    with open(file_path, "r") as whole_sep_json_file:
+        dataset = json.load(whole_sep_json_file)
+
+    tokens_per_step = []
+
+    recipe_nr = 1
+    for recipe in dataset.keys():
+        if recipe_nr % 10000 == 0:
+            print(recipe_nr)
+        recipe_nr += 1
+        for step in dataset[recipe]:
+            step_tok = tokenizer.tokenize(step)
+            step_len = len(step_tok)
+            tokens_per_step.append(step_len)
+
+    # Number of tokens in a single step/sentence
+    tokens_per_step.sort()
+    print("Smallest amount of tokens in a step: " + str(tokens_per_step[0]))
+    print("Largest amount of tokens in a step: " + str(tokens_per_step[len(tokens_per_step) - 1]))
+    print("Average amount of tokens in a step: " + str(statistics.mean(tokens_per_step)))
+    print("Median amount of tokens in a step: " + str(statistics.median(tokens_per_step)))
+
+
+def format_dataset(input_path, out_path="data/model_datapoints.txt"):
+    with open(input_path, "r") as whole_sep_json_file:
+        dataset = json.load(whole_sep_json_file)
+    recipe_data = []
+    for recipe in dataset.keys():
+        recipe_data.append("\n".join(dataset[recipe]))
+    with open(out_path, "w") as whole_dataset:
+        whole_dataset.write("\n".join(recipe_data))
+
+
+def extract_instructions_from_recipes(needed_recipes):
+    instructions = []
+    for recipe in needed_recipes:
+        for instruction in recipe:
+            instructions.append(instruction)
+
+    return instructions
+
+
+
+def split_dataset(input_path, training_path, testing_path):
+    with open(input_path, "r") as f:
+        whole_dataset = json.load(f)
+
+    all_recipes = []
+    for recipe in whole_dataset.keys():
+        all_recipes += [whole_dataset[recipe]]
+
+    train_recipes, test_recipes = train_test_split(all_recipes, test_size=0.01, shuffle=True, random_state=42)
+    train_instructions = extract_instructions_from_recipes(train_recipes)
+    test_instructions = extract_instructions_from_recipes(test_recipes)
+
+    print(f'Train Instructions: {len(train_instructions)}\n'
+          f'Test Instructions: {len(test_instructions)}')
+
+    with open(training_path, "w") as f:
+        f.write('\n'.join(train_instructions))
+
+    with open(testing_path, "w") as f:
+        f.write('\n'.join(test_instructions))
+
+def make_complete_dataset(whole_dataset_path, steps_path, out_path):
+    with open(whole_dataset_path, "r") as f:
+        whole_dataset = json.load(f)
+
+    with open(steps_path, "r") as f:
+        all_steps = json.load(f)
+
+    for recipe in whole_dataset.keys():
+        whole_dataset[recipe]['instructions'] = all_steps[recipe]
+
+    with open(out_path, "w") as f:
+        json.dump(whole_dataset, f, ensure_ascii=False, indent=4)
+
+def main():
+    vocab_path = "train_model/vocab/"
+    data_path = "data/"
+    cache_dir = None
+    config = AutoConfig.from_pretrained("bert-base-german-cased", cache_dir=cache_dir)
+
+    with open(vocab_path + "used_ingredients.json", "r") as used_ingredients_file:
+        used_ingredients = json.load(used_ingredients_file)
+    tokenizer = BertTokenizer(vocab_file=vocab_path + 'vocab.txt', do_lower_case=False, max_len=512, never_split=used_ingredients)
+
+    model = AutoModelForMaskedLM.from_pretrained(
+        "bert-base-german-cased",
+        from_tf=bool(".ckpt" in "bert-base-german-cased"),
+        config=config,
+        cache_dir=cache_dir,
+    )
+
+    model.resize_token_embeddings(len(tokenizer))
+
+    revised_dataset_path = data_path + "complete_dataset.json"
+
+    # combine sentences into datapoints
+    ## ADAPT WHICH DATASET VERSION TO MAKE!
+    make_dataset3(tokenizer, out_path=revised_dataset_path)
+
+    # get statistics for datapoints
+    check_dataset(tokenizer, file_path=revised_dataset_path)
+
+    # make list of datapoints
+    format_dataset(input_path=revised_dataset_path, out_path="data/model_datapoints.txt")
+
+    # change dataset to have new datapoints as ingredients
+    make_complete_dataset(whole_dataset_path=data_path+"dataset_cleaned_steps_not_empty.json", steps_path=revised_dataset_path, out_path=data_path+"full_dataset.json")
+
+    split_dataset(revised_dataset_path, data_path + "training_data.txt", data_path + "testing_data.txt")
+
+
+if __name__ == '__main__':
+    main()