initial commit of project
This commit is contained in:
333
train_model/language_modeling.py
Normal file
333
train_model/language_modeling.py
Normal file
@@ -0,0 +1,333 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
Fine-tuning the library models for language modeling on a text file (GPT, GPT-2, BERT, RoBERTa).
|
||||
GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa are fine-tuned
|
||||
using a masked language modeling (MLM) loss.
|
||||
"""
|
||||
# https://github.com/huggingface/transformers/blob/v2.9.1/examples/language-modeling/run_language_modeling.py
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
'''
|
||||
Modified huggingface code for pretraining FoodBERT.
|
||||
Running parameters:
|
||||
--output_dir=output --model_type=bert --model_name=bert-base-german-cased --do_train
|
||||
--train_data_file="data/training_data.txt" --do_eval --eval_data_file="data/testing_data.txt"
|
||||
--mlm --line_by_line --per_device_train_batch_size=8 --gradient_accumulation_steps=2 --per_device_eval_batch_size=8
|
||||
--save_total_limit=5 --save_steps=10000 --logging_steps=10000 --evaluation_strategy=epoch
|
||||
--model_name_or_path="bert-base-german-cased"
|
||||
may need adjustment (especially paths)
|
||||
'''
|
||||
|
||||
import json
|
||||
import logging
|
||||
import math
|
||||
import os
|
||||
import pickle
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional
|
||||
|
||||
from transformers import BertTokenizer
|
||||
from transformers import (
|
||||
CONFIG_MAPPING,
|
||||
MODEL_WITH_LM_HEAD_MAPPING,
|
||||
AutoConfig,
|
||||
AutoModelForMaskedLM,
|
||||
AutoTokenizer,
|
||||
DataCollatorForLanguageModeling,
|
||||
HfArgumentParser,
|
||||
LineByLineTextDataset,
|
||||
PreTrainedTokenizer,
|
||||
TextDataset,
|
||||
Trainer,
|
||||
TrainingArguments,
|
||||
set_seed, BertTokenizer,
|
||||
)
|
||||
|
||||
|
||||
# Module-level logger; configured in main() via logging.basicConfig.
logger = logging.getLogger(__name__)


# Config classes that ship a language-modeling head (keys of the transformers
# mapping), and the model-type identifiers accepted by --model_type.
MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
|
||||
|
||||
class CachedLineByLineTextDataset(LineByLineTextDataset):
    """LineByLineTextDataset with a pickle cache of the tokenized examples.

    The cache lives next to ``file_path`` as ``<stem>_cache.pth``.  On the
    first run the file is tokenized line by line and the resulting input-id
    lists are pickled; subsequent runs load the pickle instead of
    re-tokenizing.

    NOTE(review): the cache key is only the file name — it is NOT invalidated
    when ``block_size`` or the tokenizer changes.  Delete the cache file
    manually after changing either.
    """

    def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int, local_rank=-1):
        # `local_rank` is accepted for interface compatibility with
        # get_dataset() but is not used here.
        # Explicit error instead of `assert`: asserts are stripped under -O.
        if not os.path.isfile(file_path):
            raise FileNotFoundError(f"Input file not found: {file_path}")
        cached_file_path = Path(file_path.rsplit('.', 1)[0] + '_cache.pth')
        if cached_file_path.exists():
            # Log before the (potentially slow) load, not after.
            logger.info("Loading features from cached file %s", cached_file_path)
            with cached_file_path.open('rb') as f:
                self.examples = pickle.load(f)
        else:
            logger.info("Creating features from dataset file at %s", file_path)

            # Drop empty / whitespace-only lines; each remaining line becomes
            # one training example.
            with open(file_path, encoding="utf-8") as f:
                lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]

            batch_encoding = tokenizer.batch_encode_plus(lines, add_special_tokens=True, max_length=block_size, truncation=True)
            self.examples = batch_encoding["input_ids"]
            logger.info("Saving features into cached file %s", cached_file_path)
            with cached_file_path.open('wb') as f:
                pickle.dump(self.examples, f, protocol=pickle.HIGHEST_PROTOCOL)
        # Intentionally does NOT call super().__init__(): this __init__
        # replaces the base class's tokenization entirely.
|
||||
|
||||
|
||||
@dataclass
class ModelArguments:
    """Command-line arguments selecting the model, config and tokenizer
    to fine-tune (or to build from scratch when no checkpoint is given)."""

    # Checkpoint to initialize weights from; None trains from scratch.
    model_name_or_path: Optional[str] = field(
        default=None,
        metadata={
            "help": "The model checkpoint for weights initialization. Leave None if you want to train a model from scratch."
        },
    )
    # Architecture to instantiate when training from scratch.
    model_type: Optional[str] = field(
        default=None,
        metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)},
    )
    # Overrides used when the config differs from the model name.
    config_name: Optional[str] = field(
        default=None,
        metadata={"help": "Pretrained config name or path if not the same as model_name"},
    )
    # Overrides used when the tokenizer differs from the model name.
    tokenizer_name: Optional[str] = field(
        default=None,
        metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"},
    )
    # Local download cache for pretrained artifacts.
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where do you want to store the pretrained models downloaded from s3"},
    )
|
||||
|
||||
|
||||
@dataclass
class DataTrainingArguments:
    """Command-line arguments describing the input data and the
    (masked) language-modeling objective."""

    # Raw text file used for training.
    train_data_file: Optional[str] = field(
        default=None,
        metadata={"help": "The input training data file (a text file)."},
    )
    # Raw text file used for perplexity evaluation.
    eval_data_file: Optional[str] = field(
        default=None,
        metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
    )
    # Treat each line of the file as an independent example.
    line_by_line: bool = field(
        default=False,
        metadata={"help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."},
    )

    # Masked-LM objective; required for BERT/RoBERTa-like models.
    mlm: bool = field(
        default=False,
        metadata={"help": "Train with masked-language modeling loss instead of language modeling."},
    )
    mlm_probability: float = field(
        default=0.15,
        metadata={"help": "Ratio of tokens to mask for masked language modeling loss"},
    )

    # Sequence length after tokenization; -1 means "use the model maximum".
    block_size: int = field(
        default=-1,
        metadata={
            "help": "Optional input sequence length after tokenization."
            "The training dataset will be truncated in block of this size for training."
            "Default to the model max input length for single sentence inputs (take into account special tokens)."
        },
    )
    # NOTE(review): declared but not consulted by CachedLineByLineTextDataset.
    overwrite_cache: bool = field(
        default=False,
        metadata={"help": "Overwrite the cached training and evaluation sets"},
    )
|
||||
|
||||
|
||||
def get_dataset(args: DataTrainingArguments, tokenizer: PreTrainedTokenizer, evaluate=False, local_rank=-1):
    """Build the train (default) or eval dataset described by *args*.

    Uses the cached line-by-line dataset when --line_by_line is set,
    otherwise the block-concatenating TextDataset.  Both constructors
    take the same keyword arguments, so we only dispatch on the class.
    """
    file_path = args.eval_data_file if evaluate else args.train_data_file
    dataset_cls = CachedLineByLineTextDataset if args.line_by_line else TextDataset
    return dataset_cls(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=args.block_size,
        local_rank=local_rank,
    )
|
||||
|
||||
|
||||
def main():
    """Fine-tune (or pretrain from scratch) a masked-LM model on text files.

    Pipeline: parse CLI args -> load config -> build a custom BertTokenizer
    with ingredient tokens protected from splitting -> load/create the model
    -> build datasets -> train with the HuggingFace Trainer -> evaluate
    perplexity.  Returns a dict of evaluation results (empty if --do_eval is
    not set on the main process).
    """
    print("start time: " + time.strftime('%d.%m %H:%M'))

    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Fail fast on inconsistent flag combinations before any heavy loading.
    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument."
        )

    # Refuse to clobber a non-empty output directory unless explicitly allowed.
    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging (full verbosity only on the main process, rank -1 or 0)
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    # Config resolution order: --config_name, then --model_name_or_path,
    # then a fresh config of --model_type.
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")

    # FoodBERT customization: the tokenizer is always built from a local vocab,
    # with ingredient tokens protected from WordPiece splitting via never_split.
    # NOTE(review): paths are hard-coded relative to the working directory and
    # --tokenizer_name is ignored — run from the repository root.
    with open("train_model/vocab/used_ingredients.json", "r") as used_ingredients_file:
        used_ingredients = json.load(used_ingredients_file)
    tokenizer = BertTokenizer(vocab_file='train_model/vocab/vocab.txt', do_lower_case=False, model_max_length=512, never_split=used_ingredients)

    if model_args.model_name_or_path:
        model = AutoModelForMaskedLM.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelForMaskedLM.from_config(config)

    # The custom vocab may differ in size from the checkpoint's embedding table.
    model.resize_token_embeddings(len(tokenizer))

    if config.model_type in ["bert", "roberta", "distilbert", "camembert"] and not data_args.mlm:
        raise ValueError(
            "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the --mlm "
            "flag (masked language modeling)."
        )

    if data_args.block_size <= 0:
        data_args.block_size = tokenizer.model_max_length
        # Our input block size will be the max possible for the model
    else:
        data_args.block_size = min(data_args.block_size, tokenizer.model_max_length)

    # Get datasets (only those actually needed for the requested phases)
    train_dataset = (
        get_dataset(data_args, tokenizer=tokenizer, local_rank=training_args.local_rank)
        if training_args.do_train
        else None
    )
    eval_dataset = (
        get_dataset(data_args, tokenizer=tokenizer, local_rank=training_args.local_rank, evaluate=True)
        if training_args.do_eval
        else None
    )
    # The collator applies random masking per batch when mlm=True.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=data_args.mlm, mlm_probability=data_args.mlm_probability
    )
    # Make sure checkpoint recovery and continous training works on GPU, probably we need to make sure to push all parameters to the gpu
    # Solves bug in Trainer https://github.com/huggingface/transformers/issues/4240
    # put in if needed
    model.to(training_args.device)


    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset
    )

    # Training; resume from model_name_or_path only when it is a local directory
    # (i.e. an existing checkpoint, not a hub model id).
    if training_args.do_train:
        model_path = (
            model_args.model_name_or_path
            if model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path)
            else None
        )
        trainer.train(model_path=model_path)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_process_zero():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation (main process only): perplexity = exp(mean eval loss)
    results = {}
    if training_args.do_eval and training_args.local_rank in [-1, 0]:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        perplexity = math.exp(eval_output["eval_loss"])
        result = {"perplexity": perplexity}

        output_eval_file = os.path.join(training_args.output_dir, "eval_results_lm.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info(" %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

        results.update(result)

    print("end time: " + time.strftime('%d.%m %H:%M'))
    return results
|
||||
|
||||
|
||||
def _mp_fn(index):
    # For xla_spawn (TPUs)
    # `index` is the per-process ordinal supplied by xla_spawn; main() reads
    # its own arguments from sys.argv, so the index is intentionally unused.
    main()
|
||||
|
||||
|
||||
# Standard CLI entry point; keeps the module importable without side effects.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user