initial commit of project
This commit is contained in:
333
train_model/language_modeling.py
Normal file
333
train_model/language_modeling.py
Normal file
@@ -0,0 +1,333 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
|
||||
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
Fine-tuning the library models for language modeling on a text file (GPT, GPT-2, BERT, RoBERTa).
|
||||
GPT and GPT-2 are fine-tuned using a causal language modeling (CLM) loss while BERT and RoBERTa are fine-tuned
|
||||
using a masked language modeling (MLM) loss.
|
||||
"""
|
||||
# https://github.com/huggingface/transformers/blob/v2.9.1/examples/language-modeling/run_language_modeling.py
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
'''
|
||||
Modified huggingface code for pretraining FoodBERT.
|
||||
Running parameters:
|
||||
--output_dir=output --model_type=bert --model_name=bert-base-german-cased --do_train
|
||||
--train_data_file="data/training_data.txt" --do_eval --eval_data_file="data/testing_data.txt"
|
||||
--mlm --line_by_line --per_device_train_batch_size=8 --gradient_accumulation_steps=2 --per_device_eval_batch_size=8
|
||||
--save_total_limit=5 --save_steps=10000 --logging_steps=10000 --evaluation_strategy=epoch
|
||||
--model_name_or_path="bert-base-german-cased"
|
||||
may need adjustment (especially paths)
|
||||
'''
|
||||
|
||||
import json
|
||||
import logging
|
||||
import math
|
||||
import os
|
||||
import pickle
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional
|
||||
|
||||
from transformers import BertTokenizer
|
||||
from transformers import (
|
||||
CONFIG_MAPPING,
|
||||
MODEL_WITH_LM_HEAD_MAPPING,
|
||||
AutoConfig,
|
||||
AutoModelForMaskedLM,
|
||||
AutoTokenizer,
|
||||
DataCollatorForLanguageModeling,
|
||||
HfArgumentParser,
|
||||
LineByLineTextDataset,
|
||||
PreTrainedTokenizer,
|
||||
TextDataset,
|
||||
Trainer,
|
||||
TrainingArguments,
|
||||
set_seed, BertTokenizer,
|
||||
)
|
||||
|
||||
|
||||
# Module-level logger; configured in main() via logging.basicConfig.
logger = logging.getLogger(__name__)


# Config classes that ship a language-modeling head (keys of the transformers
# mapping), and the model-type identifiers accepted by --model_type.
MODEL_CONFIG_CLASSES = list(MODEL_WITH_LM_HEAD_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
|
||||
|
||||
class CachedLineByLineTextDataset(LineByLineTextDataset):
    """LineByLineTextDataset with a pickle cache of the tokenized examples.

    The cache lives next to ``file_path`` as ``<stem>_cache.pth``.  On the
    first run the file is tokenized line by line and the resulting input-id
    lists are pickled; subsequent runs load the pickle instead of
    re-tokenizing.

    NOTE(review): the cache key is only the file name — it is NOT invalidated
    when ``block_size`` or the tokenizer changes.  Delete the cache file
    manually after changing either.
    """

    def __init__(self, tokenizer: PreTrainedTokenizer, file_path: str, block_size: int, local_rank=-1):
        # `local_rank` is accepted for interface compatibility with
        # get_dataset() but is not used here.
        # Explicit error instead of `assert`: asserts are stripped under -O.
        if not os.path.isfile(file_path):
            raise FileNotFoundError(f"Input file not found: {file_path}")
        cached_file_path = Path(file_path.rsplit('.', 1)[0] + '_cache.pth')
        if cached_file_path.exists():
            # Log before the (potentially slow) load, not after.
            logger.info("Loading features from cached file %s", cached_file_path)
            with cached_file_path.open('rb') as f:
                self.examples = pickle.load(f)
        else:
            logger.info("Creating features from dataset file at %s", file_path)

            # Drop empty / whitespace-only lines; each remaining line becomes
            # one training example.
            with open(file_path, encoding="utf-8") as f:
                lines = [line for line in f.read().splitlines() if (len(line) > 0 and not line.isspace())]

            batch_encoding = tokenizer.batch_encode_plus(lines, add_special_tokens=True, max_length=block_size, truncation=True)
            self.examples = batch_encoding["input_ids"]
            logger.info("Saving features into cached file %s", cached_file_path)
            with cached_file_path.open('wb') as f:
                pickle.dump(self.examples, f, protocol=pickle.HIGHEST_PROTOCOL)
        # Intentionally does NOT call super().__init__(): this __init__
        # replaces the base class's tokenization entirely.
|
||||
|
||||
|
||||
@dataclass
class ModelArguments:
    """Command-line arguments selecting the model, config and tokenizer
    to fine-tune (or to build from scratch when no checkpoint is given)."""

    # Checkpoint to initialize weights from; None trains from scratch.
    model_name_or_path: Optional[str] = field(
        default=None,
        metadata={
            "help": "The model checkpoint for weights initialization. Leave None if you want to train a model from scratch."
        },
    )
    # Architecture to instantiate when training from scratch.
    model_type: Optional[str] = field(
        default=None,
        metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)},
    )
    # Overrides used when the config differs from the model name.
    config_name: Optional[str] = field(
        default=None,
        metadata={"help": "Pretrained config name or path if not the same as model_name"},
    )
    # Overrides used when the tokenizer differs from the model name.
    tokenizer_name: Optional[str] = field(
        default=None,
        metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"},
    )
    # Local download cache for pretrained artifacts.
    cache_dir: Optional[str] = field(
        default=None,
        metadata={"help": "Where do you want to store the pretrained models downloaded from s3"},
    )
|
||||
|
||||
|
||||
@dataclass
class DataTrainingArguments:
    """Command-line arguments describing the input data and the
    (masked) language-modeling objective."""

    # Raw text file used for training.
    train_data_file: Optional[str] = field(
        default=None,
        metadata={"help": "The input training data file (a text file)."},
    )
    # Raw text file used for perplexity evaluation.
    eval_data_file: Optional[str] = field(
        default=None,
        metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
    )
    # Treat each line of the file as an independent example.
    line_by_line: bool = field(
        default=False,
        metadata={"help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."},
    )

    # Masked-LM objective; required for BERT/RoBERTa-like models.
    mlm: bool = field(
        default=False,
        metadata={"help": "Train with masked-language modeling loss instead of language modeling."},
    )
    mlm_probability: float = field(
        default=0.15,
        metadata={"help": "Ratio of tokens to mask for masked language modeling loss"},
    )

    # Sequence length after tokenization; -1 means "use the model maximum".
    block_size: int = field(
        default=-1,
        metadata={
            "help": "Optional input sequence length after tokenization."
            "The training dataset will be truncated in block of this size for training."
            "Default to the model max input length for single sentence inputs (take into account special tokens)."
        },
    )
    # NOTE(review): declared but not consulted by CachedLineByLineTextDataset.
    overwrite_cache: bool = field(
        default=False,
        metadata={"help": "Overwrite the cached training and evaluation sets"},
    )
|
||||
|
||||
|
||||
def get_dataset(args: DataTrainingArguments, tokenizer: PreTrainedTokenizer, evaluate=False, local_rank=-1):
    """Build the train (default) or eval dataset described by *args*.

    Uses the cached line-by-line dataset when --line_by_line is set,
    otherwise the block-concatenating TextDataset.  Both constructors
    take the same keyword arguments, so we only dispatch on the class.
    """
    file_path = args.eval_data_file if evaluate else args.train_data_file
    dataset_cls = CachedLineByLineTextDataset if args.line_by_line else TextDataset
    return dataset_cls(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=args.block_size,
        local_rank=local_rank,
    )
|
||||
|
||||
|
||||
def main():
    """Fine-tune (or pretrain from scratch) a masked-LM model on text files.

    Pipeline: parse CLI args -> load config -> build a custom BertTokenizer
    with ingredient tokens protected from splitting -> load/create the model
    -> build datasets -> train with the HuggingFace Trainer -> evaluate
    perplexity.  Returns a dict of evaluation results (empty if --do_eval is
    not set on the main process).
    """
    print("start time: " + time.strftime('%d.%m %H:%M'))

    # See all possible arguments in src/transformers/training_args.py
    # or by passing the --help flag to this script.
    # We now keep distinct sets of args, for a cleaner separation of concerns.

    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    # Fail fast on inconsistent flag combinations before any heavy loading.
    if data_args.eval_data_file is None and training_args.do_eval:
        raise ValueError(
            "Cannot do evaluation without an evaluation data file. Either supply a file to --eval_data_file "
            "or remove the --do_eval argument."
        )

    # Refuse to clobber a non-empty output directory unless explicitly allowed.
    if (
        os.path.exists(training_args.output_dir)
        and os.listdir(training_args.output_dir)
        and training_args.do_train
        and not training_args.overwrite_output_dir
    ):
        raise ValueError(
            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
        )

    # Setup logging (full verbosity only on the main process, rank -1 or 0)
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    # Set seed
    set_seed(training_args.seed)

    # Load pretrained model and tokenizer
    #
    # Distributed training:
    # The .from_pretrained methods guarantee that only one local process can concurrently
    # download model & vocab.

    # Config resolution order: --config_name, then --model_name_or_path,
    # then a fresh config of --model_type.
    if model_args.config_name:
        config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
    elif model_args.model_name_or_path:
        config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
    else:
        config = CONFIG_MAPPING[model_args.model_type]()
        logger.warning("You are instantiating a new config instance from scratch.")

    # FoodBERT customization: the tokenizer is always built from a local vocab,
    # with ingredient tokens protected from WordPiece splitting via never_split.
    # NOTE(review): paths are hard-coded relative to the working directory and
    # --tokenizer_name is ignored — run from the repository root.
    with open("train_model/vocab/used_ingredients.json", "r") as used_ingredients_file:
        used_ingredients = json.load(used_ingredients_file)
    tokenizer = BertTokenizer(vocab_file='train_model/vocab/vocab.txt', do_lower_case=False, model_max_length=512, never_split=used_ingredients)

    if model_args.model_name_or_path:
        model = AutoModelForMaskedLM.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )
    else:
        logger.info("Training new model from scratch")
        model = AutoModelForMaskedLM.from_config(config)

    # The custom vocab may differ in size from the checkpoint's embedding table.
    model.resize_token_embeddings(len(tokenizer))

    if config.model_type in ["bert", "roberta", "distilbert", "camembert"] and not data_args.mlm:
        raise ValueError(
            "BERT and RoBERTa-like models do not have LM heads but masked LM heads. They must be run using the --mlm "
            "flag (masked language modeling)."
        )

    if data_args.block_size <= 0:
        data_args.block_size = tokenizer.model_max_length
        # Our input block size will be the max possible for the model
    else:
        data_args.block_size = min(data_args.block_size, tokenizer.model_max_length)

    # Get datasets (only those actually needed for the requested phases)
    train_dataset = (
        get_dataset(data_args, tokenizer=tokenizer, local_rank=training_args.local_rank)
        if training_args.do_train
        else None
    )
    eval_dataset = (
        get_dataset(data_args, tokenizer=tokenizer, local_rank=training_args.local_rank, evaluate=True)
        if training_args.do_eval
        else None
    )
    # The collator applies random masking per batch when mlm=True.
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer, mlm=data_args.mlm, mlm_probability=data_args.mlm_probability
    )
    # Make sure checkpoint recovery and continous training works on GPU, probably we need to make sure to push all parameters to the gpu
    # Solves bug in Trainer https://github.com/huggingface/transformers/issues/4240
    # put in if needed
    model.to(training_args.device)


    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset
    )

    # Training; resume from model_name_or_path only when it is a local directory
    # (i.e. an existing checkpoint, not a hub model id).
    if training_args.do_train:
        model_path = (
            model_args.model_name_or_path
            if model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path)
            else None
        )
        trainer.train(model_path=model_path)
        trainer.save_model()
        # For convenience, we also re-save the tokenizer to the same directory,
        # so that you can share your model easily on huggingface.co/models =)
        if trainer.is_world_process_zero():
            tokenizer.save_pretrained(training_args.output_dir)

    # Evaluation (main process only): perplexity = exp(mean eval loss)
    results = {}
    if training_args.do_eval and training_args.local_rank in [-1, 0]:
        logger.info("*** Evaluate ***")

        eval_output = trainer.evaluate()

        perplexity = math.exp(eval_output["eval_loss"])
        result = {"perplexity": perplexity}

        output_eval_file = os.path.join(training_args.output_dir, "eval_results_lm.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            for key in sorted(result.keys()):
                logger.info(" %s = %s", key, str(result[key]))
                writer.write("%s = %s\n" % (key, str(result[key])))

        results.update(result)

    print("end time: " + time.strftime('%d.%m %H:%M'))
    return results
|
||||
|
||||
|
||||
def _mp_fn(index):
    # For xla_spawn (TPUs)
    # `index` is the per-process ordinal supplied by xla_spawn; main() reads
    # its own arguments from sys.argv, so the index is intentionally unused.
    main()
|
||||
|
||||
|
||||
# Standard CLI entry point; keeps the module importable without side effects.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user