initial commit of project

This commit is contained in:
2021-04-11 19:51:12 +02:00
commit a21a8186d9
110 changed files with 16326178 additions and 0 deletions

View File

@@ -0,0 +1,373 @@
from pathlib import Path
from transformers import AutoConfig, BertTokenizer, AutoModelForMaskedLM
import json
import statistics
from sklearn.model_selection import train_test_split
def make_dataset(tokenizer, data_path="data/", out_path="data/complete_dataset.json"):
    """Greedily pack each recipe's sentences into "[SEP]"-joined datapoints under a 513-token budget.

    Variant 1: every sentence appended to a datapoint is followed by " [SEP]";
    when a datapoint is finished, its trailing " [SEP]" (6 characters) is
    sliced off again. Writes a mapping recipe-id -> list of datapoint strings
    to *out_path* and prints token/sentence statistics.

    Parameters:
        tokenizer: BERT-style tokenizer exposing ``tokenize(text) -> list[str]``.
        data_path: directory containing ``cleaned_sep_sentences_not_empty.json``
            (mapping recipe-id -> list of sentence strings).
        out_path: destination JSON file for the packed dataset.
    """
    weird_step = ""  # scratch slot for the commented-out suspicious-step probe below
    with open(data_path + 'cleaned_sep_sentences_not_empty.json', "r") as whole_sep_json_file:
        steps_dataset = json.load(whole_sep_json_file)
    all_datapoints = {}       # recipe id -> list of packed datapoint strings
    tokens_per_step = []      # token count of every individual sentence
    tokens_per_recipe = []    # total token count per recipe
    sentences_per_recipe = [] # raw sentence count per recipe
    recipe_nr = 1
    for recipe in steps_dataset.keys():
        print(recipe_nr)  # progress indicator
        recipe_nr += 1
        recipe_list = []
        # curr_step = "[CLS]"
        curr_step = ""
        nr_class_tokens = 1  # only [CLS] counted as per-datapoint overhead in this variant
        curr_step_len = nr_class_tokens
        nr_sent = 0
        sentences_per_recipe.append(len(steps_dataset[recipe]))
        nr_recipe_tokens = 0
        entered = False  # True iff the LAST sentence overflowed and started a fresh datapoint
        for step in steps_dataset[recipe]:
            entered = False
            step_tok = tokenizer.tokenize("[CLS] " + step + " [SEP]")  # + " SEP"
            step_len = len(step_tok)
            # if step_len <= nr_class_tokens:
            # weird_step = step
            tokens_per_step.append(step_len)
            # add sentence to datapoint
            if curr_step_len + (step_len - nr_class_tokens - 1) < 513:  # actually 512
                curr_step += " " + step + " [SEP]"
                # -2 since not adding CLS and SEP
                curr_step_len += step_len - nr_class_tokens
                nr_sent += 1
                entered = False
            # add sentence to next datapoint
            else:
                # curr_step += " [SEP]"
                curr_step = curr_step[:-6]  # drop the trailing " [SEP]" (6 chars)
                curr_step_len -= 1
                recipe_list.append(curr_step)
                nr_recipe_tokens += curr_step_len
                # NOTE(review): if curr_step was still empty (first sentence already
                # over budget) this appends an empty-string datapoint — confirm intended.
                curr_step = step
                # curr_step = "[CLS] " + step
                curr_step_len = step_len
                nr_sent = 1
                entered = True
        # Flush the still-open datapoint of this recipe.
        # NOTE(review): if the LAST sentence overflowed (entered == True), the fresh
        # datapoint holding it is never flushed and is silently dropped — verify.
        if not entered:
            # curr_step += " [SEP]"
            curr_step = curr_step[:-6]
            curr_step_len -= 1
            recipe_list.append(curr_step)
            nr_recipe_tokens += curr_step_len
        tokens_per_recipe.append(nr_recipe_tokens)
        all_datapoints[recipe] = recipe_list
    with open(out_path, "w") as whole_dataset:
        json.dump(all_datapoints, whole_dataset, ensure_ascii=False, indent=4)
    # Number of tokens in a single step/sentence
    tokens_per_step.sort()
    print("Smallest amount of tokens in a step: " + str(tokens_per_step[0]))
    print("Largest amount of tokens in a step: " + str(tokens_per_step[len(tokens_per_step)-1]))
    print("Average amount of tokens in a step: " + str(statistics.mean(tokens_per_step)))
    print("Median amount of tokens in a step: " + str(statistics.median(tokens_per_step)))
    # print(tokens_per_step)
    tokens_per_recipe.sort()
    print("Smallest amount of tokens in a recipe: " + str(tokens_per_recipe[0]))
    print("Largest amount of tokens in a recipe: " + str(tokens_per_recipe[len(tokens_per_recipe)-1]))
    print("Average amount of tokens in a recipe: " + str(statistics.mean(tokens_per_recipe)))
    print("Median amount of tokens in a recipe: " + str(statistics.median(tokens_per_recipe)))
    # print(tokens_per_recipe)
    sentences_per_recipe.sort()
    print("Smallest number of sentences in a recipe: " + str(sentences_per_recipe[0]))
    print("Largest number of sentences in a recipe: " + str(sentences_per_recipe[len(sentences_per_recipe)-1]))
    print("Average number of sentences in a recipe: " + str(statistics.mean(sentences_per_recipe)))
    print("Median number of sentences in a recipe: " + str(statistics.median(sentences_per_recipe)))
    # print(sentences_per_recipe)
    print(weird_step)
def make_dataset2(tokenizer, data_path="data/", out_path="data/complete_dataset.json"):
    """Greedily pack each recipe's sentences into plain space-joined datapoints under a 513-token budget.

    Variant 2: like :func:`make_dataset`, but sentences are joined with a
    plain space (no intermediate " [SEP]") and both [CLS] and [SEP] are
    counted as per-datapoint overhead (``nr_class_tokens = 2``). Writes a
    mapping recipe-id -> list of datapoint strings to *out_path* and prints
    token/sentence statistics.

    Parameters:
        tokenizer: BERT-style tokenizer exposing ``tokenize(text) -> list[str]``.
        data_path: directory containing ``cleaned_sep_sentences_not_empty.json``.
        out_path: destination JSON file for the packed dataset.
    """
    with open(data_path + 'cleaned_sep_sentences_not_empty.json', "r") as whole_sep_json_file:
        steps_dataset = json.load(whole_sep_json_file)
    all_datapoints = {}       # recipe id -> list of packed datapoint strings
    tokens_per_step = []      # token count of every individual sentence
    tokens_per_recipe = []    # total token count per recipe
    sentences_per_recipe = [] # raw sentence count per recipe
    recipe_nr = 1
    for recipe in steps_dataset.keys():
        print(recipe_nr)  # progress indicator
        recipe_nr += 1
        recipe_list = []
        # curr_step = "[CLS]"
        curr_step = ""
        nr_class_tokens = 2  # [CLS] and [SEP] are both counted as overhead here
        curr_step_len = nr_class_tokens
        nr_sent = 0
        sentences_per_recipe.append(len(steps_dataset[recipe]))
        nr_recipe_tokens = 0
        entered = False  # True iff the LAST sentence overflowed and started a fresh datapoint
        for step in steps_dataset[recipe]:
            entered = False
            step_tok = tokenizer.tokenize("[CLS] " + step + " [SEP]")  # + " SEP"
            step_len = len(step_tok)
            # if step_len <= nr_class_tokens:
            # weird_step = step
            tokens_per_step.append(step_len)
            # add sentence to datapoint
            if curr_step_len + (step_len - nr_class_tokens) < 513:  # actually 512
                curr_step += " " + step
                # -2 since not adding CLS and SEP
                curr_step_len += step_len - nr_class_tokens
                nr_sent += 1
                entered = False
            # add sentence to next datapoint
            else:
                # curr_step += " [SEP]"
                recipe_list.append(curr_step)
                nr_recipe_tokens += curr_step_len
                curr_step = step
                # curr_step = "[CLS] " + step
                curr_step_len = step_len
                nr_sent = 1
                entered = True
        # Flush the still-open datapoint of this recipe.
        # NOTE(review): if the LAST sentence overflowed (entered == True), the fresh
        # datapoint holding it is never flushed and is silently dropped — verify.
        if not entered:
            # curr_step += " [SEP]"
            recipe_list.append(curr_step)
            nr_recipe_tokens += curr_step_len
        tokens_per_recipe.append(nr_recipe_tokens)
        all_datapoints[recipe] = recipe_list
    with open(out_path, "w") as whole_dataset:
        json.dump(all_datapoints, whole_dataset, ensure_ascii=False, indent=4)
    # Number of tokens in a single step/sentence
    tokens_per_step.sort()
    print("Smallest amount of tokens in a step: " + str(tokens_per_step[0]))
    print("Largest amount of tokens in a step: " + str(tokens_per_step[len(tokens_per_step)-1]))
    print("Average amount of tokens in a step: " + str(statistics.mean(tokens_per_step)))
    print("Median amount of tokens in a step: " + str(statistics.median(tokens_per_step)))
    # print(tokens_per_step)
    tokens_per_recipe.sort()
    print("Smallest amount of tokens in a recipe: " + str(tokens_per_recipe[0]))
    print("Largest amount of tokens in a recipe: " + str(tokens_per_recipe[len(tokens_per_recipe)-1]))
    print("Average amount of tokens in a recipe: " + str(statistics.mean(tokens_per_recipe)))
    print("Median amount of tokens in a recipe: " + str(statistics.median(tokens_per_recipe)))
    # print(tokens_per_recipe)
    sentences_per_recipe.sort()
    print("Smallest number of sentences in a recipe: " + str(sentences_per_recipe[0]))
    print("Largest number of sentences in a recipe: " + str(sentences_per_recipe[len(sentences_per_recipe)-1]))
    print("Average number of sentences in a recipe: " + str(statistics.mean(sentences_per_recipe)))
    print("Median number of sentences in a recipe: " + str(statistics.median(sentences_per_recipe)))
    # print(sentences_per_recipe)
def make_dataset3(tokenizer, data_path="data/", out_path="data/complete_dataset.json"):
    """Re-segment recipe sentences, gluing very short fragments (<= 5 chars) onto a neighbour.

    Variant 3: every (merged) sentence becomes its own datapoint. Fragments of
    at most 5 characters — presumably sentence-splitter artefacts — are merged
    onto the FOLLOWING sentence when the previous sentence looked properly
    terminated ("." / "!"), otherwise onto the PREVIOUS datapoint. Writes a
    mapping recipe-id -> list of datapoint strings to *out_path* and prints
    token/sentence statistics.

    Parameters:
        tokenizer: BERT-style tokenizer exposing ``tokenize(text) -> list[str]``.
        data_path: directory containing ``cleaned_sep_sentences_not_empty.json``.
        out_path: destination JSON file for the re-segmented dataset.
    """
    with open(data_path + 'cleaned_sep_sentences_not_empty.json', "r") as whole_sep_json_file:
        steps_dataset = json.load(whole_sep_json_file)
    all_datapoints = {}       # recipe id -> list of datapoint strings
    tokens_per_step = []      # token count of every emitted datapoint
    tokens_per_recipe = []    # total token count per recipe
    # NOTE(review): sentences_per_recipe receives TWO entries per recipe below
    # (raw count AND merged count), so its statistics mix both — confirm intended.
    sentences_per_recipe = []
    recipe_nr = 1
    for recipe in steps_dataset.keys():
        print(recipe_nr)  # progress indicator
        recipe_nr += 1
        recipe_list = []
        nr_sent = 0
        sentences_per_recipe.append(len(steps_dataset[recipe]))
        nr_recipe_tokens = 0
        add_part = False   # True while a short fragment is pending to prepend to the next sentence
        prev_sentence = ""
        for step in steps_dataset[recipe]:
            if len(step) <= 5:
                # Previous sentence ended with "."/"!" (possibly followed by a space)
                # and is not itself just punctuation: the fragment starts something
                # new, so hold it for prepending to the next full sentence.
                if ((len(prev_sentence) > 0 and (prev_sentence[-1] == "." or prev_sentence[-1] == "!")) or (len(prev_sentence) > 1 and (prev_sentence[-2:] == ". " or prev_sentence[-2:] == "! "))) and not (prev_sentence == "." or prev_sentence == "!" or prev_sentence == ". " or prev_sentence == "! "):
                    if add_part:
                        prev_sentence += " " + step  # extend the already-pending fragment
                    add_part = True
                    # NOTE(review): when no fragment was pending, `step` is never stored
                    # in prev_sentence here, so the next full sentence gets the PREVIOUS
                    # sentence prepended instead of the fragment — verify against intent.
                else:
                    # Previous sentence looks unterminated: glue the fragment onto the
                    # most recent datapoint instead.
                    if not len(recipe_list) == 0:
                        recipe_list[len(recipe_list)-1] += " " + step
                    else:
                        recipe_list.append(step)
                    add_part = False
                    prev_sentence = step
            else:
                # Normal-length sentence: emit it (with any pending fragment) as a datapoint.
                if add_part:
                    curr_step = prev_sentence + " " + step
                    add_part = False
                else:
                    curr_step = step
                recipe_list.append(curr_step)
                step_tok = tokenizer.tokenize("[CLS] " + curr_step + " [SEP]")
                step_len = len(step_tok)
                nr_recipe_tokens += step_len
                prev_sentence = step
                tokens_per_step.append(step_len)
                nr_sent += 1
        # NOTE(review): a fragment still pending (add_part True) at recipe end is dropped — confirm.
        sentences_per_recipe.append(nr_sent)
        all_datapoints[recipe] = recipe_list
        tokens_per_recipe.append(nr_recipe_tokens)
    with open(out_path, "w") as whole_dataset:
        json.dump(all_datapoints, whole_dataset, ensure_ascii=False, indent=4)
    # Number of tokens in a single step/sentence
    tokens_per_step.sort()
    print("Smallest amount of tokens in a step: " + str(tokens_per_step[0]))
    print("Largest amount of tokens in a step: " + str(tokens_per_step[len(tokens_per_step)-1]))
    print("Average amount of tokens in a step: " + str(statistics.mean(tokens_per_step)))
    print("Median amount of tokens in a step: " + str(statistics.median(tokens_per_step)))
    # print(tokens_per_step)
    tokens_per_recipe.sort()
    print("Smallest amount of tokens in a recipe: " + str(tokens_per_recipe[0]))
    print("Largest amount of tokens in a recipe: " + str(tokens_per_recipe[len(tokens_per_recipe)-1]))
    print("Average amount of tokens in a recipe: " + str(statistics.mean(tokens_per_recipe)))
    print("Median amount of tokens in a recipe: " + str(statistics.median(tokens_per_recipe)))
    # print(tokens_per_recipe)
    sentences_per_recipe.sort()
    print("Smallest number of sentences in a recipe: " + str(sentences_per_recipe[0]))
    print("Largest number of sentences in a recipe: " + str(sentences_per_recipe[len(sentences_per_recipe)-1]))
    print("Average number of sentences in a recipe: " + str(statistics.mean(sentences_per_recipe)))
    print("Median number of sentences in a recipe: " + str(statistics.median(sentences_per_recipe)))
    # print(sentences_per_recipe)
def check_dataset(tokenizer, file_path="data/complete_dataset.json"):
    """Print min/max/mean/median token counts over all datapoints in a dataset JSON file.

    Parameters:
        tokenizer: BERT-style tokenizer exposing ``tokenize(text) -> list[str]``.
        file_path: JSON file mapping recipe-id -> list of datapoint strings.
    """
    with open(file_path, "r") as dataset_file:
        dataset = json.load(dataset_file)
    step_lengths = []
    # Walk every datapoint of every recipe, logging progress every 10k recipes.
    for idx, recipe_key in enumerate(dataset, start=1):
        if idx % 10000 == 0:
            print(idx)
        for datapoint in dataset[recipe_key]:
            step_lengths.append(len(tokenizer.tokenize(datapoint)))
    # Number of tokens in a single step/sentence
    step_lengths.sort()
    print("Smallest amount of tokens in a step: " + str(step_lengths[0]))
    print("Largest amount of tokens in a step: " + str(step_lengths[len(step_lengths) - 1]))
    print("Average amount of tokens in a step: " + str(statistics.mean(step_lengths)))
    print("Median amount of tokens in a step: " + str(statistics.median(step_lengths)))
def format_dataset(input_path, out_path="data/model_datapoints.txt"):
    """Flatten a recipe->datapoints JSON into one plain-text file, one datapoint per line.

    Parameters:
        input_path: JSON file mapping recipe-id -> list of datapoint strings.
        out_path: destination text file; recipes are written back to back.
    """
    with open(input_path, "r") as src_file:
        dataset = json.load(src_file)
    # One newline-joined chunk per recipe, then join the chunks with newlines too.
    recipe_chunks = ["\n".join(dataset[recipe_key]) for recipe_key in dataset.keys()]
    with open(out_path, "w") as dst_file:
        dst_file.write("\n".join(recipe_chunks))
def extract_instructions_from_recipes(needed_recipes):
    """Flatten a list of recipes into a single list of instructions.

    Idiom fix: replaces the manual nested append loop with a flattening
    comprehension; order and content are identical to the original.

    Parameters:
        needed_recipes: iterable of recipes, each an iterable of instruction strings.

    Returns:
        list: all instructions of all recipes, in recipe order.
    """
    return [instruction for recipe in needed_recipes for instruction in recipe]
def split_dataset(input_path, training_path, testing_path):
    """Split the packed dataset into training/testing instruction files (99% / 1%).

    Uses a fixed random_state so the shuffle is reproducible across runs.

    Parameters:
        input_path: JSON file mapping recipe-id -> list of instruction strings.
        training_path: destination text file for the training instructions.
        testing_path: destination text file for the testing instructions.
    """
    with open(input_path, "r") as dataset_file:
        whole_dataset = json.load(dataset_file)
    # Recipes in insertion order, one list of instructions per recipe.
    all_recipes = [whole_dataset[recipe_key] for recipe_key in whole_dataset.keys()]
    train_recipes, test_recipes = train_test_split(all_recipes, test_size=0.01, shuffle=True, random_state=42)
    train_instructions = extract_instructions_from_recipes(train_recipes)
    test_instructions = extract_instructions_from_recipes(test_recipes)
    print(f'Train Instructions: {len(train_instructions)}\n'
          f'Test Instructions: {len(test_instructions)}')
    with open(training_path, "w") as train_file:
        train_file.write('\n'.join(train_instructions))
    with open(testing_path, "w") as test_file:
        test_file.write('\n'.join(test_instructions))
def make_complete_dataset(whole_dataset_path, steps_path, out_path):
    """Attach the re-segmented instruction lists to the full recipe metadata and write the result.

    Parameters:
        whole_dataset_path: JSON file mapping recipe-id -> recipe metadata dict.
        steps_path: JSON file mapping recipe-id -> list of instruction strings.
        out_path: destination JSON file for the combined dataset.
    """
    with open(whole_dataset_path, "r") as metadata_file:
        combined = json.load(metadata_file)
    with open(steps_path, "r") as steps_file:
        steps_by_recipe = json.load(steps_file)
    # Mutate each metadata dict in place, overwriting its 'instructions' entry.
    for recipe_key, recipe_entry in combined.items():
        recipe_entry['instructions'] = steps_by_recipe[recipe_key]
    with open(out_path, "w") as result_file:
        json.dump(combined, result_file, ensure_ascii=False, indent=4)
def main():
    """Build the packed recipe dataset end-to-end.

    Loads the custom vocabulary, constructs a German-BERT tokenizer (with
    ingredient tokens protected from wordpiece splitting), re-segments the
    recipe sentences via variant 3, prints statistics, writes the flat
    datapoint file, attaches the new datapoints to the full dataset, and
    produces train/test instruction splits under ``data/``.
    """
    vocab_path = "train_model/vocab/"
    data_path = "data/"
    cache_dir = None
    config = AutoConfig.from_pretrained("bert-base-german-cased", cache_dir=cache_dir)
    # Ingredient tokens that the wordpiece tokenizer must never split.
    with open(vocab_path + "used_ingredients.json", "r") as used_ingredients_file:
        used_ingredients = json.load(used_ingredients_file)
    tokenizer = BertTokenizer(vocab_file=vocab_path + 'vocab.txt', do_lower_case=False, max_len=512, never_split=used_ingredients)
    model = AutoModelForMaskedLM.from_pretrained(
        "bert-base-german-cased",
        from_tf=bool(".ckpt" in "bert-base-german-cased"),  # always False for this hub model name
        config=config,
        cache_dir=cache_dir,
    )
    # Grow the embedding matrix to cover the extended vocabulary.
    # NOTE(review): the model is never used after this call — presumably a
    # leftover from the training script this was copied from; confirm.
    model.resize_token_embeddings(len(tokenizer))
    revised_dataset_path = data_path + "complete_dataset.json"
    # combine sentences into datapoints
    ## ADAPT WHICH DATASET VERSION TO MAKE!
    make_dataset3(tokenizer, out_path=revised_dataset_path)
    # get statistics for datapoints
    check_dataset(tokenizer, file_path=revised_dataset_path)
    # make list of datapoints
    format_dataset(input_path=revised_dataset_path, out_path="data/model_datapoints.txt")
    # change dataset to have new datapoints as ingredients
    make_complete_dataset(whole_dataset_path=data_path+"dataset_cleaned_steps_not_empty.json", steps_path=revised_dataset_path, out_path=data_path+"full_dataset.json")
    split_dataset(revised_dataset_path, data_path + "training_data.txt", data_path + "testing_data.txt")


if __name__ == '__main__':
    main()