374 lines
15 KiB
Python
374 lines
15 KiB
Python
from pathlib import Path
|
|
|
|
from transformers import AutoConfig, BertTokenizer, AutoModelForMaskedLM
|
|
import json
|
|
import statistics
|
|
from sklearn.model_selection import train_test_split
|
|
|
|
|
|
def make_dataset(tokenizer, data_path="data/", out_path="data/complete_dataset.json"):
    """Greedily pack recipe sentences into <=512-token datapoints (variant 1).

    Reads data_path + 'cleaned_sep_sentences_not_empty.json' (mapping of
    recipe id -> list of sentence strings), concatenates consecutive sentences
    of a recipe — each followed by " [SEP]" — until the BERT token budget is
    reached, writes the resulting {recipe: [datapoint, ...]} mapping to
    out_path as JSON, and prints token/sentence statistics.

    Args:
        tokenizer: BERT-style tokenizer providing .tokenize().
        data_path: directory containing the cleaned sentence file.
        out_path: destination JSON file for the packed datapoints.
    """
    # Left over from earlier debugging (see the commented-out check in the
    # loop below); stays empty on the current code path.
    weird_step = ""

    with open(data_path + 'cleaned_sep_sentences_not_empty.json', "r") as whole_sep_json_file:
        steps_dataset = json.load(whole_sep_json_file)

    all_datapoints = {}        # recipe id -> list of packed datapoint strings
    tokens_per_step = []       # token count of every individual sentence
    tokens_per_recipe = []     # total packed-token count per recipe
    sentences_per_recipe = []  # raw sentence count per recipe
    recipe_nr = 1
    for recipe in steps_dataset.keys():
        print(recipe_nr)  # progress output, one line per recipe
        recipe_nr += 1
        recipe_list = []
        # curr_step = "[CLS]"
        curr_step = ""
        # One special token ([CLS]) is budgeted per datapoint.
        nr_class_tokens = 1
        curr_step_len = nr_class_tokens
        nr_sent = 0
        sentences_per_recipe.append(len(steps_dataset[recipe]))
        nr_recipe_tokens = 0
        entered = False

        for step in steps_dataset[recipe]:
            entered = False
            # Tokenize with the special markers so step_len includes [CLS]/[SEP].
            step_tok = tokenizer.tokenize("[CLS] " + step + " [SEP]")  # + " SEP"
            step_len = len(step_tok)
            # if step_len <= nr_class_tokens:
            #     weird_step = step
            tokens_per_step.append(step_len)
            # add sentence to datapoint
            if curr_step_len + (step_len - nr_class_tokens - 1) < 513:  # actually 512
                curr_step += " " + step + " [SEP]"
                # -2 since not adding CLS and SEP
                curr_step_len += step_len - nr_class_tokens
                nr_sent += 1
                entered = False
            # add sentence to next datapoint
            else:
                # Datapoint is full: strip its trailing " [SEP]" (6 characters
                # / 1 token), store it, and start a new datapoint with the
                # sentence that did not fit.
                # curr_step += " [SEP]"
                curr_step = curr_step[:-6]
                curr_step_len -= 1
                recipe_list.append(curr_step)
                nr_recipe_tokens += curr_step_len
                curr_step = step
                # curr_step = "[CLS] " + step
                curr_step_len = step_len
                nr_sent = 1
                entered = True
        # Flush the last (partial) datapoint of the recipe.
        # NOTE(review): when the final sentence overflowed (entered == True),
        # the freshly started datapoint holding it is never appended here, so
        # that sentence is silently dropped — confirm whether this is intended.
        if not entered:
            # curr_step += " [SEP]"
            curr_step = curr_step[:-6]
            curr_step_len -= 1
            recipe_list.append(curr_step)
            nr_recipe_tokens += curr_step_len

        tokens_per_recipe.append(nr_recipe_tokens)
        all_datapoints[recipe] = recipe_list

    with open(out_path, "w") as whole_dataset:
        json.dump(all_datapoints, whole_dataset, ensure_ascii=False, indent=4)

    # Number of tokens in a single step/sentence
    tokens_per_step.sort()
    print("Smallest amount of tokens in a step: " + str(tokens_per_step[0]))
    print("Largest amount of tokens in a step: " + str(tokens_per_step[len(tokens_per_step)-1]))
    print("Average amount of tokens in a step: " + str(statistics.mean(tokens_per_step)))
    print("Median amount of tokens in a step: " + str(statistics.median(tokens_per_step)))
    # print(tokens_per_step)

    # Number of tokens in a whole recipe
    tokens_per_recipe.sort()
    print("Smallest amount of tokens in a recipe: " + str(tokens_per_recipe[0]))
    print("Largest amount of tokens in a recipe: " + str(tokens_per_recipe[len(tokens_per_recipe)-1]))
    print("Average amount of tokens in a recipe: " + str(statistics.mean(tokens_per_recipe)))
    print("Median amount of tokens in a recipe: " + str(statistics.median(tokens_per_recipe)))
    # print(tokens_per_recipe)

    # Number of sentences in a whole recipe
    sentences_per_recipe.sort()
    print("Smallest number of sentences in a recipe: " + str(sentences_per_recipe[0]))
    print("Largest number of sentences in a recipe: " + str(sentences_per_recipe[len(sentences_per_recipe)-1]))
    print("Average number of sentences in a recipe: " + str(statistics.mean(sentences_per_recipe)))
    print("Median number of sentences in a recipe: " + str(statistics.median(sentences_per_recipe)))
    # print(sentences_per_recipe)

    print(weird_step)
|
|
|
|
|
|
def make_dataset2(tokenizer, data_path="data/", out_path="data/complete_dataset.json"):
    """Greedily pack recipe sentences into <=512-token datapoints (variant 2).

    Like make_dataset, but sentences inside a datapoint are joined with plain
    spaces (no explicit " [SEP]" markers) and both special tokens ([CLS] and
    [SEP]) are budgeted per datapoint. Writes {recipe: [datapoint, ...]} to
    out_path as JSON and prints token/sentence statistics.

    Args:
        tokenizer: BERT-style tokenizer providing .tokenize().
        data_path: directory containing the cleaned sentence file.
        out_path: destination JSON file for the packed datapoints.
    """
    with open(data_path + 'cleaned_sep_sentences_not_empty.json', "r") as whole_sep_json_file:
        steps_dataset = json.load(whole_sep_json_file)

    all_datapoints = {}        # recipe id -> list of packed datapoint strings
    tokens_per_step = []       # token count of every individual sentence
    tokens_per_recipe = []     # total packed-token count per recipe
    sentences_per_recipe = []  # raw sentence count per recipe
    recipe_nr = 1
    for recipe in steps_dataset.keys():
        print(recipe_nr)  # progress output, one line per recipe
        recipe_nr += 1
        recipe_list = []
        # curr_step = "[CLS]"
        curr_step = ""
        # Two special tokens ([CLS] and [SEP]) are budgeted per datapoint.
        nr_class_tokens = 2
        curr_step_len = nr_class_tokens
        nr_sent = 0
        sentences_per_recipe.append(len(steps_dataset[recipe]))
        nr_recipe_tokens = 0
        entered = False

        for step in steps_dataset[recipe]:
            entered = False
            # Tokenize with the special markers so step_len includes [CLS]/[SEP].
            step_tok = tokenizer.tokenize("[CLS] " + step + " [SEP]")  # + " SEP"
            step_len = len(step_tok)
            # if step_len <= nr_class_tokens:
            #     weird_step = step
            tokens_per_step.append(step_len)
            # add sentence to datapoint
            if curr_step_len + (step_len - nr_class_tokens) < 513:  # actually 512
                # NOTE(review): since curr_step starts as "", the first
                # datapoint of each recipe begins with a leading space.
                curr_step += " " + step
                # -2 since not adding CLS and SEP
                curr_step_len += step_len - nr_class_tokens
                nr_sent += 1
                entered = False
            # add sentence to next datapoint
            else:
                # Datapoint is full: store it and start a new one with the
                # sentence that did not fit.
                # curr_step += " [SEP]"
                recipe_list.append(curr_step)
                nr_recipe_tokens += curr_step_len
                curr_step = step
                # curr_step = "[CLS] " + step
                curr_step_len = step_len
                nr_sent = 1
                entered = True
        # Flush the last (partial) datapoint of the recipe.
        # NOTE(review): when the final sentence overflowed (entered == True),
        # the freshly started datapoint holding it is never appended here, so
        # that sentence is silently dropped — confirm whether this is intended.
        if not entered:
            # curr_step += " [SEP]"
            recipe_list.append(curr_step)
            nr_recipe_tokens += curr_step_len

        tokens_per_recipe.append(nr_recipe_tokens)
        all_datapoints[recipe] = recipe_list

    with open(out_path, "w") as whole_dataset:
        json.dump(all_datapoints, whole_dataset, ensure_ascii=False, indent=4)

    # Number of tokens in a single step/sentence
    tokens_per_step.sort()
    print("Smallest amount of tokens in a step: " + str(tokens_per_step[0]))
    print("Largest amount of tokens in a step: " + str(tokens_per_step[len(tokens_per_step)-1]))
    print("Average amount of tokens in a step: " + str(statistics.mean(tokens_per_step)))
    print("Median amount of tokens in a step: " + str(statistics.median(tokens_per_step)))
    # print(tokens_per_step)

    # Number of tokens in a whole recipe
    tokens_per_recipe.sort()
    print("Smallest amount of tokens in a recipe: " + str(tokens_per_recipe[0]))
    print("Largest amount of tokens in a recipe: " + str(tokens_per_recipe[len(tokens_per_recipe)-1]))
    print("Average amount of tokens in a recipe: " + str(statistics.mean(tokens_per_recipe)))
    print("Median amount of tokens in a recipe: " + str(statistics.median(tokens_per_recipe)))
    # print(tokens_per_recipe)

    # Number of sentences in a whole recipe
    sentences_per_recipe.sort()
    print("Smallest number of sentences in a recipe: " + str(sentences_per_recipe[0]))
    print("Largest number of sentences in a recipe: " + str(sentences_per_recipe[len(sentences_per_recipe)-1]))
    print("Average number of sentences in a recipe: " + str(statistics.mean(sentences_per_recipe)))
    print("Median number of sentences in a recipe: " + str(statistics.median(sentences_per_recipe)))
    # print(sentences_per_recipe)
|
|
|
|
|
|
def make_dataset3(tokenizer, data_path="data/", out_path="data/complete_dataset.json"):
    """Build one datapoint per sentence, merging short splitter fragments (variant 3).

    Sentences of <= 5 characters are treated as artifacts of the sentence
    splitter rather than real steps: if the previous sentence already ended in
    '.' or '!', the fragment is held back and prepended to the next full
    sentence; otherwise it is appended to the previously stored datapoint.
    Writes {recipe: [datapoint, ...]} to out_path as JSON and prints
    token/sentence statistics.

    Args:
        tokenizer: BERT-style tokenizer providing .tokenize().
        data_path: directory containing the cleaned sentence file.
        out_path: destination JSON file for the per-sentence datapoints.
    """
    with open(data_path + 'cleaned_sep_sentences_not_empty.json', "r") as whole_sep_json_file:
        steps_dataset = json.load(whole_sep_json_file)

    all_datapoints = {}        # recipe id -> list of datapoint strings
    tokens_per_step = []       # token count per stored datapoint
    tokens_per_recipe = []     # total token count per recipe
    sentences_per_recipe = []  # see NOTE(review) below: mixes two measures
    recipe_nr = 1

    for recipe in steps_dataset.keys():
        print(recipe_nr)  # progress output, one line per recipe
        recipe_nr += 1
        recipe_list = []
        nr_sent = 0
        # NOTE(review): the raw sentence count is appended here AND the merged
        # count (nr_sent) is appended after the loop, so sentences_per_recipe
        # receives two entries per recipe — the printed statistics mix both.
        sentences_per_recipe.append(len(steps_dataset[recipe]))
        nr_recipe_tokens = 0
        add_part = False    # True while a short fragment is pending in prev_sentence
        prev_sentence = ""

        for step in steps_dataset[recipe]:
            if len(step) <= 5:
                # Very short "sentence": a splitter fragment, not a real step.
                # Condition: previous sentence ended with '.'/'!' (optionally
                # followed by a space) and is not itself just a terminator.
                if ((len(prev_sentence) > 0 and (prev_sentence[-1] == "." or prev_sentence[-1] == "!")) or (len(prev_sentence) > 1 and (prev_sentence[-2:] == ". " or prev_sentence[-2:] == "! "))) and not (prev_sentence == "." or prev_sentence == "!" or prev_sentence == ". " or prev_sentence == "! "):
                    # Previous sentence ended properly -> hold this fragment to
                    # prepend it to the next full sentence.
                    if add_part:
                        # NOTE(review): this accumulation of consecutive
                        # fragments is immediately overwritten by the
                        # unconditional `prev_sentence = step` below — confirm
                        # the intended behavior for back-to-back fragments.
                        prev_sentence += " " + step
                    add_part = True
                else:
                    # Previous sentence is unfinished -> glue the fragment onto
                    # the last stored datapoint (or open the list with it).
                    if not len(recipe_list) == 0:
                        recipe_list[len(recipe_list)-1] += " " + step
                    else:
                        recipe_list.append(step)
                    add_part = False
                prev_sentence = step
            else:
                if add_part:
                    # Prefix the pending fragment to this full sentence.
                    curr_step = prev_sentence + " " + step
                    add_part = False
                else:
                    curr_step = step
                recipe_list.append(curr_step)
                # Token count includes the [CLS]/[SEP] special markers.
                step_tok = tokenizer.tokenize("[CLS] " + curr_step + " [SEP]")
                step_len = len(step_tok)
                nr_recipe_tokens += step_len
                prev_sentence = step
                tokens_per_step.append(step_len)
                nr_sent += 1
        # NOTE(review): a fragment still pending (add_part == True) when the
        # recipe ends is discarded — no flush happens here.
        sentences_per_recipe.append(nr_sent)
        all_datapoints[recipe] = recipe_list
        tokens_per_recipe.append(nr_recipe_tokens)

    with open(out_path, "w") as whole_dataset:
        json.dump(all_datapoints, whole_dataset, ensure_ascii=False, indent=4)

    # Number of tokens in a single step/sentence
    tokens_per_step.sort()
    print("Smallest amount of tokens in a step: " + str(tokens_per_step[0]))
    print("Largest amount of tokens in a step: " + str(tokens_per_step[len(tokens_per_step)-1]))
    print("Average amount of tokens in a step: " + str(statistics.mean(tokens_per_step)))
    print("Median amount of tokens in a step: " + str(statistics.median(tokens_per_step)))
    # print(tokens_per_step)

    # Number of tokens in a whole recipe
    tokens_per_recipe.sort()
    print("Smallest amount of tokens in a recipe: " + str(tokens_per_recipe[0]))
    print("Largest amount of tokens in a recipe: " + str(tokens_per_recipe[len(tokens_per_recipe)-1]))
    print("Average amount of tokens in a recipe: " + str(statistics.mean(tokens_per_recipe)))
    print("Median amount of tokens in a recipe: " + str(statistics.median(tokens_per_recipe)))
    # print(tokens_per_recipe)

    # Number of sentences in a whole recipe
    sentences_per_recipe.sort()
    print("Smallest number of sentences in a recipe: " + str(sentences_per_recipe[0]))
    print("Largest number of sentences in a recipe: " + str(sentences_per_recipe[len(sentences_per_recipe)-1]))
    print("Average number of sentences in a recipe: " + str(statistics.mean(sentences_per_recipe)))
    print("Median number of sentences in a recipe: " + str(statistics.median(sentences_per_recipe)))
    # print(sentences_per_recipe)
|
|
|
|
|
|
def check_dataset(tokenizer, file_path="data/complete_dataset.json"):
    """Re-tokenize every datapoint of a packed dataset file and print token statistics.

    Args:
        tokenizer: BERT-style tokenizer providing .tokenize().
        file_path: JSON file mapping recipe id -> list of datapoint strings.
    """
    with open(file_path, "r") as handle:
        dataset = json.load(handle)

    # Token count of each datapoint across all recipes.
    tokens_per_step = []

    for recipe_nr, datapoints in enumerate(dataset.values(), start=1):
        if recipe_nr % 10000 == 0:
            print(recipe_nr)  # progress output on large datasets
        for datapoint in datapoints:
            tokens_per_step.append(len(tokenizer.tokenize(datapoint)))

    # Number of tokens in a single step/sentence
    ordered = sorted(tokens_per_step)
    print("Smallest amount of tokens in a step: " + str(ordered[0]))
    print("Largest amount of tokens in a step: " + str(ordered[-1]))
    print("Average amount of tokens in a step: " + str(statistics.mean(ordered)))
    print("Median amount of tokens in a step: " + str(statistics.median(ordered)))
|
|
|
|
|
|
def format_dataset(input_path, out_path="data/model_datapoints.txt"):
    """Flatten a {recipe: [datapoint, ...]} JSON file into one datapoint per line.

    Recipe order and datapoint order are preserved; since each recipe's
    datapoints are newline-joined and the recipes are newline-joined in turn,
    the output is simply every datapoint on its own line.

    Args:
        input_path: source JSON file (recipe id -> list of datapoint strings).
        out_path: destination plain-text file.
    """
    with open(input_path, "r") as src:
        dataset = json.load(src)
    chunks = ["\n".join(datapoints) for datapoints in dataset.values()]
    with open(out_path, "w") as dst:
        dst.write("\n".join(chunks))
|
|
|
|
|
|
def extract_instructions_from_recipes(needed_recipes):
    """Flatten a list of recipes into a single flat list of instructions.

    Args:
        needed_recipes: iterable of recipes, each itself an iterable of
            instruction strings.

    Returns:
        list: all instructions in original order (recipe order, then
        instruction order within each recipe).
    """
    # Idiomatic flatten; replaces the manual nested append loop.
    return [instruction for recipe in needed_recipes for instruction in recipe]
|
|
|
|
|
|
|
|
def split_dataset(input_path, training_path, testing_path):
    """Split the packed dataset into train/test instruction files (99% / 1%).

    Recipes (not individual instructions) are shuffled and split, then each
    side is flattened to one instruction per line. The fixed random_state
    keeps the split reproducible across runs.

    Args:
        input_path: JSON file mapping recipe id -> list of datapoint strings.
        training_path: destination text file for training instructions.
        testing_path: destination text file for testing instructions.
    """
    with open(input_path, "r") as f:
        whole_dataset = json.load(f)

    # Each value is one recipe: a list of datapoint strings.
    all_recipes = list(whole_dataset.values())

    train_recipes, test_recipes = train_test_split(all_recipes, test_size=0.01, shuffle=True, random_state=42)

    train_instructions = extract_instructions_from_recipes(train_recipes)
    test_instructions = extract_instructions_from_recipes(test_recipes)

    print(f'Train Instructions: {len(train_instructions)}\n'
          f'Test Instructions: {len(test_instructions)}')

    with open(training_path, "w") as f:
        f.write('\n'.join(train_instructions))

    with open(testing_path, "w") as f:
        f.write('\n'.join(test_instructions))
|
|
|
|
def make_complete_dataset(whole_dataset_path, steps_path, out_path):
    """Attach the regrouped datapoints to the full recipe dataset.

    For every recipe in the full dataset, the entry's 'instructions' field is
    set (or overwritten) with that recipe's datapoint list from steps_path;
    the merged result is written to out_path. Raises KeyError if a recipe id
    is missing from the steps file.

    Args:
        whole_dataset_path: JSON file of recipe id -> recipe record (dict).
        steps_path: JSON file of recipe id -> list of datapoint strings.
        out_path: destination JSON file for the merged dataset.
    """
    with open(whole_dataset_path, "r") as handle:
        whole_dataset = json.load(handle)

    with open(steps_path, "r") as handle:
        all_steps = json.load(handle)

    for recipe_id, record in whole_dataset.items():
        record['instructions'] = all_steps[recipe_id]

    with open(out_path, "w") as handle:
        json.dump(whole_dataset, handle, ensure_ascii=False, indent=4)
|
|
|
|
def main():
    """Run the full dataset-building pipeline with the project tokenizer.

    Pipeline (paths are relative to the repository root):
      1. build datapoints from the cleaned sentences (make_dataset3),
      2. print token statistics for the result (check_dataset),
      3. dump the datapoints as a flat text file (format_dataset),
      4. attach the datapoints to the full recipe dataset (make_complete_dataset),
      5. split into training/testing instruction files (split_dataset).
    """
    vocab_path = "train_model/vocab/"
    data_path = "data/"
    cache_dir = None
    # Base German BERT configuration; the model below is loaded with it only
    # so its embeddings can be resized to the custom vocabulary.
    config = AutoConfig.from_pretrained("bert-base-german-cased", cache_dir=cache_dir)

    # Custom vocabulary plus ingredient tokens that must never be split.
    with open(vocab_path + "used_ingredients.json", "r") as used_ingredients_file:
        used_ingredients = json.load(used_ingredients_file)
    tokenizer = BertTokenizer(vocab_file=vocab_path + 'vocab.txt', do_lower_case=False, max_len=512, never_split=used_ingredients)

    model = AutoModelForMaskedLM.from_pretrained(
        "bert-base-german-cased",
        # Always False for this fixed model name; kept from a generic template.
        from_tf=bool(".ckpt" in "bert-base-german-cased"),
        config=config,
        cache_dir=cache_dir,
    )

    # NOTE(review): the resized model is neither used nor saved anywhere in
    # this script — presumably copied from a training script; confirm whether
    # loading it here can be dropped.
    model.resize_token_embeddings(len(tokenizer))

    revised_dataset_path = data_path + "complete_dataset.json"

    # combine sentences into datapoints
    ## ADAPT WHICH DATASET VERSION TO MAKE!
    make_dataset3(tokenizer, out_path=revised_dataset_path)

    # get statistics for datapoints
    check_dataset(tokenizer, file_path=revised_dataset_path)

    # make list of datapoints
    format_dataset(input_path=revised_dataset_path, out_path="data/model_datapoints.txt")

    # change dataset to have new datapoints as ingredients
    make_complete_dataset(whole_dataset_path=data_path+"dataset_cleaned_steps_not_empty.json", steps_path=revised_dataset_path, out_path=data_path+"full_dataset.json")

    split_dataset(revised_dataset_path, data_path + "training_data.txt", data_path + "testing_data.txt")
|
|
|
|
|
|
# Script entry point: run the whole dataset-building pipeline.
if __name__ == '__main__':
    main()
|