from pathlib import Path
from transformers import AutoConfig, BertTokenizer, AutoModelForMaskedLM
import json
import statistics
from sklearn.model_selection import train_test_split


def _load_json(path):
    """Load and return the JSON content of *path*."""
    with open(path, "r") as f:
        return json.load(f)


def _dump_json(obj, path):
    """Write *obj* to *path* as pretty-printed, non-ASCII-escaped JSON."""
    with open(path, "w") as f:
        json.dump(obj, f, ensure_ascii=False, indent=4)


def _print_stats(values, label):
    """Sort *values* in place and print its min / max / mean / median.

    *label* is the human-readable description used in the four output
    lines, e.g. "amount of tokens in a step". Output is identical to the
    print blocks this helper replaces.
    """
    values.sort()
    print(f"Smallest {label}: {values[0]}")
    print(f"Largest {label}: {values[-1]}")
    print(f"Average {label}: {statistics.mean(values)}")
    print(f"Median {label}: {statistics.median(values)}")


def make_dataset(tokenizer, data_path="data/", out_path="data/complete_dataset.json"):
    """First dataset variant: pack each recipe's sentences into datapoints
    of at most ~512 wordpiece tokens, joining sentences with " [SEP]".

    Reads ``cleaned_sep_sentences_not_empty.json`` (recipe id -> list of
    sentences) from *data_path*, writes the packed datapoints to *out_path*
    as JSON, and prints token/sentence statistics.

    NOTE(review): this variant is not called by ``main`` (which uses
    ``make_dataset3``); its logic is preserved as-is, including two
    suspicious spots flagged inline.
    """
    weird_step = ""
    steps_dataset = _load_json(data_path + 'cleaned_sep_sentences_not_empty.json')
    all_datapoints = {}
    tokens_per_step = []
    tokens_per_recipe = []
    sentences_per_recipe = []
    for recipe_nr, (recipe, steps) in enumerate(steps_dataset.items(), start=1):
        print(recipe_nr)
        recipe_list = []
        curr_step = ""
        nr_class_tokens = 1  # accounts for the single [CLS] token
        curr_step_len = nr_class_tokens
        nr_sent = 0
        sentences_per_recipe.append(len(steps))
        nr_recipe_tokens = 0
        entered = False  # True when the last sentence opened a fresh datapoint
        for step in steps:
            entered = False
            step_tok = tokenizer.tokenize("[CLS] " + step + " [SEP]")
            step_len = len(step_tok)
            tokens_per_step.append(step_len)
            if curr_step_len + (step_len - nr_class_tokens - 1) < 513:  # actually 512
                # Sentence still fits: append it to the current datapoint.
                # [CLS] is not re-added, hence the nr_class_tokens discount.
                curr_step += " " + step + " [SEP]"
                curr_step_len += step_len - nr_class_tokens
                nr_sent += 1
                entered = False
            else:
                # Datapoint full: drop the trailing " [SEP]" (6 chars),
                # flush it, and start the next datapoint with this sentence.
                # NOTE(review): the fresh curr_step carries no " [SEP]", so
                # if it overflows again immediately, [:-6] below would chop
                # sentence text — only possible for a single >512-token
                # sentence; confirm against the data.
                curr_step = curr_step[:-6]
                curr_step_len -= 1
                recipe_list.append(curr_step)
                nr_recipe_tokens += curr_step_len
                curr_step = step
                curr_step_len = step_len
                nr_sent = 1
                entered = True
        if not entered:
            # Flush the datapoint that was still being filled.
            # NOTE(review): when entered is True the datapoint holding the
            # last sentence is never flushed and is lost — looks like a bug,
            # preserved because this variant is unused; confirm intent.
            curr_step = curr_step[:-6]
            curr_step_len -= 1
            recipe_list.append(curr_step)
            nr_recipe_tokens += curr_step_len
        tokens_per_recipe.append(nr_recipe_tokens)
        all_datapoints[recipe] = recipe_list
    _dump_json(all_datapoints, out_path)
    # Statistics over single steps, whole recipes, and sentence counts.
    _print_stats(tokens_per_step, "amount of tokens in a step")
    _print_stats(tokens_per_recipe, "amount of tokens in a recipe")
    _print_stats(sentences_per_recipe, "number of sentences in a recipe")
    # weird_step is never reassigned (the detection code was commented out
    # in the original), so this prints an empty line; kept for output parity.
    print(weird_step)


def make_dataset2(tokenizer, data_path="data/", out_path="data/complete_dataset.json"):
    """Second dataset variant: like ``make_dataset`` but without " [SEP]"
    joiners between sentences; budgets for both [CLS] and [SEP].

    Reads the sentence file from *data_path*, writes packed datapoints to
    *out_path* and prints token/sentence statistics.

    NOTE(review): not called by ``main``; logic preserved as-is.
    """
    steps_dataset = _load_json(data_path + 'cleaned_sep_sentences_not_empty.json')
    all_datapoints = {}
    tokens_per_step = []
    tokens_per_recipe = []
    sentences_per_recipe = []
    for recipe_nr, (recipe, steps) in enumerate(steps_dataset.items(), start=1):
        print(recipe_nr)
        recipe_list = []
        curr_step = ""
        nr_class_tokens = 2  # accounts for [CLS] and [SEP]
        curr_step_len = nr_class_tokens
        nr_sent = 0
        sentences_per_recipe.append(len(steps))
        nr_recipe_tokens = 0
        entered = False  # True when the last sentence opened a fresh datapoint
        for step in steps:
            entered = False
            step_tok = tokenizer.tokenize("[CLS] " + step + " [SEP]")
            step_len = len(step_tok)
            tokens_per_step.append(step_len)
            if curr_step_len + (step_len - nr_class_tokens) < 513:  # actually 512
                # Sentence still fits: append it to the current datapoint
                # (special tokens are counted once, hence the discount).
                curr_step += " " + step
                curr_step_len += step_len - nr_class_tokens
                nr_sent += 1
                entered = False
            else:
                # Datapoint full: flush it and start the next one.
                recipe_list.append(curr_step)
                nr_recipe_tokens += curr_step_len
                curr_step = step
                curr_step_len = step_len
                nr_sent = 1
                entered = True
        if not entered:
            # Flush the datapoint that was still being filled.
            # NOTE(review): as in make_dataset, when entered is True the
            # final datapoint is lost — preserved; confirm intent.
            recipe_list.append(curr_step)
            nr_recipe_tokens += curr_step_len
        tokens_per_recipe.append(nr_recipe_tokens)
        all_datapoints[recipe] = recipe_list
    _dump_json(all_datapoints, out_path)
    _print_stats(tokens_per_step, "amount of tokens in a step")
    _print_stats(tokens_per_recipe, "amount of tokens in a recipe")
    _print_stats(sentences_per_recipe, "number of sentences in a recipe")


def make_dataset3(tokenizer, data_path="data/", out_path="data/complete_dataset.json"):
    """Third dataset variant (the one ``main`` uses): one datapoint per
    sentence, merging very short fragments (<= 5 chars) into a neighbour.

    A fragment that follows a finished sentence ("." / "!") is held back
    and prepended to the next sentence; otherwise it is appended to the
    previous datapoint. Writes datapoints to *out_path* and prints
    token/sentence statistics.
    """
    steps_dataset = _load_json(data_path + 'cleaned_sep_sentences_not_empty.json')
    all_datapoints = {}
    tokens_per_step = []
    tokens_per_recipe = []
    sentences_per_recipe = []
    for recipe_nr, (recipe, steps) in enumerate(steps_dataset.items(), start=1):
        print(recipe_nr)
        recipe_list = []
        nr_sent = 0
        # NOTE(review): both the raw sentence count (here) and the merged
        # count (after the loop) are appended, so each recipe contributes
        # two entries to the sentence statistics — confirm this is intended.
        sentences_per_recipe.append(len(steps))
        nr_recipe_tokens = 0
        add_part = False    # a fragment is pending for the next sentence
        prev_sentence = ""  # last text seen, used to detect sentence ends
        for step in steps:
            if len(step) <= 5:
                # Does the previously seen text end a sentence (and is not
                # itself just a bare terminator)?
                prev_ends_sentence = (
                    (len(prev_sentence) > 0 and prev_sentence[-1] in (".", "!"))
                    or (len(prev_sentence) > 1 and prev_sentence[-2:] in (". ", "! "))
                ) and prev_sentence not in (".", "!", ". ", "! ")
                if prev_ends_sentence:
                    # Hold the fragment back and prepend it to the next
                    # sentence. BUGFIX: the first pending fragment now
                    # replaces prev_sentence; previously it was dropped and
                    # the already-emitted previous sentence was duplicated
                    # into the next datapoint.
                    if add_part:
                        prev_sentence += " " + step
                    else:
                        prev_sentence = step
                    add_part = True
                else:
                    # Continuation fragment: glue it onto the previous
                    # datapoint (or open the recipe with it).
                    if recipe_list:
                        recipe_list[-1] += " " + step
                    else:
                        recipe_list.append(step)
                    add_part = False
                    prev_sentence = step
            else:
                if add_part:
                    curr_step = prev_sentence + " " + step
                    add_part = False
                else:
                    curr_step = step
                recipe_list.append(curr_step)
                step_len = len(tokenizer.tokenize("[CLS] " + curr_step + " [SEP]"))
                nr_recipe_tokens += step_len
                prev_sentence = step
                tokens_per_step.append(step_len)
                nr_sent += 1
        sentences_per_recipe.append(nr_sent)
        all_datapoints[recipe] = recipe_list
        tokens_per_recipe.append(nr_recipe_tokens)
    _dump_json(all_datapoints, out_path)
    _print_stats(tokens_per_step, "amount of tokens in a step")
    _print_stats(tokens_per_recipe, "amount of tokens in a recipe")
    _print_stats(sentences_per_recipe, "number of sentences in a recipe")


def check_dataset(tokenizer, file_path="data/complete_dataset.json"):
    """Re-tokenize every datapoint in *file_path* and print token-count
    statistics, to verify no datapoint exceeds the model's length limit."""
    dataset = _load_json(file_path)
    tokens_per_step = []
    for recipe_nr, recipe in enumerate(dataset, start=1):
        if recipe_nr % 10000 == 0:  # progress indicator only
            print(recipe_nr)
        for step in dataset[recipe]:
            tokens_per_step.append(len(tokenizer.tokenize(step)))
    _print_stats(tokens_per_step, "amount of tokens in a step")


def format_dataset(input_path, out_path="data/model_datapoints.txt"):
    """Flatten the JSON dataset at *input_path* into a plain-text file with
    one datapoint per line (recipes separated only by line breaks)."""
    dataset = _load_json(input_path)
    recipe_data = ["\n".join(steps) for steps in dataset.values()]
    with open(out_path, "w") as whole_dataset:
        whole_dataset.write("\n".join(recipe_data))


def extract_instructions_from_recipes(needed_recipes):
    """Return a flat list of all instructions from *needed_recipes*
    (an iterable of instruction lists)."""
    return [instruction for recipe in needed_recipes for instruction in recipe]


def split_dataset(input_path, training_path, testing_path):
    """Split the recipes in *input_path* into train/test instruction files.

    The split is at recipe level (99% / 1%, shuffled, fixed seed 42) so no
    recipe's instructions leak across the split; the flattened instructions
    are written one per line to *training_path* and *testing_path*.
    """
    whole_dataset = _load_json(input_path)
    all_recipes = list(whole_dataset.values())
    train_recipes, test_recipes = train_test_split(
        all_recipes, test_size=0.01, shuffle=True, random_state=42)
    train_instructions = extract_instructions_from_recipes(train_recipes)
    test_instructions = extract_instructions_from_recipes(test_recipes)
    print(f'Train Instructions: {len(train_instructions)}\n'
          f'Test Instructions: {len(test_instructions)}')
    with open(training_path, "w") as f:
        f.write('\n'.join(train_instructions))
    with open(testing_path, "w") as f:
        f.write('\n'.join(test_instructions))


def make_complete_dataset(whole_dataset_path, steps_path, out_path):
    """Merge the revised datapoints into the full recipe dataset.

    For every recipe in *whole_dataset_path*, replace its 'instructions'
    field with the datapoints from *steps_path*, and write the result to
    *out_path*. Raises KeyError if a recipe is missing from *steps_path*.
    """
    whole_dataset = _load_json(whole_dataset_path)
    all_steps = _load_json(steps_path)
    for recipe in whole_dataset:
        whole_dataset[recipe]['instructions'] = all_steps[recipe]
    _dump_json(whole_dataset, out_path)


def main():
    """Build, check, format, merge and split the recipe datapoint dataset."""
    vocab_path = "train_model/vocab/"
    data_path = "data/"
    cache_dir = None
    config = AutoConfig.from_pretrained("bert-base-german-cased", cache_dir=cache_dir)
    with open(vocab_path + "used_ingredients.json", "r") as used_ingredients_file:
        used_ingredients = json.load(used_ingredients_file)
    # Custom vocab; ingredient names must never be split into wordpieces.
    tokenizer = BertTokenizer(vocab_file=vocab_path + 'vocab.txt', do_lower_case=False,
                              max_len=512, never_split=used_ingredients)
    # NOTE(review): the MLM model is loaded and resized but never used by
    # the dataset steps below — presumably left over from the training
    # script; confirm whether it can be removed.
    model = AutoModelForMaskedLM.from_pretrained(
        "bert-base-german-cased",
        from_tf=bool(".ckpt" in "bert-base-german-cased"),
        config=config,
        cache_dir=cache_dir,
    )
    model.resize_token_embeddings(len(tokenizer))
    revised_dataset_path = data_path + "complete_dataset.json"
    # Combine sentences into datapoints. ADAPT WHICH DATASET VERSION TO MAKE!
    make_dataset3(tokenizer, out_path=revised_dataset_path)
    # Get statistics for the generated datapoints.
    check_dataset(tokenizer, file_path=revised_dataset_path)
    # Make a flat text list of datapoints.
    format_dataset(input_path=revised_dataset_path, out_path="data/model_datapoints.txt")
    # Change the dataset to carry the new datapoints as instructions.
    make_complete_dataset(whole_dataset_path=data_path + "dataset_cleaned_steps_not_empty.json",
                          steps_path=revised_dataset_path,
                          out_path=data_path + "full_dataset.json")
    split_dataset(revised_dataset_path,
                  data_path + "training_data.txt",
                  data_path + "testing_data.txt")


if __name__ == '__main__':
    main()