import json

# Directory holding the ingredient data files.
data_path = "data/"
# Directory where vocabulary artifacts (vocab.txt, used_ingredients.json) are written.
vocab_path = "train_model/vocab/"


def make_vocab_from_tokenizer(base_vocab):
    """Write every token of *base_vocab* to vocab.txt, one token per line.

    Overwrites any existing vocab.txt.
    """
    # encoding is pinned: tokens may contain non-ASCII (German) characters
    with open(vocab_path + "vocab.txt", "w", encoding="utf-8") as vocab_file:
        for word in base_vocab:
            vocab_file.write(word + "\n")


def check_words_in_vocab(tokenizer):
    """Return the ingredient names missing from the tokenizer's vocabulary.

    Side effects: dumps the full ingredient key list to used_ingredients.json,
    prints every ingredient already present in the vocab, and prints how many
    new words will be added.

    tokenizer -- parsed tokenizer JSON; vocab lives at tokenizer["model"]["vocab"]
    """
    ingredient_path = "mult_ingredients_nice.json"
    with open(data_path + ingredient_path, "r", encoding="utf-8") as ingr_json_file:
        ingredients = json.load(ingr_json_file)

    vocab = tokenizer["model"]["vocab"]
    new_words = []
    for ingr in ingredients:  # a dict iterates over its keys
        if ingr not in vocab:
            new_words.append(ingr)
        else:
            print(ingr)

    # Persist which ingredients were considered; ensure_ascii=False keeps
    # umlauts readable, hence the explicit UTF-8 encoding.
    with open(vocab_path + "used_ingredients.json", "w", encoding="utf-8") as used_ingredients_file:
        json.dump(list(ingredients.keys()), used_ingredients_file,
                  ensure_ascii=False, indent=4)

    print(str(len(new_words)) + " words to be added to vocab")
    return new_words


def add_words_to_vocab(new_words):
    """Append *new_words* to vocab.txt, one per line, after a separating newline."""
    with open(vocab_path + "vocab.txt", "a", encoding="utf-8") as vocab_file:
        vocab_file.write("\n")
        vocab_file.write("\n".join(new_words))


def create_base_vocab(tokenizer):
    """(Re)create vocab.txt from the tokenizer's own vocabulary.

    Note: opened with "w" (was "a") so reruns start from a clean file instead
    of appending duplicate tokens to a stale vocab.txt.
    """
    with open(vocab_path + "vocab.txt", "w", encoding="utf-8") as vocab_file:
        vocab_file.write("\n".join(tokenizer["model"]["vocab"].keys()))


def check_existing(tokenizer):
    """Diagnostic: print each ingredient already in the vocab and the total count.

    Mirrors check_words_in_vocab but reports the overlap instead of returning
    the missing words.
    """
    ingredient_path = "mult_ingredients_nice.json"
    with open(data_path + ingredient_path, "r", encoding="utf-8") as ingr_json_file:
        ingredients = json.load(ingr_json_file)

    vocab = tokenizer["model"]["vocab"]
    old_word_count = 0
    for ingr in ingredients:
        if ingr in vocab:
            print(ingr)
            old_word_count += 1
    print(old_word_count)


def main():
    """Load the tokenizer JSON, rebuild the base vocab, and append new ingredients."""
    tokenizer_path = "train_model/vocab/bert-base-german-cased_tokenizer.json"
    with open(tokenizer_path, "r", encoding="utf-8") as whole_json_file:
        tokenizer = json.load(whole_json_file)

    new_words = check_words_in_vocab(tokenizer)
    create_base_vocab(tokenizer)
    add_words_to_vocab(new_words)


# Guard the entry point so importing this module does not trigger file I/O.
if __name__ == "__main__":
    main()