Files
MasterarbeitCode/train_model/make_new_vocab.py
2021-04-11 23:28:41 +02:00

82 lines
2.3 KiB
Python

import json
data_path = "data/"
vocab_path = "train_model/vocab/"
def make_vocab_from_tokenizer(base_vocab):
    """Write every token in *base_vocab* to vocab.txt, one token per line.

    Args:
        base_vocab: iterable of token strings (e.g. tokenizer vocab keys).
    """
    lines = [word + "\n" for word in base_vocab]
    with open(vocab_path + "vocab.txt", "w") as vocab_file:
        vocab_file.writelines(lines)
def check_words_in_vocab(tokenizer):
    """Find which ingredient names are missing from the tokenizer vocabulary.

    Loads mult_ingredients_nice.json, prints each ingredient already present
    in the tokenizer vocab, dumps the full ingredient list to
    used_ingredients.json, prints how many words are new, and returns them.

    Args:
        tokenizer: parsed tokenizer JSON dict; vocab lives under
            ["model"]["vocab"].

    Returns:
        list[str]: ingredient names not present in the tokenizer vocab.
    """
    ingredient_path = "mult_ingredients_nice.json"
    with open(data_path + ingredient_path, "r") as ingr_json_file:
        ingredients = json.load(ingr_json_file)
    # Hoist the vocab dict out of the loop; `in dict` is an O(1) lookup.
    vocab = tokenizer["model"]["vocab"]
    new_words = []
    for ingr in ingredients:
        if ingr in vocab:
            # Already known to the tokenizer — log it for inspection.
            print(ingr)
        else:
            new_words.append(ingr)
    with open(vocab_path + "used_ingredients.json", "w") as used_ingredients_file:
        json.dump(list(ingredients.keys()), used_ingredients_file, ensure_ascii=False, indent=4)
    print(str(len(new_words)) + " words to be added to vocab")
    return new_words
def add_words_to_vocab(new_words):
    """Append *new_words* to vocab.txt, one token per line.

    A leading newline separates the new tokens from whatever is already
    in the file.
    """
    payload = "\n" + "\n".join(new_words)
    with open(vocab_path + "vocab.txt", "a") as vocab_file:
        vocab_file.write(payload)
def create_base_vocab(tokenizer):
    """Append all tokenizer vocab tokens to vocab.txt, newline-separated.

    Args:
        tokenizer: parsed tokenizer JSON dict; vocab lives under
            ["model"]["vocab"].
    """
    tokens = tokenizer["model"]["vocab"].keys()
    with open(vocab_path + "vocab.txt", "a") as vocab_file:
        vocab_file.write("\n".join(tokens))
def check_existing(tokenizer):
    """Report how many ingredient names already exist in the tokenizer vocab.

    Diagnostic twin of check_words_in_vocab: prints each ingredient that is
    already in the vocab, then prints the overlap count.

    Args:
        tokenizer: parsed tokenizer JSON dict; vocab lives under
            ["model"]["vocab"].

    Returns:
        tuple[list[str], int]: (missing ingredient names, count of names
        already present). Previously returned None; callers that ignore the
        return value are unaffected.
    """
    ingredient_path = "mult_ingredients_nice.json"
    with open(data_path + ingredient_path, "r") as ingr_json_file:
        ingredients = json.load(ingr_json_file)
    # Hoist the vocab dict out of the loop; `in dict` is an O(1) lookup.
    vocab = tokenizer["model"]["vocab"]
    new_words = []
    old_word_count = 0
    for ingr in ingredients:
        if ingr in vocab:
            print(ingr)
            old_word_count += 1
        else:
            new_words.append(ingr)
    print(old_word_count)
    return new_words, old_word_count
def main():
    """Build an extended vocab file: base tokenizer vocab plus new ingredients.

    Loads the bert-base-german-cased tokenizer JSON, finds ingredient names
    missing from its vocab, writes the base vocab to vocab.txt, and appends
    the new words.
    """
    tokenizer_path = "train_model/vocab/bert-base-german-cased_tokenizer.json"
    with open(tokenizer_path, "r") as whole_json_file:
        tokenizer = json.load(whole_json_file)
    # check_existing(tokenizer)
    # make_vocab_from_tokenizer(tokenizer["model"]["vocab"])
    new_words = check_words_in_vocab(tokenizer)
    create_base_vocab(tokenizer)
    add_words_to_vocab(new_words)


if __name__ == "__main__":
    # Guard so importing this module does not trigger the vocab rebuild.
    main()