"""Build and extend a BERT vocab file (vocab.txt) with ingredient tokens.

Reads a HuggingFace tokenizer JSON, compares its vocab against an
ingredient list, and writes out the base vocab plus any new words.
"""
import json

# Directory containing the ingredient JSON input files.
data_path = "data/"

# Directory where vocab.txt and other vocab artifacts are written.
vocab_path = "train_model/vocab/"
|
|
|
|
def make_vocab_from_tokenizer(base_vocab, out_path=None):
    """Write one vocab entry per line to a vocab.txt file.

    Args:
        base_vocab: Iterable of vocab tokens (e.g. the keys of
            tokenizer["model"]["vocab"]).
        out_path: Optional output file path; defaults to
            vocab_path + "vocab.txt".
    """
    if out_path is None:
        out_path = vocab_path + "vocab.txt"
    # Explicit UTF-8: tokens may contain non-ASCII (German) characters,
    # and the platform default encoding is not reliable.
    with open(out_path, "w", encoding="utf-8") as vocab_file:
        for word in base_vocab:
            vocab_file.write(word + "\n")
|
|
|
|
|
|
|
|
def check_words_in_vocab(tokenizer, ingredients_path=None, used_path=None):
    """Return the ingredient names missing from the tokenizer's vocab.

    Loads the ingredient JSON (its keys are ingredient names), collects
    every name not already present in tokenizer["model"]["vocab"], and
    dumps the full ingredient-name list to a used-ingredients JSON file.

    Args:
        tokenizer: Parsed tokenizer JSON (dict) with a
            ["model"]["vocab"] mapping.
        ingredients_path: Optional override for the ingredient JSON file;
            defaults to data_path + "mult_ingredients_nice.json".
        used_path: Optional override for the used-ingredients output file;
            defaults to vocab_path + "used_ingredients.json".

    Returns:
        List of ingredient names to be added to the vocab.
    """
    if ingredients_path is None:
        ingredients_path = data_path + "mult_ingredients_nice.json"
    if used_path is None:
        used_path = vocab_path + "used_ingredients.json"

    with open(ingredients_path, "r", encoding="utf-8") as ingr_json_file:
        ingredients = json.load(ingr_json_file)

    vocab = tokenizer["model"]["vocab"]
    new_words = []
    for ingr in ingredients:
        if ingr not in vocab:
            new_words.append(ingr)
        else:
            # Already covered by the base vocab; log it for inspection.
            print(ingr)

    with open(used_path, "w", encoding="utf-8") as used_ingredients_file:
        json.dump(list(ingredients.keys()), used_ingredients_file, ensure_ascii=False, indent=4)

    print(str(len(new_words)) + " words to be added to vocab")
    return new_words
|
|
|
|
|
|
def add_words_to_vocab(new_words, out_path=None):
    """Append new words, one per line, to the vocab file.

    Args:
        new_words: List of tokens to append. If empty, the file is left
            untouched (the original wrote a stray blank line).
        out_path: Optional vocab file path; defaults to
            vocab_path + "vocab.txt".
    """
    if out_path is None:
        out_path = vocab_path + "vocab.txt"
    if not new_words:
        return
    with open(out_path, "a", encoding="utf-8") as vocab_file:
        # Leading newline separates the appended words from the existing
        # last line (create_base_vocab does not end with a newline).
        vocab_file.write("\n")
        vocab_file.write("\n".join(new_words))
|
|
|
|
|
|
def create_base_vocab(tokenizer, out_path=None):
    """Write the tokenizer's base vocab to vocab.txt, one token per line.

    Opens the file in "w" mode so repeated runs do not duplicate entries
    (the original used "a", which appended to any leftover file).

    Args:
        tokenizer: Parsed tokenizer JSON (dict) with a
            ["model"]["vocab"] mapping.
        out_path: Optional output file path; defaults to
            vocab_path + "vocab.txt".
    """
    if out_path is None:
        out_path = vocab_path + "vocab.txt"
    with open(out_path, "w", encoding="utf-8") as vocab_file:
        # No trailing newline; add_words_to_vocab writes the separator.
        vocab_file.write("\n".join(tokenizer["model"]["vocab"].keys()))
|
|
|
|
|
|
def check_existing(tokenizer, ingredients_path=None):
    """Report which ingredient names are already in the tokenizer vocab.

    Prints each ingredient found in tokenizer["model"]["vocab"] and, after
    the loop, the total count of such existing words. (In the original the
    summary print's placement was ambiguous; it is emitted once here, as a
    post-loop summary, mirroring check_words_in_vocab.)

    Args:
        tokenizer: Parsed tokenizer JSON (dict) with a
            ["model"]["vocab"] mapping.
        ingredients_path: Optional override for the ingredient JSON file;
            defaults to data_path + "mult_ingredients_nice.json".

    Returns:
        List of ingredient names NOT present in the vocab (added for
        consistency with check_words_in_vocab; no existing caller uses it).
    """
    if ingredients_path is None:
        ingredients_path = data_path + "mult_ingredients_nice.json"
    with open(ingredients_path, "r", encoding="utf-8") as ingr_json_file:
        ingredients = json.load(ingr_json_file)

    vocab = tokenizer["model"]["vocab"]
    new_words = []
    old_word_count = 0
    for ingr in ingredients:
        if ingr not in vocab:
            new_words.append(ingr)
        else:
            print(ingr)
            old_word_count += 1
    # Summary: how many ingredient names the vocab already covers.
    print(old_word_count)
    return new_words
|
|
|
|
|
|
def main():
    """Load the tokenizer JSON and rebuild vocab.txt plus the new-word list.

    Pipeline: read the pretrained tokenizer's JSON, find ingredient names
    missing from its vocab, write the base vocab to vocab.txt, then append
    the missing ingredient words.
    """
    tokenizer_path = "train_model/vocab/bert-base-german-cased_tokenizer.json"
    with open(tokenizer_path, "r", encoding="utf-8") as whole_json_file:
        tokenizer = json.load(whole_json_file)

    # Ingredient names the tokenizer does not know yet (also writes
    # used_ingredients.json as a side effect).
    new_words = check_words_in_vocab(tokenizer)

    # Rebuild vocab.txt from the tokenizer's base vocab, then append the
    # new ingredient words.
    create_base_vocab(tokenizer)
    add_words_to_vocab(new_words)
|
|
|
|
|
|
# Guarded entry point: allows importing this module without side effects.
if __name__ == "__main__":
    main()