initial commit of project
This commit is contained in:
82
train_model/make_new_vocab.py
Normal file
82
train_model/make_new_vocab.py
Normal file
@@ -0,0 +1,82 @@
|
||||
import json

# Directory containing the ingredient data files read by this script.
data_path = "data/"
# Directory where vocab/tokenizer artifacts are written.
vocab_path = "train_model/vocab/"
def make_vocab_from_tokenizer(base_vocab):
    """Write every token of *base_vocab* to vocab.txt, one token per line.

    Overwrites any existing vocab.txt under ``vocab_path``.
    """
    with open(vocab_path + "vocab.txt", "w") as vocab_file:
        # writelines with a generator emits exactly word + "\n" per token,
        # byte-identical to writing each line individually.
        vocab_file.writelines(word + "\n" for word in base_vocab)
|
||||
|
||||
|
||||
def check_words_in_vocab(tokenizer):
    """Return the ingredient names missing from the tokenizer's vocab.

    Loads the ingredient dict from ``data/mult_ingredients_nice.json``,
    prints each ingredient that is already present in the tokenizer's
    vocab, records the full ingredient list in
    ``vocab/used_ingredients.json``, and prints how many new words were
    found.

    Args:
        tokenizer: Parsed tokenizer JSON; the vocab is expected under
            ``tokenizer["model"]["vocab"]``.

    Returns:
        list[str]: Ingredients not yet present in the tokenizer vocab.
    """
    ingredient_path = "mult_ingredients_nice.json"
    with open(data_path + ingredient_path, "r") as ingr_json_file:
        ingredients = json.load(ingr_json_file)

    # Hoist the nested lookup out of the loop; dict membership is O(1),
    # and `in vocab` needs no redundant .keys() call.
    vocab = tokenizer["model"]["vocab"]
    new_words = []
    for ingr in ingredients:
        if ingr in vocab:
            # Ingredient already known to the tokenizer.
            print(ingr)
        else:
            new_words.append(ingr)

    # Record exactly which ingredients were considered for this vocab build.
    with open(vocab_path + "used_ingredients.json", "w") as used_ingredients_file:
        json.dump(list(ingredients.keys()), used_ingredients_file, ensure_ascii=False, indent=4)

    # len(new_words) replaces the hand-maintained counter the original kept.
    print(str(len(new_words)) + " words to be added to vocab")
    return new_words
||||
|
||||
|
||||
def add_words_to_vocab(new_words):
    """Append *new_words* to vocab.txt, one per line, after a separating newline."""
    with open(vocab_path + "vocab.txt", "a") as vocab_file:
        # One write of the concatenated payload; the file receives the same
        # bytes as a leading "\n" write followed by the joined words.
        vocab_file.write("\n" + "\n".join(new_words))
||||
|
||||
|
||||
def create_base_vocab(tokenizer):
    """Append every token from the tokenizer's vocab to vocab.txt.

    NOTE(review): opens in append mode, so rerunning without first
    clearing vocab.txt duplicates entries — confirm this is intended.
    """
    tokens = tokenizer["model"]["vocab"].keys()
    with open(vocab_path + "vocab.txt", "a") as vocab_file:
        vocab_file.write("\n".join(tokens))
||||
|
||||
|
||||
def check_existing(tokenizer):
    """Diagnostic: print each ingredient already in the tokenizer vocab,
    then print how many there were.

    Mirrors check_words_in_vocab but only reports overlap; nothing is
    written to disk and nothing is returned.
    """
    ingredient_path = "mult_ingredients_nice.json"
    with open(data_path + ingredient_path, "r") as ingr_json_file:
        ingredients = json.load(ingr_json_file)

    vocab = tokenizer["model"]["vocab"]
    new_words = []
    old_word_count = 0
    for ingr in ingredients:
        if ingr in vocab:
            print(ingr)
            old_word_count += 1
        else:
            new_words.append(ingr)
    print(old_word_count)
||||
|
||||
|
||||
def main():
    """Build vocab.txt from the base tokenizer vocab plus new ingredient tokens.

    Steps:
      1. Load the pretrained tokenizer JSON.
      2. Find ingredients missing from its vocab (also writes
         used_ingredients.json as a side effect).
      3. Append the base vocab, then the new words, to vocab.txt.
    """
    tokenizer_path = "train_model/vocab/bert-base-german-cased_tokenizer.json"
    with open(tokenizer_path, "r") as whole_json_file:
        tokenizer = json.load(whole_json_file)

    new_words = check_words_in_vocab(tokenizer)
    create_base_vocab(tokenizer)
    add_words_to_vocab(new_words)
||||
# Guard the entry point so importing this module has no side effects.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user