Files
MasterarbeitCode/train_model/make_new_vocab.py
2021-04-11 23:28:41 +02:00

82 lines
2.3 KiB
Python

import json
data_path = "data/"
vocab_path = "train_model/vocab/"
def make_vocab_from_tokenizer(base_vocab):
    """Write every token in *base_vocab* to vocab.txt, one token per line.

    Args:
        base_vocab: iterable of token strings (e.g. tokenizer vocab keys).
    """
    lines = [word + "\n" for word in base_vocab]
    with open(vocab_path + "vocab.txt", "w") as vocab_file:
        vocab_file.writelines(lines)
def check_words_in_vocab(tokenizer):
    """Find which ingredient names are missing from the tokenizer vocabulary.

    Loads mult_ingredients_nice.json, prints each ingredient already present
    in the tokenizer vocab, dumps the full ingredient list to
    used_ingredients.json, prints how many words are new, and returns them.

    Args:
        tokenizer: parsed tokenizer JSON dict; vocab lives under
            ["model"]["vocab"].

    Returns:
        list[str]: ingredient names not present in the tokenizer vocab.
    """
    ingredient_path = "mult_ingredients_nice.json"
    with open(data_path + ingredient_path, "r") as ingr_json_file:
        ingredients = json.load(ingr_json_file)
    # Hoist the vocab dict out of the loop; `in dict` is an O(1) lookup.
    vocab = tokenizer["model"]["vocab"]
    new_words = []
    for ingr in ingredients:
        if ingr in vocab:
            # Already known to the tokenizer — log it for inspection.
            print(ingr)
        else:
            new_words.append(ingr)
    with open(vocab_path + "used_ingredients.json", "w") as used_ingredients_file:
        json.dump(list(ingredients.keys()), used_ingredients_file, ensure_ascii=False, indent=4)
    print(str(len(new_words)) + " words to be added to vocab")
    return new_words
def add_words_to_vocab(new_words):
    """Append *new_words* to vocab.txt, one token per line.

    A leading newline separates the new tokens from whatever is already
    in the file.
    """
    payload = "\n" + "\n".join(new_words)
    with open(vocab_path + "vocab.txt", "a") as vocab_file:
        vocab_file.write(payload)
def create_base_vocab(tokenizer):
    """Append all tokenizer vocab tokens to vocab.txt, newline-separated.

    Args:
        tokenizer: parsed tokenizer JSON dict; vocab lives under
            ["model"]["vocab"].
    """
    tokens = tokenizer["model"]["vocab"].keys()
    with open(vocab_path + "vocab.txt", "a") as vocab_file:
        vocab_file.write("\n".join(tokens))
def check_existing(tokenizer):
    """Report how many ingredient names already exist in the tokenizer vocab.

    Diagnostic twin of check_words_in_vocab: prints each ingredient that is
    already in the vocab, then prints the overlap count.

    Args:
        tokenizer: parsed tokenizer JSON dict; vocab lives under
            ["model"]["vocab"].

    Returns:
        tuple[list[str], int]: (missing ingredient names, count of names
        already present). Previously returned None; callers that ignore the
        return value are unaffected.
    """
    ingredient_path = "mult_ingredients_nice.json"
    with open(data_path + ingredient_path, "r") as ingr_json_file:
        ingredients = json.load(ingr_json_file)
    # Hoist the vocab dict out of the loop; `in dict` is an O(1) lookup.
    vocab = tokenizer["model"]["vocab"]
    new_words = []
    old_word_count = 0
    for ingr in ingredients:
        if ingr in vocab:
            print(ingr)
            old_word_count += 1
        else:
            new_words.append(ingr)
    print(old_word_count)
    return new_words, old_word_count
def main():
    """Build an extended vocab file: base tokenizer vocab plus new ingredients.

    Loads the bert-base-german-cased tokenizer JSON, finds ingredient names
    missing from its vocab, writes the base vocab to vocab.txt, and appends
    the new words.
    """
    tokenizer_path = "train_model/vocab/bert-base-german-cased_tokenizer.json"
    with open(tokenizer_path, "r") as whole_json_file:
        tokenizer = json.load(whole_json_file)
    # check_existing(tokenizer)
    # make_vocab_from_tokenizer(tokenizer["model"]["vocab"])
    new_words = check_words_in_vocab(tokenizer)
    create_base_vocab(tokenizer)
    add_words_to_vocab(new_words)


if __name__ == "__main__":
    # Guard so importing this module does not trigger the vocab rebuild.
    main()