initial commit of project
This commit is contained in:
82
train_model/make_new_vocab.py
Normal file
82
train_model/make_new_vocab.py
Normal file
@@ -0,0 +1,82 @@
|
||||
import json

# Directory containing the ingredient data files read by this script.
data_path = "data/"
# Directory where vocab/tokenizer artifacts are written.
vocab_path = "train_model/vocab/"
def make_vocab_from_tokenizer(base_vocab):
    """Write every token of *base_vocab* to vocab.txt, one token per line.

    Overwrites any existing vocab.txt under ``vocab_path``.
    """
    with open(vocab_path + "vocab.txt", "w") as vocab_file:
        # writelines with a generator emits exactly word + "\n" per token,
        # byte-identical to writing each line individually.
        vocab_file.writelines(word + "\n" for word in base_vocab)
|
||||
|
||||
|
||||
def check_words_in_vocab(tokenizer):
    """Return the ingredient names missing from the tokenizer's vocab.

    Loads the ingredient dict from ``data/mult_ingredients_nice.json``,
    prints each ingredient that is already present in the tokenizer's
    vocab, records the full ingredient list in
    ``vocab/used_ingredients.json``, and prints how many new words were
    found.

    Args:
        tokenizer: Parsed tokenizer JSON; the vocab is expected under
            ``tokenizer["model"]["vocab"]``.

    Returns:
        list[str]: Ingredients not yet present in the tokenizer vocab.
    """
    ingredient_path = "mult_ingredients_nice.json"
    with open(data_path + ingredient_path, "r") as ingr_json_file:
        ingredients = json.load(ingr_json_file)

    # Hoist the nested lookup out of the loop; dict membership is O(1),
    # and `in vocab` needs no redundant .keys() call.
    vocab = tokenizer["model"]["vocab"]
    new_words = []
    for ingr in ingredients:
        if ingr in vocab:
            # Ingredient already known to the tokenizer.
            print(ingr)
        else:
            new_words.append(ingr)

    # Record exactly which ingredients were considered for this vocab build.
    with open(vocab_path + "used_ingredients.json", "w") as used_ingredients_file:
        json.dump(list(ingredients.keys()), used_ingredients_file, ensure_ascii=False, indent=4)

    # len(new_words) replaces the hand-maintained counter the original kept.
    print(str(len(new_words)) + " words to be added to vocab")
    return new_words
||||
|
||||
|
||||
def add_words_to_vocab(new_words):
    """Append *new_words* to vocab.txt, one per line, after a separating newline."""
    with open(vocab_path + "vocab.txt", "a") as vocab_file:
        # One write of the concatenated payload; the file receives the same
        # bytes as a leading "\n" write followed by the joined words.
        vocab_file.write("\n" + "\n".join(new_words))
||||
|
||||
|
||||
def create_base_vocab(tokenizer):
    """Append every token from the tokenizer's vocab to vocab.txt.

    NOTE(review): opens in append mode, so rerunning without first
    clearing vocab.txt duplicates entries — confirm this is intended.
    """
    tokens = tokenizer["model"]["vocab"].keys()
    with open(vocab_path + "vocab.txt", "a") as vocab_file:
        vocab_file.write("\n".join(tokens))
||||
|
||||
|
||||
def check_existing(tokenizer):
    """Diagnostic: print each ingredient already in the tokenizer vocab,
    then print how many there were.

    Mirrors check_words_in_vocab but only reports overlap; nothing is
    written to disk and nothing is returned.
    """
    ingredient_path = "mult_ingredients_nice.json"
    with open(data_path + ingredient_path, "r") as ingr_json_file:
        ingredients = json.load(ingr_json_file)

    vocab = tokenizer["model"]["vocab"]
    new_words = []
    old_word_count = 0
    for ingr in ingredients:
        if ingr in vocab:
            print(ingr)
            old_word_count += 1
        else:
            new_words.append(ingr)
    print(old_word_count)
||||
|
||||
|
||||
def main():
    """Build vocab.txt from the base tokenizer vocab plus new ingredient tokens.

    Steps:
      1. Load the pretrained tokenizer JSON.
      2. Find ingredients missing from its vocab (also writes
         used_ingredients.json as a side effect).
      3. Append the base vocab, then the new words, to vocab.txt.
    """
    tokenizer_path = "train_model/vocab/bert-base-german-cased_tokenizer.json"
    with open(tokenizer_path, "r") as whole_json_file:
        tokenizer = json.load(whole_json_file)

    new_words = check_words_in_vocab(tokenizer)
    create_base_vocab(tokenizer)
    add_words_to_vocab(new_words)
||||
# Guard the entry point so importing this module has no side effects.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user