initial commit of project
This commit is contained in:
45
evaluation/helpers/approx_knn_classifier.py
Normal file
45
evaluation/helpers/approx_knn_classifier.py
Normal file
@@ -0,0 +1,45 @@
|
||||
# adapted from:
|
||||
# Pellegrini., C., E. Özsoy., M. Wintergerst., and G. Groh. (2021).
|
||||
# “Exploiting Food Embeddings for Ingredient Substitution.”
|
||||
# In: Proceedings of the 14th International Joint Conference on Biomedical
|
||||
# Engineering Systems and Technologies - Volume 5: HEALTHINF, INSTICC.
|
||||
# SciTePress, pp. 67–77. isbn: 978-989-758-490-9. doi: 10.5220/0010202000670077.
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
from annoy import AnnoyIndex
|
||||
from tqdm import tqdm
|
||||
|
||||
|
||||
# Full guide https://github.com/spotify/annoy
class ApproxKNNClassifier:
    """Approximate k-nearest-neighbour search over ingredient embeddings.

    Backed by an Annoy index with the angular metric. The index is persisted
    to ``save_path`` and memory-mapped on subsequent constructions.
    """

    def __init__(self, all_ingredient_embeddings, max_embedding_count,
                 save_path=Path('data/eval/approx_knn_classifier.ann'), n_trees=10):
        embedding_dim = all_ingredient_embeddings.shape[-1]
        self.max_embedding_count = max_embedding_count
        self.approx_knn_classifier = AnnoyIndex(embedding_dim, 'angular')

        if save_path.exists():
            print('Loading Existing Approx Classifier')
            # Loading is cheap: Annoy just mmaps the on-disk index file.
            self.approx_knn_classifier.load(str(save_path))
        else:
            total = len(all_ingredient_embeddings)
            for item_idx in tqdm(range(total), total=total, desc='Creating Approx Classifier'):
                self.approx_knn_classifier.add_item(item_idx, all_ingredient_embeddings[item_idx])

            self.approx_knn_classifier.build(n_trees)
            print('Saving Approx Classifier')
            self.approx_knn_classifier.save(str(save_path))

    def k_nearest_neighbors(self, ingredient_embeddings):
        """Return (distances, indices) arrays, one row per query embedding.

        Each query asks for max_embedding_count + 200 neighbours so that a
        query's own embeddings can be filtered out downstream without running
        short of candidates. search_k is left at Annoy's default (accuracy /
        speed trade-off).
        """
        all_indices, all_distances = [], []
        for query_vector in ingredient_embeddings:
            indices, distances = self.approx_knn_classifier.get_nns_by_vector(
                query_vector, self.max_embedding_count + 200, include_distances=True)
            all_indices.append(indices)
            all_distances.append(distances)

        return np.stack(all_distances), np.stack(all_indices)
|
||||
152
evaluation/helpers/generate_ingredient_embeddings.py
Normal file
152
evaluation/helpers/generate_ingredient_embeddings.py
Normal file
@@ -0,0 +1,152 @@
|
||||
# adapted from:
|
||||
# Pellegrini., C., E. Özsoy., M. Wintergerst., and G. Groh. (2021).
|
||||
# “Exploiting Food Embeddings for Ingredient Substitution.”
|
||||
# In: Proceedings of the 14th International Joint Conference on Biomedical
|
||||
# Engineering Systems and Technologies - Volume 5: HEALTHINF, INSTICC.
|
||||
# SciTePress, pp. 67–77. isbn: 978-989-758-490-9. doi: 10.5220/0010202000670077.
|
||||
|
||||
import json
|
||||
import pickle
|
||||
import random
|
||||
import re
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from tqdm import tqdm
|
||||
|
||||
from evaluation.helpers.prediction_model import PredictionModel
|
||||
|
||||
|
||||
def _generate_food_sentence_dict(model_path):
    """Collect, per known ingredient, every instruction sentence mentioning it.

    Reads the ingredient vocabulary from data/mult_ingredients_nice.json and
    the train/test sentence corpora from <model_path>training_data.txt and
    <model_path>testing_data.txt. Sentences longer than 100 whitespace tokens
    are dropped. Returns a defaultdict mapping ingredient -> list of sentences.
    """
    with open('data/mult_ingredients_nice.json', "r") as f:
        food_items = json.load(f)
    food_items_set = set(food_items.keys())

    with open(model_path + 'training_data.txt', "r") as f:
        train_instruction_sentences = f.read().splitlines()
    # remove overlong sentences
    train_instruction_sentences = [s for s in train_instruction_sentences if len(s.split()) <= 100]

    with open(model_path + 'testing_data.txt', "r") as f:
        test_instruction_sentences = f.read().splitlines()
    # remove overlong sentences
    test_instruction_sentences = [s for s in test_instruction_sentences if len(s.split()) <= 100]

    instruction_sentences = train_instruction_sentences + test_instruction_sentences

    food_to_sentences_dict = defaultdict(list)
    for sentence in instruction_sentences:
        # Fixed: pattern is now a raw string — "[^\w]-'" previously contained
        # the invalid escape sequence \w (SyntaxWarning on Python 3.12+).
        # NOTE(review): the pattern itself matches the literal 3-char sequence
        # <non-word>-' and therefore almost never fires; the intent was
        # probably r"[^\w\-']" (strip punctuation except - and '). Confirm
        # before changing, as that would alter tokenization downstream.
        words = re.sub(r"[^\w]-'", " ", sentence).split()
        for word in words:
            if word in food_items_set:
                food_to_sentences_dict[word].append(sentence)

    return food_to_sentences_dict
|
||||
|
||||
|
||||
def _random_sample_with_min_count(population, k):
|
||||
if len(population) <= k:
|
||||
return population
|
||||
else:
|
||||
return random.sample(population, k)
|
||||
|
||||
|
||||
def sample_random_sentence_dict(model_path, max_sentence_count):
    """Build the ingredient -> sentences map, capping each ingredient at
    max_sentence_count randomly selected sentences."""
    food_to_sentences_dict = _generate_food_sentence_dict(model_path=model_path)

    capped = {}
    for food, sentences in food_to_sentences_dict.items():
        capped[food] = _random_sample_with_min_count(sentences, max_sentence_count)
    return capped
|
||||
|
||||
|
||||
def _map_ingredients_to_input_ids(model_path):
    """Map every known ingredient token to its tokenizer input id."""
    with open('data/mult_ingredients_nice.json', "r") as f:
        ingredients = json.load(f).keys()

    tokenizer = PredictionModel(model_path).tokenizer
    ingredient_ids = tokenizer.convert_tokens_to_ids(ingredients)

    return dict(zip(ingredients, ingredient_ids))
|
||||
|
||||
|
||||
def _merge_synonmys(food_to_embeddings_dict, max_sentence_count):
|
||||
synonmy_replacements_path = Path('foodbert_embeddings/data/synonmy_replacements.json')
|
||||
synonmy_replacements = {}
|
||||
|
||||
merged_dict = defaultdict(list)
|
||||
# Merge ingredients
|
||||
for key, value in food_to_embeddings_dict.items():
|
||||
if key in synonmy_replacements:
|
||||
key_to_use = synonmy_replacements[key]
|
||||
else:
|
||||
key_to_use = key
|
||||
|
||||
merged_dict[key_to_use].append(value)
|
||||
|
||||
merged_dict = {k: np.concatenate(v) for k, v in merged_dict.items()}
|
||||
# When embedding count exceeds maximum allowed, reduce back to requested count
|
||||
for key, value in merged_dict.items():
|
||||
if len(value) > max_sentence_count:
|
||||
index = np.random.choice(value.shape[0], max_sentence_count, replace=False)
|
||||
new_value = value[index]
|
||||
merged_dict[key] = new_value
|
||||
|
||||
return merged_dict
|
||||
|
||||
|
||||
def generate_food_embedding_dict(max_sentence_count, model_path, eval_path='data/eval/', dataset_path="output"):
    '''
    Creates a dict where the keys are the ingredients and the values are arrays of embeddings,
    at most max_sentence_count per ingredient (fewer if the ingredient occurs less often).
    These embeddings are used in generate_substitutes.py to predict substitutes.

    The result is cached as a pickle under eval_path; when the pickle exists
    it is returned as-is and no embeddings are recomputed.
    '''
    food_to_embeddings_dict_path = Path(eval_path + 'food_embeddings_dict.pkl')
    if food_to_embeddings_dict_path.exists():
        with food_to_embeddings_dict_path.open('rb') as f:
            food_to_embeddings_dict = pickle.load(f)

        # # delete keys if we deleted ingredients
        # old_ingredients = set(food_to_embeddings_dict.keys())
        # with open('train_model/vocab/used_ingredients.json', "r") as f:
        #     new_ingredients = set(json.load(f))
        #
        # keys_to_delete = old_ingredients.difference(new_ingredients)
        # for key in keys_to_delete:
        #     food_to_embeddings_dict.pop(key, None)  # delete key if it exists
        #
        # # merge new synonyms
        # food_to_embeddings_dict = _merge_synonmys(food_to_embeddings_dict, max_sentence_count)
        #
        # with food_to_embeddings_dict_path.open('wb') as f:
        #     pickle.dump(food_to_embeddings_dict, f)  # Overwrite dict with cleaned version

        return food_to_embeddings_dict

    print('Sampling Random Sentences')
    # NOTE(review): sentences are read from dataset_path while tokenizer/model
    # come from model_path — confirm the two are intentionally different roots.
    food_to_sentences_dict_random_samples = sample_random_sentence_dict(model_path=dataset_path, max_sentence_count=max_sentence_count)
    food_to_embeddings_dict = defaultdict(list)
    print('Mapping Ingredients to Input Ids')
    all_ingredient_ids = _map_ingredients_to_input_ids(model_path=model_path)

    prediction_model = PredictionModel(model_path=model_path)

    for food, sentences in tqdm(food_to_sentences_dict_random_samples.items(), total=len(food_to_sentences_dict_random_samples),
                                desc='Calculating Embeddings for Food items'):
        embeddings, ingredient_ids = prediction_model.predict_embeddings(sentences)
        # get embedding of food word: flatten the per-token hidden states to
        # (-1, 768) and keep the positions whose token id equals this food's
        # id (assumes hidden size 768 — the BERT base config used here).
        embeddings_flat = embeddings.view((-1, 768))
        ingredient_ids_flat = torch.stack(ingredient_ids).flatten()
        food_id = all_ingredient_ids[food]
        food_embeddings = embeddings_flat[ingredient_ids_flat == food_id].cpu().numpy()
        food_to_embeddings_dict[food].extend(food_embeddings)

    food_to_embeddings_dict = {k: np.stack(v) for k, v in food_to_embeddings_dict.items()}
    # Collapse synonymous ingredients onto one key and re-cap the counts.
    food_to_embeddings_dict = _merge_synonmys(food_to_embeddings_dict, max_sentence_count)

    # Cache for subsequent calls.
    with food_to_embeddings_dict_path.open('wb') as f:
        pickle.dump(food_to_embeddings_dict, f)

    return food_to_embeddings_dict
|
||||
38
evaluation/helpers/instructions_dataset.py
Normal file
38
evaluation/helpers/instructions_dataset.py
Normal file
@@ -0,0 +1,38 @@
|
||||
# adapted from:
|
||||
# Pellegrini., C., E. Özsoy., M. Wintergerst., and G. Groh. (2021).
|
||||
# “Exploiting Food Embeddings for Ingredient Substitution.”
|
||||
# In: Proceedings of the 14th International Joint Conference on Biomedical
|
||||
# Engineering Systems and Technologies - Volume 5: HEALTHINF, INSTICC.
|
||||
# SciTePress, pp. 67–77. isbn: 978-989-758-490-9. doi: 10.5220/0010202000670077.
|
||||
|
||||
import torch
|
||||
from torch.nn.utils.rnn import pad_sequence
|
||||
from torch.utils.data import Dataset
|
||||
|
||||
|
||||
class InstructionsDataset(Dataset):
    """Torch dataset of tokenized instruction sentences as one padded id tensor.

    Sentences are encoded once at construction (special tokens added,
    truncated to 512 tokens) and padded to a common length when needed.
    """

    def __init__(self, tokenizer, sentences):
        self.tokenizer = tokenizer

        encoded = tokenizer.batch_encode_plus(sentences, add_special_tokens=True, max_length=512, truncation=True)
        id_tensors = [torch.tensor(ids) for ids in encoded["input_ids"]]
        self.examples = self._tensorize_batch(id_tensors)

    def _tensorize_batch(self, examples) -> torch.Tensor:
        """Stack equal-length id tensors, or right-pad them to the longest one."""
        first_length = examples[0].size(0)
        if all(t.size(0) == first_length for t in examples):
            return torch.stack(examples, dim=0)

        if self.tokenizer._pad_token is None:
            raise ValueError(
                "You are attempting to pad samples but the tokenizer you are using"
                f" ({self.tokenizer.__class__.__name__}) does not have one."
            )
        return pad_sequence(examples, batch_first=True, padding_value=self.tokenizer.pad_token_id)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        return self.examples[i]
|
||||
36
evaluation/helpers/knn_classifier.py
Normal file
36
evaluation/helpers/knn_classifier.py
Normal file
@@ -0,0 +1,36 @@
|
||||
# adapted from:
|
||||
# Pellegrini., C., E. Özsoy., M. Wintergerst., and G. Groh. (2021).
|
||||
# “Exploiting Food Embeddings for Ingredient Substitution.”
|
||||
# In: Proceedings of the 14th International Joint Conference on Biomedical
|
||||
# Engineering Systems and Technologies - Volume 5: HEALTHINF, INSTICC.
|
||||
# SciTePress, pp. 67–77. isbn: 978-989-758-490-9. doi: 10.5220/0010202000670077.
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import joblib
|
||||
from sklearn.neighbors import NearestNeighbors
|
||||
|
||||
|
||||
class KNNClassifier:
    """Exact k-NN over ingredient embeddings via sklearn's NearestNeighbors.

    The fitted estimator is cached on disk with joblib and reloaded on
    subsequent constructions.
    """

    def __init__(self, all_ingredient_embeddings, max_embedding_count,
                 save_path=Path('data/eval/knn_classifier.joblib')):
        if save_path.exists():
            print('Loading Existing Classifier')
            self.knn_classifier: NearestNeighbors = joblib.load(save_path)
        else:
            print('Training New Classifier')
            # Ask for max_embedding_count extra neighbours so a query's own
            # embeddings can be filtered out later without running short.
            self.knn_classifier: NearestNeighbors = NearestNeighbors(
                n_neighbors=max_embedding_count + 200, n_jobs=12,
                algorithm='brute')  # kd_tree, ball_tree or brute
            self.knn_classifier.fit(all_ingredient_embeddings)

            print('Saving Classifier')
            joblib.dump(self.knn_classifier, save_path)

        print(f'\nKNN with: {self.knn_classifier._fit_method} and leaf size: {self.knn_classifier.leaf_size}\n')

    def k_nearest_neighbors(self, ingredient_embeddings):
        """Return (distances, indices) for each query embedding."""
        return self.knn_classifier.kneighbors(ingredient_embeddings, return_distance=True)
|
||||
53
evaluation/helpers/prediction_model.py
Normal file
53
evaluation/helpers/prediction_model.py
Normal file
@@ -0,0 +1,53 @@
|
||||
# adapted from:
|
||||
# Pellegrini., C., E. Özsoy., M. Wintergerst., and G. Groh. (2021).
|
||||
# “Exploiting Food Embeddings for Ingredient Substitution.”
|
||||
# In: Proceedings of the 14th International Joint Conference on Biomedical
|
||||
# Engineering Systems and Technologies - Volume 5: HEALTHINF, INSTICC.
|
||||
# SciTePress, pp. 67–77. isbn: 978-989-758-490-9. doi: 10.5220/0010202000670077.
|
||||
|
||||
import json
|
||||
|
||||
import torch
|
||||
from torch.utils.data import DataLoader
|
||||
from transformers import BertModel, BertTokenizer
|
||||
|
||||
from evaluation.helpers.instructions_dataset import InstructionsDataset
|
||||
|
||||
|
||||
class PredictionModel:
    """Wraps a fine-tuned BERT model and tokenizer to produce per-token
    embeddings for instruction sentences."""

    def __init__(self, model_path=''):
        # Load the fine-tuned BERT weights from model_path.
        self.model: BertModel = BertModel.from_pretrained(
            pretrained_model_name_or_path=model_path)
        # Ingredient tokens are passed as never_split so the tokenizer keeps
        # each ingredient as a single token that can be matched back by id.
        with open('train_model/vocab/used_ingredients.json', 'r') as f:
            used_ingredients = json.load(f)
        self.tokenizer = BertTokenizer(vocab_file='train_model/vocab/bert_vocab.txt', do_lower_case=False,
                                       max_len=512, never_split=used_ingredients, truncation=True)

        self.device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

        self.model.to(self.device)

    def predict_embeddings(self, sentences):
        """Run the model over *sentences* in batches of 100 (no gradients).

        Returns (stacked per-token hidden states, list of input-id tensors).
        The second value is named ingredient_ids but holds the input ids of
        every token in each batch; callers filter those ids against one
        ingredient's id to pick out its embeddings.
        """
        dataset = InstructionsDataset(tokenizer=self.tokenizer, sentences=sentences)
        dataloader = DataLoader(dataset, batch_size=100, pin_memory=True)

        embeddings = []
        ingredient_ids = []
        for batch in dataloader:
            batch = batch.to(self.device)
            with torch.no_grad():
                # embeddings_batch[0] is the model's first output
                # (per-token hidden states).
                embeddings_batch = self.model(batch)
                embeddings.extend(embeddings_batch[0])
                ingredient_ids.extend(batch)

        return torch.stack(embeddings), ingredient_ids

    def compute_embedding_for_ingredient(self, sentence, ingredient_name):
        """Return the embedding of the first occurrence of *ingredient_name*
        in *sentence*.

        Raises IndexError (via food_embedding[0]) when the ingredient token
        does not occur in the sentence.
        """
        embeddings, ingredient_ids = self.predict_embeddings([sentence])
        # assumes hidden size 768 — matches the BERT base config used here
        embeddings_flat = embeddings.view((-1, 768))
        ingredient_ids_flat = torch.stack(ingredient_ids).flatten()
        food_id = self.tokenizer.convert_tokens_to_ids(ingredient_name)
        food_embedding = embeddings_flat[ingredient_ids_flat == food_id].cpu().numpy()

        return food_embedding[0]
|
||||
166
evaluation/helpers/revise_substitutes.py
Normal file
166
evaluation/helpers/revise_substitutes.py
Normal file
@@ -0,0 +1,166 @@
|
||||
import json

# Paths to the German synonym map and the ground-truth substitute pairs.
synonyms_path = "data/synonyms.json"
ground_truth_path = "data/ground_truth.json"
# Broad "category" ingredients: synonyms are only collapsed onto these bases
# in the category-aware branches of combined_substitutes_dict below.
category_subs = ["Paprika", "Apfel", "Gouda", "Huhn", "Gans", "Kaninchen", "Kalbfleisch", "Schweinefleisch", "Ente", "Lamm",
                 "Pute", "Wildfleisch", "Rindfleisch", "Speck", "Fisch", "Kohl", "Blattsalat", "Schokolade", "Kuvertüre", "Kuchenglasur",
                 "Honig", "Sirup", "Joghurt", "Essig", "Traubensaft", "Truthahn", "Wein"]

# Substitute pairs produced by the model run under evaluation
# (loaded at import time).
with open("final_Versions/models/vers3/eval/complete_substitute_pairs_50.json") as f:
    sub_dict = json.load(f)
|
||||
|
||||
|
||||
def engl_combined_substitutes_dict(found_substitutes_dict):
    """Normalize the English substitute lists.

    For each ingredient: drop a substitute equal to the ingredient, drop
    substitutes that are synonyms of the ingredient, and replace a substitute
    that is a synonym of exactly one base ingredient with that base; a
    substitute with multiple bases is only reported via print, not added.
    Returns a new dict; the input is not modified.

    Fixed: revised_engl_ground_truth.json was loaded into a variable that was
    never used — the dead file read (and its spurious FileNotFoundError risk)
    has been removed.
    """
    with open("evaluation/engl_data/engl_synonyms.json", "r") as whole_json_file:
        synonyms_dict = json.load(whole_json_file)

    reversed_synonyms_dict = get_reversed_syn_dict(is_engl=True)

    new_found_sub_dict = {}

    for ingredient, subs in found_substitutes_dict.items():
        current_subs = set()
        for sub in subs:
            # delete substitute if it is the same as the ingredient
            if sub == ingredient:
                continue

            # delete substitute if it is a synonym of the ingredient
            if ingredient in synonyms_dict and sub in synonyms_dict[ingredient]:
                continue

            # if substitute is a synonym of sth, replace it with its base
            if sub in reversed_synonyms_dict:
                if len(reversed_synonyms_dict[sub]) == 1:
                    current_subs.add(reversed_synonyms_dict[sub][0])
                else:
                    # ambiguous: synonym of several bases — report, don't add
                    print(sub + " is in " + str(reversed_synonyms_dict[sub]))
            else:
                current_subs.add(sub)

        new_found_sub_dict[ingredient] = list(current_subs)
    return new_found_sub_dict
|
||||
|
||||
|
||||
# merges substitutes with their synonyms, replaces synonyms with base synonym
def combined_substitutes_dict(found_substitutes_dict):
    """Normalize each ingredient's substitute list (German data).

    - drops a substitute equal to the ingredient itself
    - drops substitutes that are synonyms of the ingredient
    - a substitute that is a synonym of exactly one base is replaced by that
      base, unless the base is a broad category (category_subs) while the
      ingredient is not — then the raw substitute is kept
    - a substitute with exactly two bases gets the base on the same side of
      the category split as the ingredient; ambiguous cases are only printed
    - three or more bases are only printed, never added
    Returns a new dict; the input is not modified.
    """
    with open(synonyms_path, "r") as whole_json_file:
        synonyms_dict = json.load(whole_json_file)
    # NOTE(review): ground_truth_dict is loaded but never used in this
    # function — confirm whether it can be removed.
    with open(ground_truth_path, "r") as whole_json_file:
        ground_truth_dict = json.load(whole_json_file)


    reversed_synonyms_dict = get_reversed_syn_dict()

    new_found_sub_dict = {}

    for ingredient in found_substitutes_dict.keys():
        new_found_sub_dict[ingredient] = []
        current_subs = set()
        for sub in found_substitutes_dict[ingredient]:
            # delete substitute if it is the same as the ingredient
            if sub == ingredient:
                continue

            # delete substitute if it is a synonym of the ingredient
            if ingredient in synonyms_dict.keys() and sub in synonyms_dict[ingredient]:
                continue

            # if substitute is a synonym of sth
            if sub in reversed_synonyms_dict.keys():
                if len(reversed_synonyms_dict[sub]) == 1:
                    # single base: take it unless it is a category the
                    # ingredient does not belong to
                    if reversed_synonyms_dict[sub][0] not in category_subs:
                        current_subs.add(reversed_synonyms_dict[sub][0])
                    else:
                        if ingredient in category_subs:
                            current_subs.add(reversed_synonyms_dict[sub][0])
                        else:
                            current_subs.add(sub)
                elif len(reversed_synonyms_dict[sub]) == 2:
                    # two bases: pick the one matching the ingredient's
                    # category membership; mixed/unclear cases just print
                    if ingredient in category_subs:
                        if reversed_synonyms_dict[sub][0] in category_subs and reversed_synonyms_dict[sub][1] not in category_subs:
                            current_subs.add(reversed_synonyms_dict[sub][0])
                        elif reversed_synonyms_dict[sub][1] in category_subs and reversed_synonyms_dict[sub][0] not in category_subs:
                            current_subs.add(reversed_synonyms_dict[sub][1])
                        else:
                            print(reversed_synonyms_dict[sub])
                    else:
                        if reversed_synonyms_dict[sub][0] in category_subs and reversed_synonyms_dict[sub][1] not in category_subs:
                            current_subs.add(reversed_synonyms_dict[sub][1])
                        elif reversed_synonyms_dict[sub][1] in category_subs and reversed_synonyms_dict[sub][0] not in category_subs:
                            current_subs.add(reversed_synonyms_dict[sub][0])
                        else:
                            print(reversed_synonyms_dict[sub])
                else:
                    # three or more bases: report only
                    print(sub + " is in " + str(reversed_synonyms_dict[sub]))

            else:
                current_subs.add(sub)

        new_found_sub_dict[ingredient] += list(current_subs)
    return new_found_sub_dict
|
||||
|
||||
|
||||
# combine substitutes found for an ingredient and its synonyms
# also combine synonyms in substitutes
def combine_all_synonyms(found_substitutes_dict):
    """Merge substitute sets of an ingredient and all its synonyms under the
    base ingredient, then normalize via combined_substitutes_dict.

    Fixed: the original pre-initialized only ingredients that are NOT
    synonyms, so merging onto a base ingredient that was not itself a key of
    found_substitutes_dict raised KeyError. setdefault creates the target set
    on demand instead; all previously-working inputs produce the same result.
    """
    reversed_synonyms_dict = get_reversed_syn_dict_no_cat()

    new_found_sub_dict = {}
    for ingredient in found_substitutes_dict.keys():
        if ingredient in reversed_synonyms_dict:
            # fold this synonym's substitutes into its base ingredient
            target = reversed_synonyms_dict[ingredient][0]
        else:
            target = ingredient
        new_found_sub_dict.setdefault(target, set())
        new_found_sub_dict[target] |= set(found_substitutes_dict[ingredient])

    new_found_sub_dict_list = {ingredient: list(subs) for ingredient, subs in new_found_sub_dict.items()}

    return combined_substitutes_dict(new_found_sub_dict_list)
|
||||
|
||||
|
||||
|
||||
def get_reversed_syn_dict(is_engl=False):
    """Invert the synonym map: each synonym maps to the list of base
    ingredients that declare it (English or German file per *is_engl*)."""
    source_path = "evaluation/engl_data/engl_synonyms.json" if is_engl else synonyms_path
    with open(source_path, "r") as whole_json_file:
        synonyms_dict = json.load(whole_json_file)

    reversed_synonyms_dict = {}
    for base, syns in synonyms_dict.items():
        for syn in syns:
            reversed_synonyms_dict.setdefault(syn, []).append(base)

    return reversed_synonyms_dict
|
||||
|
||||
def get_reversed_syn_dict_no_cat():
    """Like get_reversed_syn_dict, but ignores base ingredients that are
    broad categories (category_subs)."""
    with open(synonyms_path, "r") as whole_json_file:
        synonyms_dict = json.load(whole_json_file)

    reversed_synonyms_dict = {}
    for base, syns in synonyms_dict.items():
        if base in category_subs:
            continue
        for syn in syns:
            reversed_synonyms_dict.setdefault(syn, []).append(base)

    return reversed_synonyms_dict
|
||||
|
||||
|
||||
# NOTE(review): the return value is discarded — this call only surfaces the
# print() diagnostics inside combined_substitutes_dict; confirm whether the
# revised dict is supposed to be written out somewhere.
combined_substitutes_dict(sub_dict)
|
||||
Reference in New Issue
Block a user