# adapted by Franziska Paulus from:
# Copyright 2016 Leon Zucchini
#
# This file is part of the "recipes" project
# Repository: https://github.com/leonzucchini/recipes

"""
Parse information from recipe pages for storage to database
"""

import json
import re

from bs4 import BeautifulSoup as bs
from pyld import jsonld


def get_comments(reviews):
    """Return the ``reviewBody`` text of every review, in input order.

    Args:
        reviews: Iterable of mappings, each with a ``"reviewBody"`` key.

    Returns:
        A list with one review body per input review.

    Raises:
        KeyError: If a review has no ``"reviewBody"`` entry.
    """
    return [review["reviewBody"] for review in reviews]


def _log_failed_url(url):
    """Append *url* as one line to ``errors.txt`` in the working directory."""
    # Explicit encoding so the log does not depend on the platform default.
    with open("errors.txt", "a", encoding="utf-8") as error_file:
        error_file.write(url + "\n")


def _load_json_ld(raw):
    """Decode a JSON string, retrying once with tabs replaced by spaces.

    The retry only happens when the decoder complains about an invalid
    control character; any other decode error is re-raised unchanged.
    """
    try:
        return json.loads(raw)
    except json.JSONDecodeError as err:
        # The C decoder reports exactly "Invalid control character at", the
        # pure-Python decoder includes the offending character in the message,
        # so match on the common prefix instead of the full text.
        if not err.msg.startswith("Invalid control character"):
            raise
        return json.loads(raw.replace("\t", " "))


def parse_recipe_info(url, text, verbose=True):
    """Parse the HTML of a recipe page and extract its JSON-LD recipe data.

    Args:
        url: Address of the page. Used (minus its last character) as the key
            of the returned dict, and written to ``errors.txt`` when the page
            has no usable JSON-LD block.
        text: Page HTML as ``str`` or UTF-8 encoded ``bytes``.
        verbose: Currently unused; kept for interface compatibility.

    Returns:
        ``{url: {...}}`` with the keys ``image``, ``name``, ``quantity``,
        ``ingredients``, ``instructions`` and ``comments`` (``comments`` is
        ``None`` when the data has no reviews), or ``None`` when the page has
        fewer than two ``application/ld+json`` script tags or the second one
        is empty.

    Raises:
        json.JSONDecodeError: If the JSON-LD block cannot be decoded.
        KeyError: If a required recipe field is missing from the data.
    """
    markup = text.decode("utf-8") if isinstance(text, bytes) else text
    soup = bs(markup, "lxml")

    # The second JSON-LD script tag is the one that is read.
    # NOTE(review): presumably the first tag holds site-level metadata and
    # the second the recipe itself - confirm against the scraped pages.
    scripts = soup.find_all("script", type="application/ld+json")
    if len(scripts) < 2 or scripts[1].string is None:
        _log_failed_url(url)
        return None

    info_json = _load_json_ld(scripts[1].string)

    # NOTE(review): the last character is dropped unconditionally; presumably
    # a trailing "/" or newline - confirm that every caller passes one.
    url = url[:-1]

    # Pages without reviews (or reviews without a body) yield no comments.
    try:
        comments = get_comments(info_json["reviews"])
    except KeyError:
        comments = None

    return {
        url: {
            "image": info_json["image"],
            "name": info_json["name"],
            "quantity": info_json["recipeYield"],
            "ingredients": info_json["recipeIngredient"],
            "instructions": info_json["recipeInstructions"],
            "comments": comments,
        }
    }