# MasterarbeitCode/crawl_recipes/parse_recipe_pages.py

# adapted by Franziska Paulus from:
# Copyright 2016 Leon Zucchini
#
# This file is part of the "recipes" project
# Repository: https://github.com/leonzucchini/recipes
"""
Parse information from recipe pages for storage to database
"""

import re
import json
from pyld import jsonld
from bs4 import BeautifulSoup as bs

def get_comments(reviews):
    """Return the "reviewBody" value of every entry in *reviews*, in order.

    Raises KeyError if an entry has no "reviewBody" key.
    """
    return [review["reviewBody"] for review in reviews]

def parse_recipe_info(url, text, verbose=True):
    """Parse a recipe page's HTML and extract recipe data from its JSON-LD block.

    The second ``<script type="application/ld+json">`` element of the page is
    decoded as JSON and selected fields are copied into the result.

    Args:
        url: Page URL. Its last character is dropped before it is used as the
            key of the returned dict; the unmodified value is what gets
            written to ``errors.txt`` on failure.
        text: Page HTML, either ``str`` or UTF-8 encoded ``bytes``.
        verbose: Currently unused; kept so existing callers keep working.

    Returns:
        ``{url[:-1]: info}`` where ``info`` has the keys ``"image"``,
        ``"name"``, ``"quantity"``, ``"ingredients"``, ``"instructions"`` and
        ``"comments"`` (a list of review bodies, or ``None`` when a
        ``KeyError`` occurs while collecting them).
        ``None`` if the page has fewer than two ld+json script elements; in
        that case ``url`` is appended to ``errors.txt``.

    Raises:
        json.JSONDecodeError: If the JSON-LD payload is invalid for a reason
            other than a control character, or is still invalid after tabs
            have been replaced by spaces.
        KeyError: If the payload lacks ``"image"``, ``"name"``,
            ``"recipeYield"``, ``"recipeIngredient"`` or
            ``"recipeInstructions"``.
    """
    # Decode raw bytes up front so BeautifulSoup always receives text.
    html = text.decode("utf-8") if isinstance(text, bytes) else text
    soup = bs(html, "lxml")

    # The recipe data is taken from the *second* ld+json script on the page.
    ld_scripts = soup.find_all("script", type="application/ld+json")
    if len(ld_scripts) < 2:
        # Record the page so it can be inspected later, then give up on it.
        with open("errors.txt", "a", encoding="utf-8") as error_log:
            error_log.write(url + "\n")
        return None
    info = ld_scripts[1]

    try:
        info_json = json.loads(info.string)
    except json.JSONDecodeError as err:
        # Match on the prefix only: the C scanner reports
        # "Invalid control character at", while the pure-Python scanner
        # includes the offending character in the message.
        if not err.msg.startswith("Invalid control character"):
            raise
        # Raw tabs inside JSON strings are rejected by the strict decoder;
        # retry with them replaced by spaces.
        info_json = json.loads(info.string.replace("\t", " "))

    # Drop the last character of the URL before using it as the result key
    # (presumably a trailing newline or slash -- verify against the caller).
    url = url[:-1]

    # Collect review texts; a missing "reviews" or "reviewBody" key yields None.
    try:
        comments = get_comments(info_json["reviews"])
    except KeyError:
        comments = None

    return {
        url: {
            "image": info_json["image"],
            "name": info_json["name"],
            "quantity": info_json["recipeYield"],
            "ingredients": info_json["recipeIngredient"],
            "instructions": info_json["recipeInstructions"],
            "comments": comments,
        }
    }