Files
MasterarbeitCode/crawl_recipes/parse_recipe_pages.py
2021-04-11 23:28:41 +02:00

57 lines
1.7 KiB
Python

# adapted by Franziska Paulus from:
# Copyright 2016 Leon Zucchini
#
# This file is part of the "recipes" project
# Repository: https://github.com/leonzucchini/recipes
"""
Parse information from recipe pages for storage to database
"""
import re
import json
from pyld import jsonld
from bs4 import BeautifulSoup as bs
def get_comments(reviews):
    """Collect the text of every review.

    Args:
        reviews: Iterable of review entries; each entry is indexed with the
            key "reviewBody".

    Returns:
        A list with the "reviewBody" value of each entry, in input order.

    Raises:
        KeyError: If an entry is a mapping without a "reviewBody" key.
    """
    return [review["reviewBody"] for review in reviews]
def parse_recipe_info(url, text, verbose=True):
    """Parse the JSON-LD metadata of a recipe page into a dict of dicts.

    Args:
        url: URL of the page. Its last character is dropped before it is used
            as the key of the returned dict.
            NOTE(review): presumably this strips a trailing slash or newline;
            confirm against the caller.
        text: HTML of the page, either as ``str`` or as UTF-8 encoded ``bytes``.
        verbose: Unused; kept so existing callers keep working.

    Returns:
        ``{key: {...}}`` where ``key`` is ``url`` without its last character
        and the inner dict has the entries "image", "name", "quantity",
        "ingredients", "instructions" and "comments". "comments" is ``None``
        when the review texts cannot be read from the metadata.
        Returns ``None`` when the page has fewer than two
        ``application/ld+json`` script tags; in that case the URL is appended
        to ``errors.txt`` in the current working directory.

    Raises:
        json.JSONDecodeError: If the metadata is not valid JSON, even after
            tabs were replaced by spaces.
        KeyError: If the metadata lacks "image", "name", "recipeYield",
            "recipeIngredient" or "recipeInstructions".
    """
    # Parse the page with BS4; bytes input is decoded as UTF-8 first.
    html = text.decode("utf-8") if isinstance(text, bytes) else text
    soup = bs(html, "lxml")

    # The recipe metadata is read from the second ld+json script tag.
    ld_scripts = soup.find_all("script", type="application/ld+json")
    if len(ld_scripts) < 2:
        # Record the URL so pages without usable metadata can be inspected.
        with open("errors.txt", "a", encoding="utf-8") as error_log:
            error_log.write(url + "\n")
        return None
    raw_json = ld_scripts[1].string

    try:
        info_json = json.loads(raw_json)
    except json.JSONDecodeError as err:
        # Only the "invalid control character" failure is repaired here.
        # Match on the prefix: the exact wording of the message is not the
        # same for every decoder implementation, so an equality test can
        # miss the error and skip the repair.
        if not err.msg.startswith("Invalid control character"):
            raise
        # Raw tabs inside JSON strings are the control characters this
        # fallback repairs; they are replaced by spaces before retrying.
        info_json = json.loads(raw_json.replace("\t", " "))

    page_key = url[:-1]

    # Put the review texts into a list. A KeyError from a missing "reviews"
    # entry, or from a review without "reviewBody", results in no comments.
    try:
        comments = get_comments(info_json["reviews"])
    except KeyError:
        comments = None

    return {
        page_key: {
            "image": info_json["image"],
            "name": info_json["name"],
            "quantity": info_json["recipeYield"],
            "ingredients": info_json["recipeIngredient"],
            "instructions": info_json["recipeInstructions"],
            "comments": comments,
        }
    }