57 lines
1.7 KiB
Python
57 lines
1.7 KiB
Python
# adapted by Franziska Paulus from:
|
|
# Copyright 2016 Leon Zucchini
|
|
#
|
|
# This file is part of the "recipes" project
|
|
# Repository: https://github.com/leonzucchini/recipes
|
|
"""
|
|
Parse information from recipe pages for storage to database
|
|
"""
|
|
|
|
import re
|
|
import json
|
|
from pyld import jsonld
|
|
from bs4 import BeautifulSoup as bs
|
|
|
|
def get_comments(reviews):
    """Return the text body of every review as a list of strings.

    Args:
        reviews: Iterable of review mappings, each holding its text under
            the "reviewBody" key. ``None`` or an empty iterable is treated
            as "no reviews".

    Returns:
        A list with the "reviewBody" value of each review, in input order.

    Raises:
        KeyError: If a review has no "reviewBody" entry.
    """
    # A missing/null review list simply means there is nothing to collect.
    if not reviews:
        return []
    return [review["reviewBody"] for review in reviews]
|
|
|
|
def _load_ld_json(raw):
    """Decode a JSON-LD payload, retrying once with tabs replaced by spaces."""
    try:
        return json.loads(raw)
    except json.JSONDecodeError as err:
        # Match on the prefix only: the exact wording of this message is not
        # identical across json scanner implementations (some variants embed
        # the offending character), so an equality test can miss it.
        if not err.msg.startswith("Invalid control character"):
            raise
        return json.loads(raw.replace("\t", " "))


def parse_recipe_info(url, text, verbose=True):
    """Parse a recipe page's JSON-LD metadata into a dict keyed by URL.

    Args:
        url: Address of the page. It is written unchanged to "errors.txt"
            when the page cannot be parsed; otherwise its last character is
            dropped and the remainder becomes the key of the result.
        text: HTML of the page, as ``str`` or UTF-8 encoded ``bytes``.
        verbose: Currently unused; kept so existing callers keep working.

    Returns:
        ``{url: {"image", "name", "quantity", "ingredients", "instructions",
        "comments"}}``, where "comments" is a list of review bodies or
        ``None`` when a "reviews"/"reviewBody" key is missing.
        ``None`` is returned when the page has fewer than two
        ``application/ld+json`` script tags.

    Raises:
        json.JSONDecodeError: If the JSON-LD payload is invalid for a reason
            other than a raw control character, or is still invalid after
            tabs have been replaced.
        KeyError: If the payload lacks "image", "name", "recipeYield",
            "recipeIngredient" or "recipeInstructions".
    """
    # Parse the page with BS4, decoding raw bytes first.
    html = text.decode("utf-8") if isinstance(text, bytes) else text
    soup = bs(html, "lxml")

    # The recipe metadata is read from the second JSON-LD script tag.
    ld_scripts = soup.find_all("script", type="application/ld+json")
    if len(ld_scripts) < 2:
        # Record the page so that it can be inspected later.
        with open("errors.txt", "a") as error_log:
            error_log.write(url + "\n")
        return None

    info_json = _load_ld_json(ld_scripts[1].string)

    # Drop the final character of the URL before using it as the key
    # (presumably a trailing slash or newline -- confirm against callers).
    url = url[:-1]

    # Put comments into a list; a KeyError here means no usable reviews.
    try:
        comments = get_comments(info_json["reviews"])
    except KeyError:
        comments = None

    return {
        url: {
            "image": info_json["image"],
            "name": info_json["name"],
            "quantity": info_json["recipeYield"],
            "ingredients": info_json["recipeIngredient"],
            "instructions": info_json["recipeInstructions"],
            "comments": comments,
        }
    }
|