# MasterarbeitCode/crawl_recipes/parse_category_subpages.py

# adapted by Franziska Paulus from:
# Copyright 2016 Leon Zucchini
#
# This file is part of the "recipes" project
# Repository: https://github.com/leonzucchini/recipes

import json
import os
import re
from bs4 import BeautifulSoup as bs
import pandas as pd
# import py2neo as pn

def parse_info(file_path, counter=1, verbose=True):
    """Parse one saved category list page into a dict of recipe records.

    The file is read and handed to BeautifulSoup (lxml parser). Every
    ``<script type="application/ld+json">`` element found is treated as a
    search hit and parsed on a best-effort basis: as soon as one lookup for
    a hit fails, parsing of that hit stops, and whatever was already stored
    for it stays in the result.

    Args:
        file_path: Path of the saved page. The category is taken from its
            base name, which must match ``g<digits><category>_...``.
        counter: Number given to the first hit in the progress messages.
        verbose: When true, print one status line per hit.

    Returns:
        Tuple ``(recipes, next_counter)``: ``recipes`` maps recipe id to a
        dict of parsed fields, ``next_counter`` is ``counter`` plus the
        number of hits seen.

    Raises:
        AttributeError: If the base name of ``file_path`` does not match the
            expected pattern.
    """
    # Load the page and build the parse tree
    with open(file_path, "r") as page:
        soup = bs(page.read(), "lxml")

    # The category name is encoded in the file name
    name_match = re.match(r"g\d+(\w+)_.*", os.path.basename(file_path))
    category = name_match.group(1)

    recipes = {}
    search_hits = soup.find_all("script", type="application/ld+json")

    for hit_number, hit in enumerate(search_hits, start=counter):
        try:
            # Recipe ID; the entry is registered before any further lookup,
            # so a later failure leaves a partially filled record behind
            recipe_id = hit['id'].replace('recipe-', '')
            entry = recipes[recipe_id] = {}

            # Category and the list page this file was crawled from
            entry["category"] = category
            path_match = re.match(r".*(g\d+)(.*)_(\d*).*\.(txt)", file_path)
            page_id, slug, offset = path_match.group(1, 2, 3)
            entry["category_list_page"] = "".join(
                ["www.chefkoch.de/rs/s", offset, page_id, "/", slug, ".html"]
            )

            # Everything else lives inside the hit's first <a> element
            link = hit.a
            entry["url"] = "http://www.chefkoch.de" + link['href']

            title_tag = link.find("div", class_="search-list-item-title")
            entry["title"] = title_tag.get_text().strip().replace("\n", "")

            subtitle_tag = link.find("p", class_="search-list-item-subtitle")
            entry["subtitle"] = subtitle_tag.get_text().strip().replace("\n", "")

            # Votes: leading digits are the count, the last token the average
            stars_tag = link.find("span", class_="search-list-item-uservotes-stars")
            votes_raw = stars_tag["title"]
            entry["votes_n"] = re.match(r"^(\d*)\s.*", votes_raw).group(1)
            entry["votes_avg"] = re.match(r".*\s(.*?)$", votes_raw).group(1)

            # Remaining plain-text fields
            difficulty_tag = link.find("span", class_="search-list-item-difficulty")
            entry["difficulty"] = difficulty_tag.get_text()
            preptime_tag = link.find("span", class_="search-list-item-preptime")
            entry["preptime"] = preptime_tag.get_text()
            date_tag = link.find("span", class_="search-list-item-activationdate")
            entry["activationdate"] = date_tag.get_text()

            if verbose:
                print(file_path + " #" + str(hit_number) + " successfully parsed")

        except Exception:
            # Best effort: report the hit and move on to the next one
            if verbose:
                print(file_path + " #" + str(hit_number) + " problem with parsing")

    return (recipes, counter + len(search_hits))

def parse_subpage(file_path, counter=1, verbose=True):
    """Extract the recipe URLs listed on one saved category subpage.

    The second ``<script type="application/ld+json">`` element of the page
    is decoded as JSON, and the ``url`` value of every entry in its
    ``itemListElement`` array is collected.

    Args:
        file_path: Path of the saved HTML page.
        counter: Unused; kept because existing callers pass it.
        verbose: Unused; kept because existing callers pass it.

    Returns:
        List of the ``url`` values, in the order they appear on the page.

    Raises:
        IndexError: If the page has fewer than two ld+json script elements.
        AttributeError: If the selected script element has no single string
            content (``.string`` is ``None``).
        ValueError: If the script content is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError``).
        KeyError: If the JSON has no ``itemListElement`` key or an entry has
            no ``url`` key.
    """
    # Open file and parse with BS4
    with open(file_path, "r") as f:
        soup = bs(f.read(), "lxml")

    # Index 1 selects the second ld+json block; presumably that is the one
    # carrying the item list on these pages - verify if the page layout changes.
    item_list_script = soup.find_all("script", type="application/ld+json")[1]

    # json.loads rejects raw tab characters inside JSON strings, so they are
    # replaced with spaces before decoding.
    hits_json = json.loads(item_list_script.string.replace("\t", " "))

    return [hit["url"] for hit in hits_json["itemListElement"]]


# def neo_dict(dictionary):
#     """ Write content of dictionary to graph database as attributes (node = dict name). """
#     pass

def main():
    """Collect the unique recipe URLs of all saved subpages into one file.

    Every entry of ``crawl_recipes/textFiles/`` is parsed with
    ``parse_subpage``. The URLs are de-duplicated, keeping the order in which
    they were first seen, and written one per line to ``recipe_urls.txt``.
    Progress (file number, running URL number, each saved URL) is printed.
    """
    folder_path = "crawl_recipes/textFiles/"
    out_path = "recipe_urls.txt"

    # Get file names from folder.
    # NOTE: every directory entry is parsed; a file that is not a saved
    # subpage (e.g. a log file) will make parse_subpage raise.
    file_paths = [os.path.join(folder_path, fn) for fn in os.listdir(folder_path)]

    # Get URLs. The set gives O(1) duplicate checks; the list keeps
    # first-seen order for the output file.
    curr_url_nr = 0
    seen_urls = set()
    all_recipe_urls = []
    for curr_file_nr, fp in enumerate(file_paths):
        print("file nr: " + str(curr_file_nr))
        for url in parse_subpage(fp, counter=1, verbose=False):
            # Numbering counts every URL encountered, duplicates included
            print("URL Nr.: " + str(curr_url_nr))
            curr_url_nr += 1

            if url not in seen_urls:
                seen_urls.add(url)
                all_recipe_urls.append(url)

    # Save to file
    with open(out_path, 'w') as f:
        for item in all_recipe_urls:
            print(item)
            f.write("%s\n" % item)

    print("All done!")

# Run the crawl only when executed as a script, so that importing this module
# (e.g. to reuse parse_info / parse_subpage) has no side effects.
if __name__ == "__main__":
    main()