initial commit of project
This commit is contained in:
121
crawl_recipes/parse_category_subpages.py
Normal file
121
crawl_recipes/parse_category_subpages.py
Normal file
@@ -0,0 +1,121 @@
|
||||
# adapted by Franziska Paulus from:
|
||||
# Copyright 2016 Leon Zucchini
|
||||
#
|
||||
# This file is part of the "recipes" project
|
||||
# Repository: https://github.com/leonzucchini/recipes
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
from bs4 import BeautifulSoup as bs
|
||||
import pandas as pd
|
||||
# import py2neo as pn
|
||||
|
||||
def parse_info(file_path, counter=1, verbose=True):
    """Parse a saved category-list HTML file for recipe metadata.

    Reads the file at *file_path*, finds the ``application/ld+json`` search-hit
    tags and extracts per-recipe information (category, category list page,
    URL, title, subtitle, votes, difficulty, prep time, activation date).

    Parameters
    ----------
    file_path : str
        Path to a saved HTML page. The basename must match ``g<digits><text>_...``
        (the category name is extracted from it).
    counter : int, optional
        Running counter used only for progress messages; incremented once per
        search hit and returned so callers can keep a global count.
    verbose : bool, optional
        When True, print one success/failure line per search hit.

    Returns
    -------
    tuple
        ``(recipes, this_counter)`` where ``recipes`` maps recipe id to a dict
        of the extracted fields and ``this_counter`` is the updated counter.
    """
    this_counter = counter

    # Open file and parse with BS4 (explicit encoding so scraped pages decode
    # the same way on every platform).
    with open(file_path, "r", encoding="utf-8") as f:
        soup = bs(f.read(), "lxml")

    # Category is encoded in the file name, e.g. "g12Salat_0.txt" -> "Salat"
    category = re.match(r"g\d+(\w+)_.*", os.path.basename(file_path)).group(1)
    recipes = {}

    search_hits = soup.find_all("script", type="application/ld+json")
    for hit in search_hits:
        # Parse results in search hit for basic information
        try:
            # Recipe ID (renamed from `id` to avoid shadowing the builtin)
            recipe_id = hit['id'].replace('recipe-', '')
            recipes[recipe_id] = {}

            # Info on category and category list page, reconstructed from the
            # file name: page number, category text, and sub-page number.
            recipes[recipe_id]["category"] = category
            pNum, rText, sNum = re.match(r".*(g\d+)(.*)_(\d*).*\.(txt)", file_path).group(1, 2, 3)
            recipes[recipe_id]["category_list_page"] = "www.chefkoch.de/rs/s" + sNum + pNum + "/" + rText + ".html"

            # URL, title, subtitle
            recipes[recipe_id]["url"] = "".join(["http://www.chefkoch.de", hit.a['href']])
            recipes[recipe_id]["title"] = hit.a.find("div", class_="search-list-item-title").get_text().strip().replace("\n", "")
            recipes[recipe_id]["subtitle"] = hit.a.find("p", class_="search-list-item-subtitle").get_text().strip().replace("\n", "")

            # Votes (number and average) come from the stars-span title text:
            # leading digits are the count, the last whitespace-separated token
            # is the average.
            votes_raw = hit.a.find("span", class_="search-list-item-uservotes-stars")["title"]
            recipes[recipe_id]["votes_n"] = re.match(r"^(\d*)\s.*", votes_raw).group(1)
            recipes[recipe_id]["votes_avg"] = re.match(r".*\s(.*?)$", votes_raw).group(1)

            # Other info
            recipes[recipe_id]["difficulty"] = hit.a.find("span", class_="search-list-item-difficulty").get_text()
            recipes[recipe_id]["preptime"] = hit.a.find("span", class_="search-list-item-preptime").get_text()
            recipes[recipe_id]["activationdate"] = hit.a.find("span", class_="search-list-item-activationdate").get_text()

            if verbose:
                print(file_path + " #" + str(this_counter) + " successfully parsed")

        except Exception:
            # Best-effort scraping: a hit that lacks the expected structure is
            # skipped (and reported when verbose) rather than aborting the file.
            if verbose:
                print(file_path + " #" + str(this_counter) + " problem with parsing")

        this_counter += 1

    return (recipes, this_counter)
|
||||
|
||||
def parse_subpage(file_path, counter=1, verbose=True):
    """Parse a saved category sub-page and return its recipe URLs as a list.

    The second ``application/ld+json`` script tag on the page carries an
    item list; each element's ``url`` field is collected in page order.
    ``counter`` and ``verbose`` are accepted for interface symmetry with
    the other parser but are not used here.
    """
    counter_unused = counter  # kept only so the signature mirrors parse_info

    # Read the saved HTML and hand it to BeautifulSoup.
    with open(file_path, "r") as fh:
        page = bs(fh.read(), "lxml")

    # The item list lives in the *second* ld+json tag; tab characters inside
    # the payload are replaced with spaces so json.loads accepts it.
    item_list_tag = page.find_all("script", type="application/ld+json")[1]
    item_list = json.loads(item_list_tag.string.replace("\t", " "))

    # Collect every recipe URL in document order.
    return [entry["url"] for entry in item_list["itemListElement"]]
|
||||
|
||||
|
||||
# def neo_dict(dictionary):
|
||||
# """ Write content of dictionary to graph database as attributes (node = dict name). """
|
||||
# pass
|
||||
|
||||
def main():
    """Collect recipe URLs from every saved sub-page file and write them out.

    Walks all files under ``folder_path``, extracts recipe URLs via
    ``parse_subpage``, de-duplicates them (first occurrence wins) and writes
    one URL per line to ``out_path``. Progress is printed along the way.
    """
    folder_path = "crawl_recipes/textFiles/"
    out_path = "recipe_urls.txt"
    curr_file_nr = 0
    curr_url_nr = 0

    # Get file names from folder. (A plain comprehension replaces the
    # original side-effect-only list comprehension around .append.)
    file_paths = [os.path.join(folder_path, fn) for fn in os.listdir(folder_path)]
    # file_paths.pop(0)  # Remove log file from list

    # Get URLs. `seen` gives O(1) duplicate checks (the original tested
    # membership in a list, which is O(n) per URL) while `all_recipe_urls`
    # preserves first-seen order for the output file.
    counter = 1
    all_recipe_urls = []
    seen = set()
    for fp in file_paths:
        print("file nr: " + str(curr_file_nr))
        curr_file_nr = curr_file_nr + 1
        urls = parse_subpage(fp, counter=counter, verbose=False)
        for url in urls:
            print("URL Nr.: " + str(curr_url_nr))
            curr_url_nr = curr_url_nr + 1

            if url not in seen:
                seen.add(url)
                all_recipe_urls.append(url)

    # Save to file, one URL per line.
    with open(out_path, 'w') as f:
        for item in all_recipe_urls:
            print(item)
            f.write("%s\n" % item)

    print("All done!")
|
||||
|
||||
# Run the crawler only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user