initial commit of project

2021-04-11 19:51:12 +02:00
commit a21a8186d9
110 changed files with 16326178 additions and 0 deletions
--- a/crawl_recipes/crawl_category_subpages.py
+++ b/crawl_recipes/crawl_category_subpages.py
@@ -0,0 +1,119 @@
+# adapted by Franziska Paulus from:
+# Copyright 2016 Leon Zucchini
+#
+# This file is part of the "recipes" project
+# Repository: https://github.com/leonzucchini/recipes
+
+import os
+import re
+from datetime import datetime as dt
+
+from crawl_recipes.tools import (
+    get_input,
+    get_response,
+    make_folders,
+    remove_pyc,
+    write_text
+)
+
+
+
+def crawl_categories(category_dict, folder_path,
+                log_name, log_option="Exit", files_option="Exit",
+                verbose=False, short_cycle=False):
+    """
+    Get syntax framework for URLs of recipe category sub-pages (containing links to recipes).
+    Crawl through recipe category sub-pages (loop over varying parts of URLs). 
+    Get HTML, check for errors, and store results.    
+    """
+
+    log_list = []
+    log_path = os.path.join(folder_path, log_name)
+
+    # Get syntax framework
+    url_tuples = []
+    for k, v in category_dict.items():
+        test = re.match(r"(.*chefkoch.de/rs/s)\d+(.*)", v)
+        url_tuples.append((test.group(1), test.group(2)))
+
+    for url_tuple in url_tuples: 
+
+        # Cycle through URL sub-pages
+        SUBPAGE_NO = 0
+        URL_ERROR_COUNT = 0
+
+        while True:
+            # Cycle through increments in URL pages
+
+            # Set break points
+            break_now = URL_ERROR_COUNT > 5 # Break after 5 bad responses
+            if short_cycle:
+                break_now = URL_ERROR_COUNT > 5 or SUBPAGE_NO > 100 # Use shorter cycle for testing
+            if break_now:
+                break
+
+            else:
+                # Get response
+                cat_url = "".join([url_tuple[0], str(SUBPAGE_NO), url_tuple[1]])
+                html_response = get_response.HTMLresponse(cat_url)
+
+                # Check for html get errors
+                if isinstance(html_response.text, bytes):
+                    not_found = re.match("Zu deiner Suchanfrage konnten keine Rezepte gefunden werden.", html_response.text.decode("utf-8"))
+                else:
+                    not_found = re.match("Zu deiner Suchanfrage konnten keine Rezepte gefunden werden.", html_response.text)
+                if not_found or html_response.error:
+
+                    # Add to error count and log
+                    URL_ERROR_COUNT += 1
+                    log_list.append(html_response.error_message)
+                    if verbose:
+                        print(html_response.error_message)
+
+                else:
+                    # If no error write text to file
+                    get_message = " ".join(["Got category sub-page:", cat_url])
+                    log_list.append(get_message)
+                    file_name = ".".join(["_".join([url_tuple[1][:-5].replace("/",""), str(SUBPAGE_NO)]), "txt"])
+                    file_path = os.path.join(folder_path, file_name)
+                    write_text.write_text(html_response.text, file_path, files_option)
+
+                    if verbose:
+                        print(get_message)
+
+                # Increment of 30 due to specific syntax of chefkoch.de category sub-pages
+                SUBPAGE_NO += 30 
+
+    # Write log to file
+    write_text.write_text("\n".join(log_list), log_path, log_option)
+
+def main():
+    """ Crawl through the chefkoch.de recipe category sub-pages and store HTML to files.
+        Note:
+            Recipes in chefkoch.de are sorted into categories (e.g. baking).
+            Each category has many sub-pages containing links to 30 recipes.
+            This file crawles through and stores the sub-pages so I can parse for the links later. 
+    """
+
+    # Define path to configs 
+    config_path = "crawl_recipes/config/config.json"
+    
+    # Get configs and set up paths
+    config = get_input.get_input(config_path, print_config=False)
+    output_path = os.path.join(config['_home'], config['_linkFiles'])
+    make_folders.make_output_folder(output_path, debug=False) 
+
+    # Parse cateogry urls and store pages to local files
+    categories_path = os.path.join(config['_home'], config['category_links'])
+    categories_links = get_input.get_input(categories_path, print_config=False)
+    crawl_categories(
+        categories_links, output_path,
+        log_name = "_category_log.txt", log_option="Append",
+        files_option="Exit", verbose=True, short_cycle=False
+        )
+
+    # Clear up pyc files
+    remove_pyc.remove_pyc(config['_home'])
+    print("All done!")
+
+main()