initial commit of project
This commit is contained in:
79
crawl_recipes/crawl_recipe_pages.py
Normal file
79
crawl_recipes/crawl_recipe_pages.py
Normal file
@@ -0,0 +1,79 @@
|
||||
# adapted by Franziska Paulus from:
|
||||
# Copyright 2016 Leon Zucchini
|
||||
#
|
||||
# This file is part of the "recipes" project
|
||||
# Repository: https://github.com/leonzucchini/recipes
|
||||
"""
|
||||
Crawl through recipe pages and parse data to database
|
||||
"""
|
||||
import json
|
||||
import os
|
||||
import requests
|
||||
# import grequests
|
||||
import pandas as pd
|
||||
import crawl_recipes.parse_recipe_pages
|
||||
from crawl_recipes.tools import (
|
||||
get_user_agent,
|
||||
get_response
|
||||
)
|
||||
|
||||
def main():
    """Crawl recipe pages and accumulate parsed data into a JSON dataset.

    Reads one URL per line from ``recipe_urls.txt``, fetches each page with a
    fixed desktop user-agent, parses it via
    ``crawl_recipes.parse_recipe_pages.parse_recipe_info`` and merges the
    result into a single dict.  The dataset is checkpointed to
    ``crawl_recipes/dataset.json`` roughly every 1000 pages and written once
    more after the loop completes.
    """
    # Single fixed user-agent; an earlier revision rotated agents from a
    # downloaded list via get_user_agent (see crawl_recipes.tools).
    user_agent = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 5.1; fr-FR) '
                      'AppleWebKit/528.16 (KHTML, like Gecko) '
                      'Version/4.0 Safari/528.16'
    }
    url_list_path = "recipe_urls.txt"
    dataset_path = "crawl_recipes/dataset.json"

    # Accumulator for all parsed recipes (was json.loads('{}') — a plain
    # dict literal is equivalent and clearer).
    recipe_json = {}

    # Context manager guarantees the URL file is closed even if a fetch or
    # parse raises; the original open()/close() pair leaked on error.
    with open(url_list_path, 'r') as url_file:
        for count, line in enumerate(url_file, start=1):
            # Lines read from a file keep their trailing newline; the original
            # passed it straight into the request URL — strip it here.
            url = line.strip()
            if not url:
                continue  # tolerate blank lines in the URL list

            print("parsing recipe " + str(count) + " from URL: " + url)

            # Fetch the page with the selected user-agent.
            response = get_response.HTMLresponse(url, user_agent=user_agent)
            text = response.text

            # Parse info on page; parse_recipe_info is expected to return
            # None on failure (guarded below) — TODO confirm against module.
            result = crawl_recipes.parse_recipe_pages.parse_recipe_info(url, text)
            if result is not None:
                recipe_json.update(result)

            # Periodic checkpoint so a crash does not lose all progress.
            if count % 1000 == 0:
                with open(dataset_path, 'w') as f:
                    json.dump(recipe_json, f, indent=4)

    # Final save of the complete dataset.
    with open(dataset_path, 'w') as f:
        json.dump(recipe_json, f, indent=4)
    print("Done!")


if __name__ == "__main__":
    # Guarded entry point: the original called main() unconditionally on
    # import, which ran the whole crawl as a side effect.
    main()
|
||||
Reference in New Issue
Block a user