initial commit of project
This commit is contained in:
0
crawl_recipes/tools/__init__.py
Normal file
0
crawl_recipes/tools/__init__.py
Normal file
23
crawl_recipes/tools/get_input.py
Normal file
23
crawl_recipes/tools/get_input.py
Normal file
@@ -0,0 +1,23 @@
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
|
||||
def get_input(file_path, print_config=False):
    """Load user settings from a JSON file and return the parsed object.

    If ``file_path`` does not exist, an error is printed and the process
    exits with status 1. When ``print_config`` is true, the parsed settings
    are echoed as indented JSON with sorted keys before being returned.
    """
    # Guard clause: stop right away when the input file is missing
    path_missing = not os.path.exists(file_path)
    if path_missing:
        print('GetInput error: Input path does not exist:\n%s' % (file_path))
        sys.exit(1)

    # Parse the JSON document into a Python object
    with open(file_path, "r") as handle:
        settings = json.load(handle)

    # Optionally pretty-print what was loaded
    if print_config:
        pretty = json.dumps(settings, indent=4, sort_keys=True)
        print(pretty)

    return settings
|
||||
28
crawl_recipes/tools/get_response.py
Normal file
28
crawl_recipes/tools/get_response.py
Normal file
@@ -0,0 +1,28 @@
|
||||
import requests
|
||||
# from tools import get_user_agent
|
||||
|
||||
class HTMLresponse(object):
    """Outcome of one HTTP GET request for an HTML page.

    The request is performed in the constructor. Request failures are not
    raised; they are recorded on the instance instead:

    * ``error``         -- 0 on success, 1 if the request failed.
    * ``error_message`` -- "Get error: <url>" on failure, otherwise "".
    * ``response``      -- string form of the caught exception on failure,
                           otherwise "".
    * ``text``          -- page body encoded as UTF-8 bytes on success;
                           left as the empty string "" on failure.
    """

    def __init__(self, url, user_agent=None, timeout=None):
        """Try to fetch ``url`` and record the body or the failure.

        Args:
            url: Address to request.
            user_agent: Optional dict of request headers (for example
                ``{"user-agent": "..."}``). ``None`` sends the library's
                default headers.
            timeout: Optional timeout in seconds handed to ``requests.get``.
                The default ``None`` waits indefinitely, as before.
        """
        self.error = 0
        self.error_message = ""
        # Always defined, so callers can read it after a successful request
        self.response = ""
        self.text = ""

        # Try to get HTML response
        try:
            # headers=None is the requests default, so one call covers both
            # the "with user agent" and "without user agent" cases.
            r = requests.get(url, headers=user_agent, timeout=timeout)
            r.raise_for_status()
            self.text = r.text.encode('utf-8')

        # If error, record flag, exception detail and a short message
        except requests.exceptions.RequestException as err:
            self.error = 1
            self.response = str(err)
            self.error_message = "".join(["Get error: ", url])
|
||||
17
crawl_recipes/tools/get_user_agent.py
Normal file
17
crawl_recipes/tools/get_user_agent.py
Normal file
@@ -0,0 +1,17 @@
|
||||
"""
|
||||
Read user agents from list (local file) and select one for the request.
|
||||
"""
|
||||
import random
|
||||
|
||||
def user_agent_list(user_agent_list_path):
    """Return every line of the user-agent file as a list of strings.

    Lines are returned exactly as read, so each one keeps its trailing
    newline character.
    """
    with open(user_agent_list_path, "r") as handle:
        return list(handle)
|
||||
|
||||
def select_user_agent(agent_list):
    """Pick a random user agent and return it as a request-header dict.

    Each entry has newlines and double quotes removed and surrounding
    whitespace stripped. Entries that are empty after cleaning (such as a
    trailing blank line in the source file) are never selected, so the
    returned header value is always non-empty.

    Args:
        agent_list: Sequence of user-agent strings, e.g. raw lines of a file.

    Returns:
        A dict of the form ``{"user-agent": <agent string>}``.

    Raises:
        ValueError: If ``agent_list`` contains no usable entry.
    """
    cleaned = (
        raw.replace("\n", "").replace("\"", "").strip() for raw in agent_list
    )
    candidates = [agent for agent in cleaned if agent]

    if not candidates:
        raise ValueError("select_user_agent: no usable user agents in list")

    return {"user-agent": random.choice(candidates)}
|
||||
50
crawl_recipes/tools/make_folders.py
Normal file
50
crawl_recipes/tools/make_folders.py
Normal file
@@ -0,0 +1,50 @@
|
||||
import os
|
||||
import sys
|
||||
import shutil
|
||||
import re
|
||||
|
||||
def make_output_folder(folder_path, debug=False):
    """Make the output folder, asking what to do if it already exists.

    * ``debug`` true: nothing is created and a warning is printed (this
      avoids the interactive prompt).
    * Folder missing: it is created with ``os.mkdir``.
    * Folder present: the user is asked whether to add to it ('a'),
      overwrite it ('o') or exit ('e'). The question is repeated until the
      reply is exactly one of those letters (case and surrounding
      whitespace are ignored).

    Args:
        folder_path: Path of the output folder.
        debug: When true, skip folder creation entirely.

    Returns:
        None. Choosing 'e' ends the process via ``sys.exit(1)``.
    """
    # Skip if debug (avoids replace prompt)
    if debug:
        print("FolderSetup warning: Not creating directory because debug = True")
        return None

    # If destination folder does not exist then create it
    if not os.path.exists(folder_path):
        os.mkdir(folder_path)
        return None

    # Otherwise give a choice to add to, replace (overwrite), or exit
    confirm_prompt = "The following folder exists:" + "\n" + \
        str(folder_path) + "\n" + \
        "Would you like to add to it ('a'), overwrite ('o'), or exit ('e'): "
    confirm = input(confirm_prompt).strip().lower()

    # Re-prompt until the reply is exactly one valid choice. A substring
    # match would accept replies such as "yes" or "no" and then fall through
    # every branch below without doing anything.
    valid_choices = ("a", "o", "e")
    while confirm not in valid_choices:
        confirm_prompt = "Please confirm what you want to do." + "\n" + \
            "Would you like to add to it ('a'), overwrite ('o'), or exit ('e'):"
        confirm = input(confirm_prompt).strip().lower()

    if confirm == "e":
        # Exit without touching the folder
        print("OK exiting.")
        sys.exit(1)

    elif confirm == "o":
        # Overwrite: delete the existing tree and start from an empty folder
        shutil.rmtree(folder_path)
        os.mkdir(folder_path)
        print("Created output folder: %s" % (folder_path))

    else:
        # Add: keep the existing contents as they are
        print("OK adding to folder")

    return None
|
||||
20
crawl_recipes/tools/remove_pyc.py
Normal file
20
crawl_recipes/tools/remove_pyc.py
Normal file
@@ -0,0 +1,20 @@
|
||||
import os
|
||||
import sys
|
||||
import re
|
||||
|
||||
def remove_pyc(folderpath):
    """Recursively delete every ``.pyc`` file found under ``folderpath``.

    Only files whose own name ends in ``.pyc`` are removed. Files that merely
    contain ".pyc" somewhere in their name or in a parent directory name
    (``notes.pyc.txt``, ``backup.pyc.d/keep.txt``) are left alone.

    Args:
        folderpath: Root folder to walk.
    """
    # Collect the paths first so nothing is deleted while the walk is running
    pyc_paths = []
    for folder, _subs, files in os.walk(folderpath):
        for filename in files:
            if filename.endswith(".pyc"):
                pyc_paths.append(os.path.join(folder, filename))

    # Remove all files in list
    for path in pyc_paths:
        os.remove(path)
|
||||
23
crawl_recipes/tools/write_text.py
Normal file
23
crawl_recipes/tools/write_text.py
Normal file
@@ -0,0 +1,23 @@
|
||||
import os
|
||||
import sys
|
||||
|
||||
def write_text(input_text, file_path, option="Exit"):
    """Write text to disk, with ``option`` deciding what happens if it exists.

    ``input_text`` may be ``str`` or UTF-8 encoded ``bytes``; bytes are
    decoded once up front so every branch writes the same kind of data.
    Files are written as UTF-8.

    Args:
        input_text: Text to write (``str`` or UTF-8 ``bytes``).
        file_path: Destination file path.
        option: Used only when ``file_path`` already exists:
            "Exit" prints an error and exits with status 1,
            "Append" adds the text to the end of the file,
            "Overwrite" replaces the file contents.
            Any other value leaves the existing file untouched.
    """
    # Normalise to str: text-mode files reject bytes
    if isinstance(input_text, bytes):
        input_text = input_text.decode("utf-8")

    if not os.path.exists(file_path):
        with open(file_path, 'w', encoding="utf-8") as f:
            f.write(input_text)

    elif option == "Exit":
        print("Write file error: This file already exists.\n %s \nExiting..." % (file_path))
        sys.exit(1)

    elif option == "Append":
        with open(file_path, 'a', encoding="utf-8") as f:
            f.write(input_text)

    elif option == "Overwrite":
        with open(file_path, 'w', encoding="utf-8") as f:
            f.write(input_text)
|
||||
Reference in New Issue
Block a user