# This code takes a folder containing text files, where each text file
# contains a newline-separated list of URLs. It downloads the contents of
# each URL into the output folder, creating one subfolder per text file.
# NB: output_folder itself must already exist; only the per-file subfolders
# are created by this script.

import os
import urllib.request

# Key initial parameters: the folder of URL lists and the download destination.
input_folder = "C:/Users/thech/Code/Federal Register URLs"
output_folder = "C:/Users/thech/Code/Fed Reg Plain Text Files"

# Builds file_array, the list of file names in the top level of input_folder.
# (os.walk is recursive; taking only its first result keeps this to the top
# level instead of overwriting file_array once per subdirectory.)
file_array = next(os.walk(input_folder))[2]

# Opens a file to record any URLs that were inaccessible.
error_file = open(input_folder + "/errors.txt", "w")

# Steps through, opening each file in input_folder.
for one_file in file_array:
    # Skips the error log itself if a previous run left one behind.
    if one_file == "errors.txt":
        continue

    # Creates an array of URLs (url_list) from one_file, stripping trailing
    # newlines (which would otherwise corrupt the request) and blank lines.
    with open(input_folder + "/" + one_file) as my_file:
        url_list = [line.strip() for line in my_file if line.strip()]

    # Creates a new folder (one_file_folder) named after one_file.
    one_file_folder = output_folder + "/" + one_file.replace(".txt", "")
    try:
        os.mkdir(one_file_folder)
    except FileExistsError:
        print("Folder already exists: " + one_file_folder)

    # Steps through url_list and downloads each URL into one_file_folder.
    for one_url in url_list:
        print("Loading: " + one_url)
        # Names the output file after the last path segment of the URL,
        # normalized to a single .txt extension.
        one_output_file = (one_file_folder + "/"
                           + one_url.split("/").pop().split(".txt")[0] + ".txt")
        print("Output file: " + one_output_file)
        try:
            urllib.request.urlretrieve(one_url, one_output_file)
        except Exception:
            print("Unable to download the following URL: " + one_url)
            error_file.write("Unable to load: " + one_url + "\n")

error_file.close()
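
# Usage sketch (an assumption about the input data, not verified against the
# original files): each file in input_folder is expected to hold one Federal
# Register plain-text URL per line, for example (hypothetical URL):
#
#   https://www.federalregister.gov/documents/full_text/text/2019/12/31/2019-28321.txt
#
# With that layout, an input file named "2019-12.txt" produces the folder
# "<output_folder>/2019-12" containing one downloaded .txt file per URL, and
# any URLs that fail to download are listed in "<input_folder>/errors.txt".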