# The purpose of this program is to sort through text files containing
# Federal Register documents, separating the documents that contain
# regulations or regulatory preambles into one folder, and all other
# documents into another folder.
# NB: This program does NOT delete the original files in the process.

from shutil import copyfile
import re
import os

# Parameters (one sub-folder per year is expected under each of these):
source_folder = "C:/Users/thech/Code/Fed Reg Plain Text Files/"
negative_folder = "C:/Users/thech/Code/Fed Reg Non-Rulemaking Text Files/"
positive_folder = "C:/Users/thech/Code/Fed Reg Rulemaking Text Files/"

# "ACTION:" captions that mark a document as a rulemaking. Each one is matched
# as a case-insensitive prefix, so e.g. "Final regulation" also covers
# "Final regulations".
_RULEMAKING_ACTIONS = (
    "Notice of proposed rulemaking",
    "Amendments to proposed regulations",
    "Final regulation",
    "Final and temporary regulation",
    "Temporary and final regulation",
    "Temporary regulation",
    "Advance notice of proposed rulemaking",
    "Final rule",
    "Proposed rule",
    "Withdrawal of previous proposed rules, notice of proposed",
)

# All patterns are compiled once at import time instead of on every call.
# Every pattern requires a preceding newline, i.e. the heading must start a
# line that is not the very first line of the document.
_RULEMAKING_ACTION_RE = re.compile(
    r"\nACTION: *(?:" + "|".join(re.escape(a) for a in _RULEMAKING_ACTIONS) + ")",
    re.IGNORECASE,
)
_CROSS_REFERENCE_RE = re.compile(
    r"\nACTION: *Notice of proposed rulemaking by cross-reference",
    re.IGNORECASE,
)
_SUPPLEMENTARY_INFORMATION_RE = re.compile(
    r"\nSUPPLEMENTARY INFORMATION", re.IGNORECASE
)
_BACKGROUND_RE = re.compile(r"\nBackground", re.IGNORECASE)
_PREAMBLE_SECTION_RE = re.compile(
    r"\n(?:Explanation of Provisions"
    r"|Summary of Comments and Explanation of Revisions"
    r"|Drafting Information)",
    re.IGNORECASE,
)


def meets_criteria(plain_text: str) -> bool:
    """Return True if the document should be sorted into the positive folder.

    A document qualifies when either:
      * it has an "ACTION:" caption starting with one of the captions in
        _RULEMAKING_ACTIONS, or
      * it has a "Background" heading together with at least one of the
        headings "Explanation of Provisions", "Summary of Comments and
        Explanation of Revisions" or "Drafting Information".

    Exception: a "Notice of proposed rulemaking by cross-reference" that has
    no "SUPPLEMENTARY INFORMATION" heading never qualifies.

    All matching is case-insensitive.
    """
    # Cross-reference notices without a SUPPLEMENTARY INFORMATION section are
    # rejected outright, before any of the positive checks are consulted.
    if (_CROSS_REFERENCE_RE.search(plain_text)
            and not _SUPPLEMENTARY_INFORMATION_RE.search(plain_text)):
        return False

    if _RULEMAKING_ACTION_RE.search(plain_text):
        return True

    return bool(_BACKGROUND_RE.search(plain_text)
                and _PREAMBLE_SECTION_RE.search(plain_text))


def sort_folder_of_docs(input_folder: str, yes_folder: str, no_folder: str) -> None:
    """Copy every file directly inside input_folder into yes_folder or no_folder.

    Each file is read as UTF-8 text; files for which meets_criteria() is true
    are copied to yes_folder, all others to no_folder. The originals are left
    in place. Sub-directories of input_folder are ignored.

    Raises:
        FileNotFoundError: if input_folder does not exist.
    """
    # Only look at the top level of input_folder: the paths below are built
    # relative to input_folder, so names from nested folders would not resolve.
    with os.scandir(input_folder) as entries:
        file_names = sorted(entry.name for entry in entries if entry.is_file())

    for one_file in file_names:
        source_path = os.path.join(input_folder, one_file)

        # Close the file as soon as it is read; a year's folder can hold
        # thousands of documents.
        with open(source_path, encoding='utf-8') as document:
            plain_text = document.read()

        if meets_criteria(plain_text):
            copyfile(source_path, os.path.join(yes_folder, one_file))
            print(one_file + " matches, moved to yes_folder.")
        else:
            copyfile(source_path, os.path.join(no_folder, one_file))
            print(one_file + " does not match, moved to no_folder.")


def main() -> None:
    """Sort the documents of every year from 1994 through 2019."""
    for year in range(1994, 2020):
        current_positive_folder = positive_folder + str(year)
        current_negative_folder = negative_folder + str(year)
        # exist_ok lets an interrupted run be restarted without first
        # deleting the output folders created by the previous attempt.
        os.makedirs(current_negative_folder, exist_ok=True)
        os.makedirs(current_positive_folder, exist_ok=True)
        sort_folder_of_docs(source_folder + str(year),
                            current_positive_folder,
                            current_negative_folder)


# Only run the full sort when executed as a script, so that importing this
# module (e.g. to reuse meets_criteria) does not touch the file system.
if __name__ == "__main__":
    main()

# Testing Suite:
# positive_folder = "C:/Users/thech/Code/Fed Reg Rulemaking Text Files/2000"
# negative_folder = "C:/Users/thech/Code/Fed Reg Non-Rulemaking Text Files/2000"
# source_folder = "C:/Users/thech/Code/Fed Reg Plain Text Files/2000"
# sort_folder_of_docs(source_folder, positive_folder, negative_folder)

# file_name = "C:/Users/thech/Code/Fed Reg Plain Text Files/IRS Plain Text URLs 1995/95-30873.txt"
# one_file = open(file_name, encoding='utf-8')
# one_file_plain_text = one_file.read()
# if re.search("\nACTION: *Final and temporary regulation", one_file_plain_text, re.IGNORECASE):
#     print("YES!")