# The purpose of this program is to sort through text files containing Federal Register documents,
# separating the documents that contain regulations or regulatory preambles into one folder,
# and all other documents into another folder.
# NB: This program does NOT delete the original files in the process.

from shutil import copyfile
import re
import os

# Parameters:
# source_folder = ""
# positive_folder = ""
# negative_folder = ""

def meets_criteria(plain_text):
    """Return True if the text belongs in the positive (rulemaking) folder.

    All searches are case-insensitive and every marker must begin right after
    a newline character, so a marker at the very start of the text is not seen.

    The decision is made in three steps:
      1. A "Notice of proposed rulemaking by cross-reference" ACTION line
         without any SUPPLEMENTARY INFORMATION heading is rejected outright.
      2. Any of the recognised rulemaking ACTION lines is accepted.
      3. Otherwise the text is accepted only if it has a "Background" heading
         together with at least one of the other preamble headings.
    """

    def found(pattern):
        # Case-insensitive search anywhere in the document text.
        return re.search(pattern, plain_text, re.IGNORECASE) is not None

    # Step 1: cross-reference notices lacking supplementary information.
    is_cross_reference = found("\nACTION: *Notice of proposed rulemaking by cross-reference")
    if is_cross_reference and not found("\nSUPPLEMENTARY INFORMATION"):
        return False

    # Step 2: ACTION lines that identify a regulation or regulatory preamble.
    action_patterns = (
        "\nACTION: *Notice of proposed rulemaking",
        "\nACTION: *Amendments to proposed regulations",
        "\nACTION: *Final regulation",
        "\nACTION: *Final and temporary regulation",
        "\nACTION: *Temporary and final regulation",
        "\nACTION: *Temporary regulation",
        "\nACTION: *Advance notice of proposed rulemaking",
        "\nACTION: *Final rule",
        "\nACTION: *Proposed rule",
        "\nACTION: *Withdrawal of previous proposed rules, notice of proposed",
    )
    if any(found(pattern) for pattern in action_patterns):
        return True

    # Step 3: fall back on section headings typical of a preamble.
    companion_headings = (
        "\nExplanation of Provisions",
        "\nSummary of Comments and Explanation of Revisions",
        "\nDrafting Information",
    )
    return found("\nBackground") and any(found(heading) for heading in companion_headings)


def sort_folder_of_docs(input_folder, yes_folder, no_folder):
    """Copy each file directly inside input_folder into yes_folder or no_folder.

    A file is copied to yes_folder when meets_criteria() accepts its text and
    to no_folder otherwise. The originals are left in place. Files inside
    subfolders of input_folder are not processed.

    Parameters:
        input_folder: folder whose top-level files are to be sorted.
        yes_folder:   existing folder receiving files that meet the criteria.
        no_folder:    existing folder receiving all other files.

    If input_folder is missing or cannot be listed, a notice is printed and
    nothing is copied. Errors raised while reading (files are decoded as
    UTF-8) or copying a file are not caught here.
    """
    # Only the first tuple yielded by os.walk describes input_folder itself;
    # later tuples describe subfolders, whose file names cannot be opened
    # relative to input_folder. os.walk yields nothing at all when the folder
    # cannot be listed, hence the None default.
    top_level = next(os.walk(input_folder), None)
    if top_level is None:
        print(input_folder + " could not be read, nothing to sort.")
        return

    # For each file in input_folder, either put it into the yes_folder if it
    # meets_criteria or the no_folder otherwise.
    for one_file in top_level[2]:
        source_path = os.path.join(input_folder, one_file)
        # The with-block closes the file handle as soon as the text is read.
        with open(source_path, encoding='utf-8') as document:
            plain_text = document.read()

        if meets_criteria(plain_text):
            copyfile(source_path, os.path.join(yes_folder, one_file))
            print(one_file + " matches, moved to yes_folder.")
        else:
            copyfile(source_path, os.path.join(no_folder, one_file))
            print(one_file + " does not match, moved to no_folder.")


# Root folders. The source root is read from one subfolder per year; the two
# output roots receive one subfolder per year.
source_folder = "C:/Users/thech/Code/Fed Reg Plain Text Files/"
negative_folder = "C:/Users/thech/Code/Fed Reg Non-Rulemaking Text Files/"
positive_folder = "C:/Users/thech/Code/Fed Reg Rulemaking Text Files/"

# Sort the documents of every year from 1994 through 2019.
for year in range(1994, 2020):
    current_positive_folder = positive_folder + str(year)
    current_negative_folder = negative_folder + str(year)
    # makedirs with exist_ok=True creates any missing parent folders and does
    # not fail when the output folders are left over from an earlier run, so
    # the script can be re-run safely.
    os.makedirs(current_negative_folder, exist_ok=True)
    os.makedirs(current_positive_folder, exist_ok=True)
    sort_folder_of_docs(source_folder + str(year), current_positive_folder, current_negative_folder)


# Testing Suite:

#positive_folder = "C:/Users/thech/Code/Fed Reg Rulemaking Text Files/2000"
#negative_folder = "C:/Users/thech/Code/Fed Reg Non-Rulemaking Text Files/2000"
#source_folder = "C:/Users/thech/Code/Fed Reg Plain Text Files/2000"

# sort_folder_of_docs(source_folder, positive_folder, negative_folder)


# file_name = "C:/Users/thech/Code/Fed Reg Plain Text Files/IRS Plain Text URLs 1995/95-30873.txt"

# one_file = open(file_name, encoding='utf-8')
# one_file_plain_text = one_file.read()

# if re.search("\nACTION: *Final and temporary regulation", one_file_plain_text, re.IGNORECASE):
#   print ("YES!")