# Provides tools to score pre-processed plain text.
import regex
import re
from spellchecker import SpellChecker
from Opinion import strip_tags

leg_hist_reports_list = ["conference report", "conf. rep.", "conf. rpt.", "conf.rep.", "conf.rpt.", "conf. rept.", "conf.rept.",\
                         "house report", "h. rep.", "h.rep.", "h. rpt.", "h.rpt."
                         "h.r. rep.", "h. r. rep.", "h.r. rpt.", "h. r. rpt.", "h.r.rep.", "h.r.rpt.", \
                                                  "h. rept.", "h.rept.", "h. r. rept.", "h.r.rept.", "h.r. rept.",\
                         "senate report", "s. rep.", "s. rpt.", "s.rep.", "s.rpt.", "s. rept.", "s.rept.",\
                         "committee report", "comm. rep.", "comm.rep.", "comm. rpt.", "comm.rpt.", "comm. rept.", "comm.rept."]

leg_hist_hearings_list = ["congressional hearing", "cong. rec.", "cong.rec.", "committee hearing",
                          "senate hearing", "house hearing", "conference hearing", "congressional record", "rec. doc."]

leg_hist_misc_list = ["legislative history", "h. subcomm.", "s. subcomm.", "history of the legislation", \
                      "house committee", "senate committee", "conference committee", \
                      "h.r. comm.", "h. r. subcomm.", "s. comm.", "joint committee", "congressional budget office",
                      "cbo", "jct"]

leg_hist_list = leg_hist_reports_list + leg_hist_hearings_list + leg_hist_misc_list

linguistic_canon_list = ["expressio ", "expresio ", "inclusio ", "noscitur a sociis", \
                         "ejusdem generis", "last antecedent", "plain meaning"]

dictionary_list = ["dictionary", "dictionarium", "linguae britannicae", "world book", "funk & wagnalls"]

whole_act_list = ["whole act", "whole-act", "whole code", "whole-code", "in pari materia", "meaningful variation", "consistent usage"]

surplusage_list = ["surplusage", "superfluity", "superfluities"]

holistic_textual_list = surplusage_list + whole_act_list

textualist_list = dictionary_list + linguistic_canon_list + holistic_textual_list

textualist_list_ex_dictionaries = linguistic_canon_list + holistic_textual_list

normative_neutral_list = ["compliance burden", "financial burden", "administrative burden", "regulatory burden", \
                          "compliance cost", \
                          "tax administration", "good public policy", "public policy goal", \
                          "efficient administration", "efficient tax collection", "efficient enforcement" \
                          "public policy grounds", "burdensome", "clarity"]

normative_pro_taxpayer_list = ["complexity", "fairness", "intrusive", "unjust", "unfair", "injustice"]

normative_list = normative_neutral_list + normative_pro_taxpayer_list

sub_over_form_list = ["substance over form", "substance-over-form"]

econ_substance_list = ["economic substance", "economic-substance", "sham transaction", "sham-transaction",
                       "business purpose doctrine", "business-purpose doctrine"]

ass_income_list = ["assignment of income", "assignment-of-income"]

step_trans_list = ["step transaction", "step-transaction"]

tax_canon_list_of_lists = [sub_over_form_list, econ_substance_list, ass_income_list, step_trans_list]

tax_sub_canons_list = sub_over_form_list + econ_substance_list + ass_income_list + step_trans_list

general_sub_canons_list = ["charming betsy", "rule of lenity", "absurd result", "implied repeal",
                           "implicit repeal", "implicitly repeal", "repeal by implication", "presumption against preemption",
                           "presumption against pre-emption", "avoidance canon", "canon of avoidance",
                           "constitutional avoidance"]

deference_canons_list = ["chevron", "auer", "seminole rock", "skidmore"]

jurisdiction_list = ["subject-matter jurisdiction", "subject matter jurisdiction",
                     "diversity jurisdiction", "federal-question jurisdiction", "federal question jurisdiction"]

interpretive_list = ["statute", "statutory", "legislation", "congress", "code", "section", "interpret", "construe", \
                     "construing", "construction", "reading"]

all_sub_canons_list = tax_sub_canons_list + general_sub_canons_list + deference_canons_list

leg_text_list = leg_hist_list + textualist_list
all_list = leg_hist_list + textualist_list + normative_list + all_sub_canons_list + interpretive_list + jurisdiction_list
all_but_substantive_canons_list = leg_hist_list + textualist_list + normative_list + interpretive_list + jurisdiction_list
faithful_agent_list = leg_hist_list + textualist_list + general_sub_canons_list + deference_canons_list

# Returns a list of strings containing all *words* (not just *strings*) used for analysis in this program
def get_all_analyzed_strings():

	all_analyzed_strings = []

	for string in all_list:
		all_analyzed_strings = all_analyzed_strings + string.split()

	return all_analyzed_strings

# Calculates total word count of document.
def document_word_count(plain_text):
	return (len(plain_text.split()))

def leg_hist_tf(plain_text):
	return leg_hist_count(plain_text) / len(plain_text)

# Investigates the use of legislative history by identifying which terms are the most popular
def investigate_term_use(plain_text, list):
	total = 0
	for term in list:
		count = plain_text.count(term)
		total = total + count
		if count > 0:
			print ("Occurrences of " + term + ": " + str(count))
	print("Total: " + str(total))

# Calculates the number of legislative history terms within a particular document.
def leg_hist_count(plain_text):

	hit_count = 0

	# Add every occurrence of a term related to legislative history
	for leg_hist_term in leg_hist_list:
		hit_count = hit_count + plain_text.count(leg_hist_term)

	return hit_count

# If document contains ANY legislative history terms, returns the document's word count.
# (This is so that the denominator doesn't have to change versus the word count metrics.)
def leg_hist_yes(plain_text):

	for leg_hist_term in leg_hist_list:

		if leg_hist_term in plain_text:
			return 1

	return 0

# Calculates the number of legislative history terms within a particular document, using regex for fuzzy matching.
def leg_hist_count_fuzzy(plain_text):

	tf_count = 0

	# Add every occurrence of a term related to legislative history
	tf_count = tf_count + len(regex.findall(r"(legislative history){e<=1}", plain_text))
	tf_count = tf_count + len(regex.findall(r"(h\.r\. conf\. rep\.){e<=1}", plain_text))
	tf_count = tf_count + len(regex.findall(r"(house report){e<=1}", plain_text))
	tf_count = tf_count + len(regex.findall(r"(conference report){e<=1}", plain_text))
	tf_count = tf_count + len(regex.findall(r"(h\.r\. rep\.){e<=1}", plain_text))
	tf_count = tf_count + len(regex.findall(r"(senate report){e<=1}", plain_text))
	tf_count = tf_count + len(regex.findall(r"(s\. rep\.){e<=1}", plain_text))
	tf_count = tf_count + len(regex.findall(r"(conf\. rep\.){e<=1}", plain_text))
	tf_count = tf_count + len(regex.findall(r"(h\. subcomm\.){e<=1}", plain_text))
	tf_count = tf_count + len(regex.findall(r"(s\. subcomm\.){e<=1}", plain_text))
	tf_count = tf_count + len(regex.findall(r"(congressional record){e<=1}", plain_text))
	tf_count = tf_count + len(regex.findall(r"(congressional hearing){e<=1}", plain_text))
	tf_count = tf_count + len(regex.findall(r"(history of the legislation){e<=1}", plain_text))

	return tf_count

# Calculates the number of baseline common terms in a particular document.
def common_term_count(text):
	hit_count = 0

	text = text.replace(".", " ")
	list_of_words = text.split()

	for word in list_of_words:
		if word == "the" or word == "of" or word == "and":
			hit_count = hit_count + 1

	return hit_count

# Calculates the number of dictionary citations within a particular document.
def dictionary_count(plain_text):
	hit_count = 0

	# Remove references to "dictionary act"
	if "dictionary act" in plain_text:
		plain_text = plain_text.replace("dictionary act", "")

	# Add every occurrence of a term related to textualism
	for dictionary_cite in dictionary_list:
		hit_count = hit_count + plain_text.count(dictionary_cite)

	return hit_count


# Calculates the number of textualist terms within a particular document, EXCLUDING dictionaries.
def textualist_count_ex_dictionaries(plain_text):
	hit_count = 0

	# Add every occurrence of a term related to textualism EXCEPT dictionaries
	for textualist_term in textualist_list_ex_dictionaries:
		hit_count = hit_count + plain_text.count(textualist_term)

	return hit_count

# Calculates the number of textualist terms within a particular document.
def textualist_count(plain_text):
	hit_count = 0

	# Remove references to "dictionary act"
	if "dictionary act" in plain_text:
		plain_text = plain_text.replace("dictionary act", "")

	# Add every occurrence of a term related to textualism
	for textualist_term in textualist_list:
		hit_count = hit_count + plain_text.count(textualist_term)

	return hit_count

def textualist_yes(plain_text):

	# Remove references to "dictionary act"
	if "dictionary act" in plain_text:
		plain_text = plain_text.replace("dictionary act", "")

	for textualist_term in textualist_list:
		if textualist_term in plain_text:
			return 1

	return 0

# Calculates the number of substantive tax canons within a particular document.
def substantive_canon_count(plain_text):
	hit_count = 0

	# Add every occurrence of a substantive canon
	for canon in tax_sub_canons_list:
		hit_count = hit_count + plain_text.count(canon)

	return hit_count

# Calculates the number of textualist terms within a particular document, using regex for fuzzy matching.
def textualist_count_fuzzy(plain_text):

	tf_count = 0

	# Add every occurrence of a term related to textualism

	# Linguistic canons
	tf_count = tf_count + len(regex.findall(r"(expressio unius){e<=1}", plain_text))
	tf_count = tf_count + len(regex.findall(r"(noscitur a sociis){e<=1}", plain_text))
	tf_count = tf_count + len(regex.findall(r"(ejusdem generis){e<=1}", plain_text))
	tf_count = tf_count + len(regex.findall(r"(last antecedent){e<=1}", plain_text))

	return tf_count

def string_contains_faithful_agency(plain_text):

	for fa_string in faithful_agent_list:
		if fa_string in plain_text:
			return True

	return False

def normative_count(plain_text):

	hit_count = 0

	# Remove erroneous matches
	plain_text = plain_text.replace("treasury inspector general for tax administration", "")
	# Refers to a specific line on an IRS form
	plain_text = plain_text.replace("effective tax administration", "")

	# In order to avoid statutory interpretation, remove all sentences containing "faithful agency"
	# (i.e., legislative history or textualist terms).
	plain_text = ".".join([sentence for sentence in plain_text.split('.') if not string_contains_faithful_agency(sentence)])

	# if "burden" in plain_text:
	#     plain_text = plain_text.replace("recordkeeping burden", "")
	#     plain_text = plain_text.replace("reporting burden", "")
	#     plain_text = plain_text.replace("burden on form", "")
	#     plain_text = plain_text.replace("burden for form", "")
	#     plain_text = plain_text.replace("annual burden", "")
	#     plain_text = plain_text.replace("paperwork burden", "")
	#     plain_text = plain_text.replace("burdens of ownership", "")
	#     plain_text = plain_text.replace("burden of ownership", "")
	#     plain_text = plain_text.replace("burden of pro", "")

	# Add every occurrence of a normative term
	for normative_term in normative_neutral_list:
		hit_count = hit_count + plain_text.count(normative_term)

	return hit_count + normative_pro_taxpayer_count(plain_text)

def normative_yes(plain_text):

	# Remove erroneous matches
	plain_text = plain_text.replace("treasury inspector general for tax administration", "")
	# Refers to a specific line on an IRS form
	plain_text = plain_text.replace("effective tax administration", "")

	# In order to avoid statutory interpretation, remove all sentences containing "faithful agency"
	# (i.e., legislative history or textualist terms).
	plain_text = ".".join([sentence for sentence in plain_text.split('.') if not string_contains_faithful_agency(sentence)])

	for normative_term in normative_neutral_list:

		if normative_term in plain_text:

			return 1

	return 0

def normative_pro_taxpayer_count(plain_text):

	hit_count = 0

	# Remove erroneous matches
	if "fairness" in plain_text:
		plain_text = plain_text.replace("small business regulatory enforcement fairness act", "")

	# Add every occurrence of a normative pro-taxpayer term
	for pro_taxpayer_term in normative_pro_taxpayer_list:
		hit_count = hit_count + plain_text.count(pro_taxpayer_term)

	return hit_count

# Returns number of sentences within plain_text including an item from must_have_list_1 AND must_have_list_2
def within_sentence_count(plain_text, must_have_list_1, must_have_list_2):

	count = 0

	sentences = plain_text.split(".")
	for sentence in sentences:

		sentence_contains = False

		for must_have_1 in must_have_list_1:
			if must_have_1 in sentence:
				for must_have_2 in must_have_list_2:
					if must_have_2 in sentence:

						sentence_contains = True

		if sentence_contains:
			# print(sentence)
			count = count + 1

	return count


def interpretive_count(plain_text):

	return within_sentence_count(plain_text, ["statute", "statutory",
	                                          "legislation", "congress", "code", "section"],
	                             ["interpret", "construe", "construing", "construction", "reading"])

def interpretive_yes(plain_text):

	if interpretive_count(plain_text) > 0:
		return 1
	else:
		return 0

class Tf_counter:

	hit_list: []

	def __init__(self, hit_list=None):
		self.hit_list = hit_list

	def tf_list_count (self, text=""):

		count = 0

		for term in self.hit_list:
			count = count + text.count(term)

		return count

# Given a block of text, returns all of the text after the first occurrence of a phrase indicating the start of
# and administrative/procedural section.
def get_first_occurrence_of_admin(plain_text, year):

	admin_phrase_list = ["administrative, procedural, and miscellaneous", \
	                     "administrative, procedural and miscellaneous", \
	                     "administrative and miscellaneous matters"]

	min_index = 10000000000
	min_phrase = None
	for phrase in admin_phrase_list:

		index = plain_text.find(phrase)
		if index != -1 and index < min_index:
			min_index = index
			min_phrase = phrase

	if min_phrase:

		split_list = plain_text.split(min_phrase)
		print(str(year) + " has phrase: " + min_phrase)
		return min_phrase.join(split_list[1:])

	return ""

def is_leg_phrase_in_text(plain_text, leg_phrase_list):

	for phrase in leg_phrase_list:

		if phrase in plain_text:
			return True

	return False

# Simple pre-processing to remove whitespace and lower-case. Also has some specific cleaning for IRB.
# Include spellcheck.
def pre_process_IRB_with_spellcheck(text, year):

	leg_plus_year = "LEGISLATION. " + str(year) + "-"
	if leg_plus_year in text:
		text = text.split(leg_plus_year)[0]
		print("Has LEGISLATION: " + str(year))

	# Regularize unicode dashes.
	# Replace all dashes with spaces
	text = text.replace('\u2013', '-')
	text = text.replace('\u2014', '-')
	text = text.replace('\u2015', '-')

	# Make the plain text entirely lowercase.
	text = text.lower()

	# Replace all whitespace with single spaces (to avoid problems with line breaks and multiple spaces).
	text = " ".join(text.split())

	# Remove hyphenation.
	text = text.replace("- ", "")

	# Given a block of text, iterates through and eliminates legislation, setting aside administrative text first.
	leg_phrase_list = ["legislation and treaties table", "treaties and tax legislation contents", "treaties and tax legislation table", \
	                   "treaties and tax legislation subpart", "legislation and treaties subpart", "committee reports. " + str(year)]

	leg_phrase_in_text = is_leg_phrase_in_text(text, leg_phrase_list)
	while leg_phrase_in_text == True:

		min_index = 10000000000
		min_phrase = None

		for phrase in leg_phrase_list:

			index = text.find(phrase)
			if index != -1 and index < min_index:
				min_index = index
				min_phrase = phrase

		if min_phrase:
			split_list = text.split(min_phrase)
			print(str(year) + " has phrase: " + min_phrase)
			remainder_of_text = min_phrase.join(split_list[1:])
			text = split_list[0] + get_first_occurrence_of_admin(remainder_of_text, year)

		leg_phrase_in_text = is_leg_phrase_in_text(text, leg_phrase_list)

	# replace "petitioner" with "plaintiff"
	# self.plain_text = self.plain_text.replace("petitioner", "plaintiff")

	# replace "respondent" with "defendant"
	# self.plain_text = self.plain_text.replace("respondent", "defendant")

	# replace "commissioner" with "defendant"
	# self.plain_text = self.plain_text.replace("commissioner", "defendant")

	# Removes any words that contain no letters, only numbers or symbols.
	text = "".join([i for i in text if i.isalpha() or i == "." or i == " " or i == "-"])

	# Iterates through and corrects spelling by changing words to the closest correct match.
	# Uses a Levenshtein Distance of 1, rather than 2.
	spell = SpellChecker(distance=1)

	# Exempt all words and phrases we're trying to search for
	spell.word_frequency.load_words(get_all_analyzed_strings())

	new_text = ""

	for word in text.split():

		# Excise invalid 1-letter words.
		if len(word) == 1:
			if word != "a" and word != "i":
				# print ("Excluding: " + word)
				continue

		# If a word has a period or dash, add it without further analysis.
		if "-" in word or "." in word:
			new_text = new_text + word + " "
		elif word in spell:
			new_text = new_text + word + " "
		else:
			new_text = new_text + spell.correction(word) + " "

	return new_text

# Simple pre-processing to remove whitespace and lower-case. Also has some specific cleaning for IRB.
# No spellcheck.
def pre_process_IRB(text, year):

	# print("Pre-processing: " + str(year))

	leg_plus_year = "LEGISLATION. " + str(year) + "-"
	if leg_plus_year in text:
		text = text.split(leg_plus_year)[0]
		print("Has LEGISLATION: " + str(year))

	# Regularize unicode dashes.
	# Replace all dashes with spaces
	text = text.replace('\u2013', '-')
	text = text.replace('\u2014', '-')
	text = text.replace('\u2015', '-')

	# Make the plain text entirely lowercase.
	text = text.lower()

	# Replace all whitespace with single spaces (to avoid problems with line breaks and multiple spaces).
	text = " ".join(text.split())

	# Remove hyphenation.
	text = text.replace("- ", "")

	# print("Removing irrelevant information: " + str(year))

	# Given a block of text, iterates through and eliminates legislation, setting aside administrative text first.
	leg_phrase_list = ["legislation and treaties table", "treaties and tax legislation contents", "treaties and tax legislation table", \
	                   "treaties and tax legislation subpart", "legislation and treaties subpart", "committee reports. " + str(year)]

	leg_phrase_in_text = is_leg_phrase_in_text(text, leg_phrase_list)
	while leg_phrase_in_text == True:

		min_index = 10000000000
		min_phrase = None

		for phrase in leg_phrase_list:

			index = text.find(phrase)
			if index != -1 and index < min_index:
				min_index = index
				min_phrase = phrase

		if min_phrase:
			split_list = text.split(min_phrase)
			print(str(year) + " has phrase: " + min_phrase)
			remainder_of_text = min_phrase.join(split_list[1:])
			text = split_list[0] + get_first_occurrence_of_admin(remainder_of_text, year)

		leg_phrase_in_text = is_leg_phrase_in_text(text, leg_phrase_list)

	# replace "petitioner" with "plaintiff"
	# self.plain_text = self.plain_text.replace("petitioner", "plaintiff")

	# replace "respondent" with "defendant"
	# self.plain_text = self.plain_text.replace("respondent", "defendant")

	# replace "commissioner" with "defendant"
	# self.plain_text = self.plain_text.replace("commissioner", "defendant")

	# print("Regularizing text: " + str(year))

	# Removes any words that contain no letters, only numbers or symbols.
	text = "".join([i for i in text if i.isalpha() or i == "." or i == " " or i == "-"])

	new_text = ""

	# print("Removing invalid 1-letter words: " + str(year))

	text = " ".join([word for word in text.split() if len(word) > 1 or word == "a" or word == "i"])

	return text


# This method processes the opinion to remove the most conspicuous procedural phrases that may provide biased results.
# For regs, this function also removes
def pre_process_reg(text):

	# For regulations, remove the text of the regulation in order to isolate the text of the preamble.
	text = text.split("\nDrafting Information")[0]
	text = text.split("\nList of Subjects")[0]
	text = text.split("\nProposed Amendments")[0]
	text = text.split("\nProposed Effective Date")[0]

	# Strip HTML tags, if any
	text = strip_tags(text)

	# Make the plain text entirely lowercase.
	text = text.lower()

	# Replace all whitespace with single spaces (to avoid problems with line breaks and multiple spaces).
	text = " ".join(text.split())

	# replace "petitioner" with "plaintiff"
	# self.plain_text = self.plain_text.replace("petitioner", "plaintiff")

	# replace "respondent" with "defendant"
	# self.plain_text = self.plain_text.replace("respondent", "defendant")

	# replace "commissioner" with "defendant"
	# self.plain_text = self.plain_text.replace("commissioner", "defendant")

	return text

def td_count(text):

	count = 0

	td_list = [m.start() for m in re.finditer('t\.d\.', text)]
	td_list = td_list + [m.start() for m in re.finditer('t\. d\.', text)]

	for index in td_list:

		substring_to_check = text[index:index + 200]

		no_matches = True

		if "amended" in substring_to_check:
			no_matches = False

		if "modified" in substring_to_check:
			no_matches = False

		if "action" in substring_to_check:
			no_matches = False

		if no_matches:
			continue

		substring = text[index + 5:]
		end = substring.find("approved")
		if end == -1:
			end = 9999999

		alt_end_list = ["secretary of the treasury", "commissioner of internal revenue", "filed", "january",
		                "february", "march", "april", "june", "july", "august", "september", "october",
		                "november", "december", "t.d.", "t. d."]

		for alt_end in alt_end_list:

			alt_end_loc = substring.find(alt_end)

			if alt_end_loc < end and alt_end_loc != -1:
				end = alt_end_loc

		if end == 9999999:
			end = 0

		td_text = substring[:end]

		if len(td_text) > 50000:
			print(td_text)

		count = count + len(td_text.split())

	return count