# Provides tools to score pre-processed plain text. import regex import re from spellchecker import SpellChecker from Opinion import strip_tags leg_hist_reports_list = ["conference report", "conf. rep.", "conf. rpt.", "conf.rep.", "conf.rpt.", "conf. rept.", "conf.rept.",\ "house report", "h. rep.", "h.rep.", "h. rpt.", "h.rpt." "h.r. rep.", "h. r. rep.", "h.r. rpt.", "h. r. rpt.", "h.r.rep.", "h.r.rpt.", \ "h. rept.", "h.rept.", "h. r. rept.", "h.r.rept.", "h.r. rept.",\ "senate report", "s. rep.", "s. rpt.", "s.rep.", "s.rpt.", "s. rept.", "s.rept.",\ "committee report", "comm. rep.", "comm.rep.", "comm. rpt.", "comm.rpt.", "comm. rept.", "comm.rept."] leg_hist_hearings_list = ["congressional hearing", "cong. rec.", "cong.rec.", "committee hearing", "senate hearing", "house hearing", "conference hearing", "congressional record", "rec. doc."] leg_hist_misc_list = ["legislative history", "h. subcomm.", "s. subcomm.", "history of the legislation", \ "house committee", "senate committee", "conference committee", \ "h.r. comm.", "h. r. subcomm.", "s. comm.", "joint committee", "congressional budget office", "cbo", "jct"] leg_hist_list = leg_hist_reports_list + leg_hist_hearings_list + leg_hist_misc_list linguistic_canon_list = ["expressio ", "expresio ", "inclusio ", "noscitur a sociis", \ "ejusdem generis", "last antecedent", "plain meaning"] dictionary_list = ["dictionary", "dictionarium", "linguae britannicae", "world book", "funk & wagnalls"] whole_act_list = ["whole act", "whole-act", "whole code", "whole-code", "in pari materia", "meaningful variation", "consistent usage"] surplusage_list = ["surplusage", "superfluity", "superfluities"] holistic_textual_list = surplusage_list + whole_act_list textualist_list = dictionary_list + linguistic_canon_list + holistic_textual_list textualist_list_ex_dictionaries = linguistic_canon_list + holistic_textual_list normative_neutral_list = ["compliance burden", "financial burden", "administrative burden", "regulatory burden", \ "compliance cost", \ "tax administration", "good public policy", "public policy goal", \ "efficient administration", "efficient tax collection", "efficient enforcement" \ "public policy grounds", "burdensome", "clarity"] normative_pro_taxpayer_list = ["complexity", "fairness", "intrusive", "unjust", "unfair", "injustice"] normative_list = normative_neutral_list + normative_pro_taxpayer_list sub_over_form_list = ["substance over form", "substance-over-form"] econ_substance_list = ["economic substance", "economic-substance", "sham transaction", "sham-transaction", "business purpose doctrine", "business-purpose doctrine"] ass_income_list = ["assignment of income", "assignment-of-income"] step_trans_list = ["step transaction", "step-transaction"] tax_canon_list_of_lists = [sub_over_form_list, econ_substance_list, ass_income_list, step_trans_list] tax_sub_canons_list = sub_over_form_list + econ_substance_list + ass_income_list + step_trans_list general_sub_canons_list = ["charming betsy", "rule of lenity", "absurd result", "implied repeal", "implicit repeal", "implicitly repeal", "repeal by implication", "presumption against preemption", "presumption against pre-emption", "avoidance canon", "canon of avoidance", "constitutional avoidance"] deference_canons_list = ["chevron", "auer", "seminole rock", "skidmore"] jurisdiction_list = ["subject-matter jurisdiction", "subject matter jurisdiction", "diversity jurisdiction", "federal-question jurisdiction", "federal question jurisdiction"] interpretive_list = ["statute", "statutory", "legislation", "congress", "code", "section", "interpret", "construe", \ "construing", "construction", "reading"] all_sub_canons_list = tax_sub_canons_list + general_sub_canons_list + deference_canons_list leg_text_list = leg_hist_list + textualist_list all_list = leg_hist_list + textualist_list + normative_list + all_sub_canons_list + interpretive_list + jurisdiction_list all_but_substantive_canons_list = leg_hist_list + textualist_list + normative_list + interpretive_list + jurisdiction_list faithful_agent_list = leg_hist_list + textualist_list + general_sub_canons_list + deference_canons_list # Returns a list of strings containing all *words* (not just *strings*) used for analysis in this program def get_all_analyzed_strings(): all_analyzed_strings = [] for string in all_list: all_analyzed_strings = all_analyzed_strings + string.split() return all_analyzed_strings # Calculates total word count of document. def document_word_count(plain_text): return (len(plain_text.split())) def leg_hist_tf(plain_text): return leg_hist_count(plain_text) / len(plain_text) # Investigates the use of legislative history by identifying which terms are the most popular def investigate_term_use(plain_text, list): total = 0 for term in list: count = plain_text.count(term) total = total + count if count > 0: print ("Occurrences of " + term + ": " + str(count)) print("Total: " + str(total)) # Calculates the number of legislative history terms within a particular document. def leg_hist_count(plain_text): hit_count = 0 # Add every occurrence of a term related to legislative history for leg_hist_term in leg_hist_list: hit_count = hit_count + plain_text.count(leg_hist_term) return hit_count # If document contains ANY legislative history terms, returns the document's word count. # (This is so that the denominator doesn't have to change versus the word count metrics.) def leg_hist_yes(plain_text): for leg_hist_term in leg_hist_list: if leg_hist_term in plain_text: return 1 return 0 # Calculates the number of legislative history terms within a particular document, using regex for fuzzy matching. def leg_hist_count_fuzzy(plain_text): tf_count = 0 # Add every occurrence of a term related to legislative history tf_count = tf_count + len(regex.findall(r"(legislative history){e<=1}", plain_text)) tf_count = tf_count + len(regex.findall(r"(h\.r\. conf\. rep\.){e<=1}", plain_text)) tf_count = tf_count + len(regex.findall(r"(house report){e<=1}", plain_text)) tf_count = tf_count + len(regex.findall(r"(conference report){e<=1}", plain_text)) tf_count = tf_count + len(regex.findall(r"(h\.r\. rep\.){e<=1}", plain_text)) tf_count = tf_count + len(regex.findall(r"(senate report){e<=1}", plain_text)) tf_count = tf_count + len(regex.findall(r"(s\. rep\.){e<=1}", plain_text)) tf_count = tf_count + len(regex.findall(r"(conf\. rep\.){e<=1}", plain_text)) tf_count = tf_count + len(regex.findall(r"(h\. subcomm\.){e<=1}", plain_text)) tf_count = tf_count + len(regex.findall(r"(s\. subcomm\.){e<=1}", plain_text)) tf_count = tf_count + len(regex.findall(r"(congressional record){e<=1}", plain_text)) tf_count = tf_count + len(regex.findall(r"(congressional hearing){e<=1}", plain_text)) tf_count = tf_count + len(regex.findall(r"(history of the legislation){e<=1}", plain_text)) return tf_count # Calculates the number of baseline common terms in a particular document. def common_term_count(text): hit_count = 0 text = text.replace(".", " ") list_of_words = text.split() for word in list_of_words: if word == "the" or word == "of" or word == "and": hit_count = hit_count + 1 return hit_count # Calculates the number of dictionary citations within a particular document. def dictionary_count(plain_text): hit_count = 0 # Remove references to "dictionary act" if "dictionary act" in plain_text: plain_text = plain_text.replace("dictionary act", "") # Add every occurrence of a term related to textualism for dictionary_cite in dictionary_list: hit_count = hit_count + plain_text.count(dictionary_cite) return hit_count # Calculates the number of textualist terms within a particular document, EXCLUDING dictionaries. def textualist_count_ex_dictionaries(plain_text): hit_count = 0 # Add every occurrence of a term related to textualism EXCEPT dictionaries for textualist_term in textualist_list_ex_dictionaries: hit_count = hit_count + plain_text.count(textualist_term) return hit_count # Calculates the number of textualist terms within a particular document. def textualist_count(plain_text): hit_count = 0 # Remove references to "dictionary act" if "dictionary act" in plain_text: plain_text = plain_text.replace("dictionary act", "") # Add every occurrence of a term related to textualism for textualist_term in textualist_list: hit_count = hit_count + plain_text.count(textualist_term) return hit_count def textualist_yes(plain_text): # Remove references to "dictionary act" if "dictionary act" in plain_text: plain_text = plain_text.replace("dictionary act", "") for textualist_term in textualist_list: if textualist_term in plain_text: return 1 return 0 # Calculates the number of substantive tax canons within a particular document. def substantive_canon_count(plain_text): hit_count = 0 # Add every occurrence of a substantive canon for canon in tax_sub_canons_list: hit_count = hit_count + plain_text.count(canon) return hit_count # Calculates the number of textualist terms within a particular document, using regex for fuzzy matching. def textualist_count_fuzzy(plain_text): tf_count = 0 # Add every occurrence of a term related to textualism # Linguistic canons tf_count = tf_count + len(regex.findall(r"(expressio unius){e<=1}", plain_text)) tf_count = tf_count + len(regex.findall(r"(noscitur a sociis){e<=1}", plain_text)) tf_count = tf_count + len(regex.findall(r"(ejusdem generis){e<=1}", plain_text)) tf_count = tf_count + len(regex.findall(r"(last antecedent){e<=1}", plain_text)) return tf_count def string_contains_faithful_agency(plain_text): for fa_string in faithful_agent_list: if fa_string in plain_text: return True return False def normative_count(plain_text): hit_count = 0 # Remove erroneous matches plain_text = plain_text.replace("treasury inspector general for tax administration", "") # Refers to a specific line on an IRS form plain_text = plain_text.replace("effective tax administration", "") # In order to avoid statutory interpretation, remove all sentences containing "faithful agency" # (i.e., legislative history or textualist terms). plain_text = ".".join([sentence for sentence in plain_text.split('.') if not string_contains_faithful_agency(sentence)]) # if "burden" in plain_text: # plain_text = plain_text.replace("recordkeeping burden", "") # plain_text = plain_text.replace("reporting burden", "") # plain_text = plain_text.replace("burden on form", "") # plain_text = plain_text.replace("burden for form", "") # plain_text = plain_text.replace("annual burden", "") # plain_text = plain_text.replace("paperwork burden", "") # plain_text = plain_text.replace("burdens of ownership", "") # plain_text = plain_text.replace("burden of ownership", "") # plain_text = plain_text.replace("burden of pro", "") # Add every occurrence of a normative term for normative_term in normative_neutral_list: hit_count = hit_count + plain_text.count(normative_term) return hit_count + normative_pro_taxpayer_count(plain_text) def normative_yes(plain_text): # Remove erroneous matches plain_text = plain_text.replace("treasury inspector general for tax administration", "") # Refers to a specific line on an IRS form plain_text = plain_text.replace("effective tax administration", "") # In order to avoid statutory interpretation, remove all sentences containing "faithful agency" # (i.e., legislative history or textualist terms). plain_text = ".".join([sentence for sentence in plain_text.split('.') if not string_contains_faithful_agency(sentence)]) for normative_term in normative_neutral_list: if normative_term in plain_text: return 1 return 0 def normative_pro_taxpayer_count(plain_text): hit_count = 0 # Remove erroneous matches if "fairness" in plain_text: plain_text = plain_text.replace("small business regulatory enforcement fairness act", "") # Add every occurrence of a normative pro-taxpayer term for pro_taxpayer_term in normative_pro_taxpayer_list: hit_count = hit_count + plain_text.count(pro_taxpayer_term) return hit_count # Returns number of sentences within plain_text including an item from must_have_list_1 AND must_have_list_2 def within_sentence_count(plain_text, must_have_list_1, must_have_list_2): count = 0 sentences = plain_text.split(".") for sentence in sentences: sentence_contains = False for must_have_1 in must_have_list_1: if must_have_1 in sentence: for must_have_2 in must_have_list_2: if must_have_2 in sentence: sentence_contains = True if sentence_contains: # print(sentence) count = count + 1 return count def interpretive_count(plain_text): return within_sentence_count(plain_text, ["statute", "statutory", "legislation", "congress", "code", "section"], ["interpret", "construe", "construing", "construction", "reading"]) def interpretive_yes(plain_text): if interpretive_count(plain_text) > 0: return 1 else: return 0 class Tf_counter: hit_list: [] def __init__(self, hit_list=None): self.hit_list = hit_list def tf_list_count (self, text=""): count = 0 for term in self.hit_list: count = count + text.count(term) return count # Given a block of text, returns all of the text after the first occurrence of a phrase indicating the start of # and administrative/procedural section. def get_first_occurrence_of_admin(plain_text, year): admin_phrase_list = ["administrative, procedural, and miscellaneous", \ "administrative, procedural and miscellaneous", \ "administrative and miscellaneous matters"] min_index = 10000000000 min_phrase = None for phrase in admin_phrase_list: index = plain_text.find(phrase) if index != -1 and index < min_index: min_index = index min_phrase = phrase if min_phrase: split_list = plain_text.split(min_phrase) print(str(year) + " has phrase: " + min_phrase) return min_phrase.join(split_list[1:]) return "" def is_leg_phrase_in_text(plain_text, leg_phrase_list): for phrase in leg_phrase_list: if phrase in plain_text: return True return False # Simple pre-processing to remove whitespace and lower-case. Also has some specific cleaning for IRB. # Include spellcheck. def pre_process_IRB_with_spellcheck(text, year): leg_plus_year = "LEGISLATION. " + str(year) + "-" if leg_plus_year in text: text = text.split(leg_plus_year)[0] print("Has LEGISLATION: " + str(year)) # Regularize unicode dashes. # Replace all dashes with spaces text = text.replace('\u2013', '-') text = text.replace('\u2014', '-') text = text.replace('\u2015', '-') # Make the plain text entirely lowercase. text = text.lower() # Replace all whitespace with single spaces (to avoid problems with line breaks and multiple spaces). text = " ".join(text.split()) # Remove hyphenation. text = text.replace("- ", "") # Given a block of text, iterates through and eliminates legislation, setting aside administrative text first. leg_phrase_list = ["legislation and treaties table", "treaties and tax legislation contents", "treaties and tax legislation table", \ "treaties and tax legislation subpart", "legislation and treaties subpart", "committee reports. " + str(year)] leg_phrase_in_text = is_leg_phrase_in_text(text, leg_phrase_list) while leg_phrase_in_text == True: min_index = 10000000000 min_phrase = None for phrase in leg_phrase_list: index = text.find(phrase) if index != -1 and index < min_index: min_index = index min_phrase = phrase if min_phrase: split_list = text.split(min_phrase) print(str(year) + " has phrase: " + min_phrase) remainder_of_text = min_phrase.join(split_list[1:]) text = split_list[0] + get_first_occurrence_of_admin(remainder_of_text, year) leg_phrase_in_text = is_leg_phrase_in_text(text, leg_phrase_list) # replace "petitioner" with "plaintiff" # self.plain_text = self.plain_text.replace("petitioner", "plaintiff") # replace "respondent" with "defendant" # self.plain_text = self.plain_text.replace("respondent", "defendant") # replace "commissioner" with "defendant" # self.plain_text = self.plain_text.replace("commissioner", "defendant") # Removes any words that contain no letters, only numbers or symbols. text = "".join([i for i in text if i.isalpha() or i == "." or i == " " or i == "-"]) # Iterates through and corrects spelling by changing words to the closest correct match. # Uses a Levenshtein Distance of 1, rather than 2. spell = SpellChecker(distance=1) # Exempt all words and phrases we're trying to search for spell.word_frequency.load_words(get_all_analyzed_strings()) new_text = "" for word in text.split(): # Excise invalid 1-letter words. if len(word) == 1: if word != "a" and word != "i": # print ("Excluding: " + word) continue # If a word has a period or dash, add it without further analysis. if "-" in word or "." in word: new_text = new_text + word + " " elif word in spell: new_text = new_text + word + " " else: new_text = new_text + spell.correction(word) + " " return new_text # Simple pre-processing to remove whitespace and lower-case. Also has some specific cleaning for IRB. # No spellcheck. def pre_process_IRB(text, year): # print("Pre-processing: " + str(year)) leg_plus_year = "LEGISLATION. " + str(year) + "-" if leg_plus_year in text: text = text.split(leg_plus_year)[0] print("Has LEGISLATION: " + str(year)) # Regularize unicode dashes. # Replace all dashes with spaces text = text.replace('\u2013', '-') text = text.replace('\u2014', '-') text = text.replace('\u2015', '-') # Make the plain text entirely lowercase. text = text.lower() # Replace all whitespace with single spaces (to avoid problems with line breaks and multiple spaces). text = " ".join(text.split()) # Remove hyphenation. text = text.replace("- ", "") # print("Removing irrelevant information: " + str(year)) # Given a block of text, iterates through and eliminates legislation, setting aside administrative text first. leg_phrase_list = ["legislation and treaties table", "treaties and tax legislation contents", "treaties and tax legislation table", \ "treaties and tax legislation subpart", "legislation and treaties subpart", "committee reports. " + str(year)] leg_phrase_in_text = is_leg_phrase_in_text(text, leg_phrase_list) while leg_phrase_in_text == True: min_index = 10000000000 min_phrase = None for phrase in leg_phrase_list: index = text.find(phrase) if index != -1 and index < min_index: min_index = index min_phrase = phrase if min_phrase: split_list = text.split(min_phrase) print(str(year) + " has phrase: " + min_phrase) remainder_of_text = min_phrase.join(split_list[1:]) text = split_list[0] + get_first_occurrence_of_admin(remainder_of_text, year) leg_phrase_in_text = is_leg_phrase_in_text(text, leg_phrase_list) # replace "petitioner" with "plaintiff" # self.plain_text = self.plain_text.replace("petitioner", "plaintiff") # replace "respondent" with "defendant" # self.plain_text = self.plain_text.replace("respondent", "defendant") # replace "commissioner" with "defendant" # self.plain_text = self.plain_text.replace("commissioner", "defendant") # print("Regularizing text: " + str(year)) # Removes any words that contain no letters, only numbers or symbols. text = "".join([i for i in text if i.isalpha() or i == "." or i == " " or i == "-"]) new_text = "" # print("Removing invalid 1-letter words: " + str(year)) text = " ".join([word for word in text.split() if len(word) > 1 or word == "a" or word == "i"]) return text # This method processes the opinion to remove the most conspicuous procedural phrases that may provide biased results. # For regs, this function also removes def pre_process_reg(text): # For regulations, remove the text of the regulation in order to isolate the text of the preamble. text = text.split("\nDrafting Information")[0] text = text.split("\nList of Subjects")[0] text = text.split("\nProposed Amendments")[0] text = text.split("\nProposed Effective Date")[0] # Strip HTML tags, if any text = strip_tags(text) # Make the plain text entirely lowercase. text = text.lower() # Replace all whitespace with single spaces (to avoid problems with line breaks and multiple spaces). text = " ".join(text.split()) # replace "petitioner" with "plaintiff" # self.plain_text = self.plain_text.replace("petitioner", "plaintiff") # replace "respondent" with "defendant" # self.plain_text = self.plain_text.replace("respondent", "defendant") # replace "commissioner" with "defendant" # self.plain_text = self.plain_text.replace("commissioner", "defendant") return text def td_count(text): count = 0 td_list = [m.start() for m in re.finditer('t\.d\.', text)] td_list = td_list + [m.start() for m in re.finditer('t\. d\.', text)] for index in td_list: substring_to_check = text[index:index + 200] no_matches = True if "amended" in substring_to_check: no_matches = False if "modified" in substring_to_check: no_matches = False if "action" in substring_to_check: no_matches = False if no_matches: continue substring = text[index + 5:] end = substring.find("approved") if end == -1: end = 9999999 alt_end_list = ["secretary of the treasury", "commissioner of internal revenue", "filed", "january", "february", "march", "april", "june", "july", "august", "september", "october", "november", "december", "t.d.", "t. d."] for alt_end in alt_end_list: alt_end_loc = substring.find(alt_end) if alt_end_loc < end and alt_end_loc != -1: end = alt_end_loc if end == 9999999: end = 0 td_text = substring[:end] if len(td_text) > 50000: print(td_text) count = count + len(td_text.split()) return count