from io import StringIO
import getopt
import os
import sys

from pdfminer3.converter import TextConverter
from pdfminer3.layout import LAParams
from pdfminer3.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer3.pdfpage import PDFPage
from PyPDF2 import PdfFileReader


def convert(fname, pages=None):
    """Extract the text of a PDF file and return it as one string.

    Args:
        fname: Path of the PDF file to read.
        pages: Optional iterable of page numbers handed to
            ``PDFPage.get_pages``. When falsy, an empty set is passed
            (presumably meaning "all pages" -- confirm against pdfminer3).

    Returns:
        The text written by pdfminer's ``TextConverter`` for the processed
        pages.
    """
    pagenums = set(pages) if pages else set()
    manager = PDFResourceManager()

    # The context managers / finally guarantee the buffer, the converter and
    # the input file are released even if a page fails to parse.
    with StringIO() as output:
        converter = TextConverter(manager, output, laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(manager, converter)
            with open(fname, 'rb') as infile:
                for page in PDFPage.get_pages(infile, pagenums):
                    interpreter.process_page(page)
        finally:
            converter.close()
        # Read the buffer before the ``with`` block closes it.
        return output.getvalue()


def convertMultiple(pdfDir, txtDir):
    """Convert every PDF in ``pdfDir`` to a text file in ``txtDir``.

    Each ``<name>.pdf`` produces ``<name>.pdf.txt`` in ``txtDir``.

    Args:
        pdfDir: Directory to scan for PDFs. An empty value means the current
            working directory.
        txtDir: Directory the ``.txt`` files are written to.
    """
    if not pdfDir:
        pdfDir = os.getcwd()  # if no pdfDir passed in

    for pdf in os.listdir(pdfDir):  # iterate through entries in pdf directory
        # splitext avoids matching a file literally named "pdf"; lower()
        # also accepts ".PDF".
        if os.path.splitext(pdf)[1].lower() != ".pdf":
            continue

        # os.path.join works whether or not the directory has a trailing
        # separator.
        pdfFilename = os.path.join(pdfDir, pdf)
        text = convert(pdfFilename)  # get string of text content of pdf

        textFilename = os.path.join(txtDir, pdf + ".txt")
        # NOTE(review): uses the platform default encoding, as before; text
        # with characters outside that encoding will raise -- consider an
        # explicit encoding once downstream readers are confirmed.
        with open(textFilename, "w") as textFile:  # make text file
            textFile.write(text)  # write text to text file


def count_pages(pdf_dir):
    """Count the pages of all readable PDFs in a folder.

    Every entry of ``pdf_dir`` is attempted; entries that cannot be read as a
    PDF are reported and skipped. The total is printed and returned.

    Args:
        pdf_dir: Directory whose entries are opened with ``PdfFileReader``.

    Returns:
        The summed page count of the entries that could be read.
    """
    page_count = 0
    for pdf in os.listdir(pdf_dir):  # iterate through entries in pdf directory
        pdfFilename = os.path.join(pdf_dir, pdf)
        print("Reading: " + pdfFilename)
        try:
            # Keep the file open only while PyPDF2 needs it.
            with open(pdfFilename, "rb") as handle:
                page_count += PdfFileReader(handle).getNumPages()
        except Exception as exc:
            # Catch Exception rather than everything so Ctrl-C still stops
            # the scan; include the cause since not every failure is a
            # missing EOF marker.
            print(f"No EOF! Yeesh ({exc})")
    print(page_count)
    return page_count


# NOTE(review): the statements below run on import as well as when the file is
# executed as a script, and use machine-specific paths -- consider moving them
# under an ``if __name__ == "__main__":`` guard.
pdfDir = "C:/Users/thech/Data/IRB Website PDFs/"
txtDir = "C:/Users/thech/Data/IRB Texts - Unmerged/"
#convertMultiple(pdfDir, txtDir)
count_pages(pdfDir)