import unicodedata from io import StringIO from pdfminer.high_level import extract_text, extract_text_to_fp from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter from pdfminer.converter import HTMLConverter, TextConverter from pdfminer.pdfpage import PDFPage from pdfminer.layout import LAParams from tqdm import tqdm from bs4 import BeautifulSoup import pandas as pd class PDFReader(): def __init__(self): pass def read(self, path, html=False): text = StringIO() if html: with open(path, "rb") as f: extract_text_to_fp(f, text, laparams=LAParams(), output_type="html", codec=None) text = text.getvalue() else: text = extract_text(path) return text def read_pages(self, path, html=False, laparams=None, maxpages=0, page_numbers=None, password="", scale=1.0, rotation=0, layoutmode='normal', output_dir=None, strip_control=False, debug=False, disable_caching=False, **kwargs): rsrcmgr = PDFResourceManager(caching=True) pages = [] with open(path, "rb") as f: for page in PDFPage.get_pages(f, None, maxpages=0, check_extractable=True): page.rotate = (page.rotate + rotation) % 360 text = StringIO() if html: device = HTMLConverter(rsrcmgr, text, codec=None, scale=scale, layoutmode=layoutmode, laparams=laparams) else: device = TextConverter(rsrcmgr, text, codec=None, laparams=laparams) interpreter = PDFPageInterpreter(rsrcmgr, device) interpreter.process_page(page) pages.append(text.getvalue()) device.close() return pages def read_to_frame(self, path): items = [] pages = self.read_pages(path, html=True) page_index = 0 for i, p in enumerate(pages): p = p.replace("
", "\n").replace("
", "\n") html = BeautifulSoup(p, "html.parser") if not html: continue page = html.get_text("\n") contents = page.split("\n\n") content_index = 0 for j, s in enumerate(contents): c = s.strip() if not c: continue item = { "page": page_index, "order": content_index, "content": c } items.append(item) content_index += 1 page_index += 1 df = pd.DataFrame(items) return df def normalize(self, text): if text is None: return "" _text = text.replace("\r", "").replace("\n", "").strip() _text = unicodedata.normalize("NFKC", _text) return _text def preprocess_frame(self, df, lower=True): repeat_check = [] preprocessed = [] for i, row in df.iterrows(): content = self.normalize(row["content"]) if lower: content = content.lower() if content in repeat_check: continue repeat_check.append(content) item = {} for c in df.columns: item[c] = row[c] if c == "content": item[c] = content preprocessed.append(item) df = pd.DataFrame(preprocessed) return df