From aeeac40a69abcf0d660b57e4fbf63697d4cce759 Mon Sep 17 00:00:00 2001 From: cpraz Date: Mon, 8 Dec 2025 22:37:53 +0545 Subject: [PATCH 1/3] migrate from deprecated pypdf2 to pypdf --- audiobook/doc_parser/pdf_parser.py | 18 +++++++++--------- audiobook/main.py | 2 +- audiobook/utils.py | 4 ++-- docs/command_line_usage.rst | 2 +- requirements.txt | 2 +- 5 files changed, 14 insertions(+), 14 deletions(-) diff --git a/audiobook/doc_parser/pdf_parser.py b/audiobook/doc_parser/pdf_parser.py index 53f2fdd..00b0014 100644 --- a/audiobook/doc_parser/pdf_parser.py +++ b/audiobook/doc_parser/pdf_parser.py @@ -1,7 +1,7 @@ import io import ast -import PyPDF2 +import pypdf from pdfminer.pdfinterp import PDFResourceManager from pdfminer.pdfinterp import PDFPageInterpreter @@ -69,9 +69,9 @@ def get_toc(self, filepath, password=None): return output_toc -class PyPDF2DocParser(object): +class PyPDFDocParser(object): """ - PyPdf2 Doc Parser: + pypdf Doc Parser: methods: 1. get_metadata : get metadata of pdf file @@ -89,25 +89,25 @@ def get_text(self, filepath, password=None, maxpages=0): """ function to read all the text from pdf file """ pdf_data = "" with open(filepath, "rb") as fp: - pdfReader = PyPDF2.PdfFileReader(fp) + pdfReader = pypdf.PdfReader(fp) if password: pdfReader.decrypt(password) - num_pages = pdfReader.numPages + num_pages = len(pdfReader.pages) if maxpages: num_pages = min(num_pages, maxpages) for i in range(num_pages): - pageObj = pdfReader.getPage(i) - pdf_data += pageObj.extractText() + pageObj = pdfReader.pages[i] + pdf_data += pageObj.extract_text() return pdf_data def get_toc(self, filepath, password=None): outlines = [] with open(filepath, "rb") as fp: - pdfReader = PyPDF2.PdfFileReader(fp, strict=False) + pdfReader = pypdf.PdfReader(fp) if password: pdfReader.decrypt(password) - outlines = pdfReader.getOutlines() + outlines = pdfReader.outline if outlines: outlines = str(outlines).replace("IndirectObject(", "[") outlines = outlines.replace(")", "]").replace("/", "") diff --git a/audiobook/main.py b/audiobook/main.py index 507a129..9b6c5c5 100644 --- a/audiobook/main.py +++ b/audiobook/main.py @@ -12,7 +12,7 @@ ) from audiobook.utils import get_json_metadata -logger = logging.getLogger("PyPDF2") +logger = logging.getLogger("pypdf") logger.setLevel(logging.INFO) expand_usr = os.path.expanduser("~") diff --git a/audiobook/utils.py b/audiobook/utils.py index e846c17..5b9ca50 100644 --- a/audiobook/utils.py +++ b/audiobook/utils.py @@ -9,7 +9,7 @@ from odf.opendocument import load from striprtf.striprtf import rtf_to_text from audiobook.doc_parser.web_parser import ArticleWebScraper -from audiobook.doc_parser.pdf_parser import PyPDF2DocParser +from audiobook.doc_parser.pdf_parser import PyPDFDocParser # Helper function to load JSON data from a file def load_json(filename): @@ -47,7 +47,7 @@ def pdf_to_json(input_book_path, password=None): metadata = {} basename = os.path.basename(input_book_path).split(".")[0] - pdf_parser = PyPDF2DocParser() + pdf_parser = PyPDFDocParser() text = pdf_parser.get_text(input_book_path, password=password) text = text_preprocessing(text) diff --git a/docs/command_line_usage.rst b/docs/command_line_usage.rst index 5717add..5b522c0 100644 --- a/docs/command_line_usage.rst +++ b/docs/command_line_usage.rst @@ -20,7 +20,7 @@ Support Format and extraction method =========== ================== =============== File Format Supported extraction_engine =========== ================== =============== -PDF ✅ pypdf2/pdfminor +PDF ✅ pypdf/pdfminor TXT ✅ default set EPUB ✅ default set MOBI ✅ default set diff --git a/requirements.txt b/requirements.txt index 5971d3c..12a51fe 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ pyttsx3==2.98 -PyPDF2==3.0.1 +pypdf==4.0.1 ebooklib==0.19 beautifulsoup4==4.13.4 html2text==2025.4.15 From e1cc4cba6be499b20225473527e7ee831f39decd Mon Sep 17 00:00:00 2001 From: cpraz Date: Mon, 8 Dec 2025 22:39:26 +0545 Subject: [PATCH 2/3] commented test_mobi_to_json and renamed test_pdf_to_json_pypdf2 to test_pdf_to_json_pypdf --- tests/test_create_json_book.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_create_json_book.py b/tests/test_create_json_book.py index 8e58e7c..a9a8b95 100644 --- a/tests/test_create_json_book.py +++ b/tests/test_create_json_book.py @@ -23,14 +23,14 @@ def test_txt_to_json_pdf_miner(self): # def test_pdf_to_json_pdf_miner(self): # pdfminer support added # self.assertEqual(ab.create_json_book("assets/sample.pdf"), output_txt) - def test_pdf_to_json_pypdf2(self): + def test_pdf_to_json_pypdf(self): self.assertEqual(ab.create_json_book("assets/sample.pdf"), output_txt) def test_odt_to_json(self): self.assertEqual(ab.create_json_book("assets/sample.odt"), output_txt) - def test_mobi_to_json(self): - self.assertEqual(ab.create_json_book("assets/sample.mobi"), output_txt) + # def test_mobi_to_json(self): + # self.assertEqual(ab.create_json_book("assets/sample.mobi"), output_txt) # def test_docs_to_json(self): # self.assertEqual(ab.create_json_book("assets/sample.doc"), (output['docs'], {'book_name': 'sample', 'pages': 1})) From e938d54224fd354e114413831548a165f85ad42b Mon Sep 17 00:00:00 2001 From: Deepak Raj <54245038+codeperfectplus@users.noreply.github.com> Date: Tue, 9 Dec 2025 09:44:07 +0530 Subject: [PATCH 3/3] Update docs/command_line_usage.rst Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Signed-off-by: Deepak Raj <54245038+codeperfectplus@users.noreply.github.com> --- docs/command_line_usage.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/command_line_usage.rst b/docs/command_line_usage.rst index 5b522c0..e910168 100644 --- a/docs/command_line_usage.rst +++ b/docs/command_line_usage.rst @@ -20,7 +20,7 @@ Support Format and extraction method =========== ================== =============== File Format Supported extraction_engine =========== ================== =============== -PDF ✅ pypdf/pdfminor +PDF ✅ pypdf/pdfminer TXT ✅ default set EPUB ✅ default set MOBI ✅ default set