Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,13 @@ All notable changes to this project will be documented in this file. The format

## [Unreleased]

### Added
- Add `PDF.table_of_contents` and `Page.table_of_contents` properties to expose document outlines (bookmarks) directly through pdfplumber. This enables easy access to a document’s Table of Contents for navigation or metadata extraction. ([#1034](https://github.com/jsvine/pdfplumber/issues/1034) by @AbdullahMehmoodAwan)

## [0.11.7] - 2025-06-12

### Added
- Add access to `Page.trimbox`, `Page.bleedbox`, and `Page.artbox` (h/t @samuelbradshaw). ([#1313](https://github.com/jsvine/pdfplumber/issues/1313) + [7e364e6](https://github.com/jsvine/pdfplumber/commit/7e364e6193c6e8bafa9b46587c0fdd4a46405399))

Expand Down
9 changes: 9 additions & 0 deletions pdfplumber/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,15 @@ def structure_tree(self) -> List[Dict[str, Any]]:
return [elem.to_dict() for elem in PDFStructTree(self.pdf, self)]
except StructTreeMissing:
return []

@property
def table_of_contents(self) -> List[Dict[str, Any]]:
    """
    Return the document-level table of contents (outline).

    This delegates to ``table_of_contents`` on the parent PDF
    (``self.pdf``), so the result describes the whole document; it is
    not filtered down to the entries that point at this page.
    """
    return self.pdf.table_of_contents


@property
def layout(self) -> LTPage:
Expand Down
28 changes: 28 additions & 0 deletions pdfplumber/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,3 +203,31 @@ def to_dict(self, object_types: Optional[List[str]] = None) -> Dict[str, Any]:
"metadata": self.metadata,
"pages": [page.to_dict(object_types) for page in self.pages],
}

@property
def table_of_contents(self) -> List[Dict[str, Any]]:
    """
    Return the document's outline (table of contents), if available.

    Each entry is a dictionary with three keys:

    - ``"title"``: the outline entry's title, as yielded by
      ``self.doc.get_outlines()``.
    - ``"level"``: the entry's nesting level, as yielded by
      ``self.doc.get_outlines()``.
    - ``"page_number"``: the value of ``self.doc.pageid2num(...)`` for
      the entry's destination page, or ``None`` when the destination
      exposes no page or the lookup fails.

    Extraction is best-effort and never raises: if ``self.doc`` has no
    ``get_outlines`` method an empty list is returned, and if reading
    the outline fails part-way, the error is logged at debug level and
    the entries collected so far are returned.
    """
    outlines: List[Dict[str, Any]] = []
    try:
        if hasattr(self.doc, "get_outlines"):
            for level, title, dest, _action, _se in self.doc.get_outlines():
                page_number = None
                # NOTE(review): a page number is only resolved when the
                # destination exposes a truthy `.page` and the document
                # provides `pageid2num` -- confirm both hold for the
                # objects the underlying PDF library actually yields;
                # otherwise `page_number` will always be None.
                page = getattr(dest, "page", None) if dest else None
                if page:
                    try:
                        page_number = self.doc.pageid2num(page.idnum)
                    except Exception as e:
                        # Keep the entry, but leave a trace of why its
                        # page could not be resolved.
                        logger.debug(
                            "Unable to resolve outline page number: %s", e
                        )
                outlines.append(
                    {
                        "title": title,
                        "page_number": page_number,
                        "level": level,
                    }
                )
    except Exception as e:
        # Deliberately broad: a malformed or missing outline must not
        # prevent the rest of the document from being used.
        logger.debug("Unable to extract outlines: %s", e)
    return outlines

Binary file added tests/pdfs/toc-sample.pdf
Binary file not shown.
24 changes: 24 additions & 0 deletions tests/test_table_of_contents.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
"""
Test for PDF.table_of_contents and Page.table_of_contents properties
"""
import pdfplumber

def test_table_of_contents_property():
    """
    Check the shape of ``PDF.table_of_contents`` and that
    ``Page.table_of_contents`` returns the same data.
    """
    # Imported locally so this function is self-contained; the fixture
    # is resolved relative to this test file rather than the current
    # working directory, so the test passes wherever pytest is run from.
    from pathlib import Path

    sample_pdf_path = Path(__file__).resolve().parent / "pdfs" / "toc-sample.pdf"

    with pdfplumber.open(sample_pdf_path) as pdf:
        toc = pdf.table_of_contents

        # 1. Check the property exists and is a list
        assert isinstance(toc, list)

        # 2. Every TOC entry (if any) must contain the documented keys
        for entry in toc:
            assert "title" in entry
            assert "level" in entry
            assert "page_number" in entry

        # 3. Verify the Page.table_of_contents matches PDF.table_of_contents
        assert toc == pdf.pages[0].table_of_contents