Skip to content

create_searchable_pdf

Create a searchable PDF from an image and OCR results.

引数:

名前 タイプ デスクリプション デフォルト
images List[Image]

A list of page images, one per page; BGR arrays are converted to RGB Pillow images before being embedded.

必須
docs List[DocumentAnalyzerSchema]

A list of OCR results.

必須
output_path str

Path to the output PDF file.

必須
font_path str

Path to the font file. Defaults to None.

None
image_quality str

Image quality preset ("high", "middle", "low"). Defaults to "high".

'high'
ソースコード位置: src/yomitoku/utils/searchable_pdf.py
def create_searchable_pdf(
    images: List[Image.Image],
    docs: List[Any],
    output_path: str,
    font_path: Optional[str] = None,
    image_quality: str = "high",
):
    """
    Create a searchable PDF from page images and their OCR results.

    Each page image is embedded as a JPEG background and the recognized
    words are drawn on top of it as fully transparent text, so the page
    looks unchanged while its text can be selected and searched.

    Args:
        images (List[Image.Image]): Page images, one per page. Items that
            are not Pillow images are treated as BGR arrays indexed as
            ``[row, column, channel]`` and converted to RGB.
        docs (List[DocumentAnalyzerSchema]): OCR results, paired with
            ``images`` by position. Pairing stops at the shorter list.
        output_path (str): Path to the output PDF file.
        font_path (str, optional): Path to the font file. Defaults to None,
            in which case ``FONT_PATH`` is used.
        image_quality (str, optional): Image quality preset ("high", "middle", "low").
            Defaults to "high". The value is looked up in
            ``IMAGE_QUALITY_PRESETS``; an unknown name fails at that lookup.
    """
    # Imported locally so this function does not rely on the module-level
    # imports already providing ImageReader.
    from reportlab.lib.utils import ImageReader

    if font_path is None:
        font_path = FONT_PATH

    preset = IMAGE_QUALITY_PRESETS[image_quality]
    jpeg_quality = preset["jpeg_quality"]
    max_long_side = preset["max_long_side"]

    pdfmetrics.registerFont(TTFont("MPLUS1p-Medium", font_path))

    packet = BytesIO()
    c = canvas.Canvas(packet)

    for image, doc in zip(images, docs):
        if isinstance(image, Image.Image):
            # JPEG cannot store alpha or palette modes, so normalize to RGB.
            image = image.convert("RGB")
        else:
            image = Image.fromarray(image[:, :, ::-1])  # Convert BGR to RGB
        orig_w, orig_h = image.size

        embed_image = image
        if max_long_side is not None and max(orig_w, orig_h) > max_long_side:
            scale = max_long_side / max(orig_w, orig_h)
            # Clamp to 1 px so extreme aspect ratios never produce a zero size.
            new_w = max(1, int(orig_w * scale))
            new_h = max(1, int(orig_h * scale))
            embed_image = image.resize((new_w, new_h), Image.LANCZOS)

        # Encode in memory instead of going through a temporary file in the
        # current working directory: no name collisions between concurrent
        # runs, no leftover file if drawing fails, and no need for a
        # writable working directory.
        jpeg_buffer = BytesIO()
        embed_image.save(jpeg_buffer, format="JPEG", quality=jpeg_quality)
        jpeg_buffer.seek(0)

        # Page size is based on original image dimensions to preserve text coordinates
        c.setPageSize((orig_w, orig_h))
        c.drawImage(ImageReader(jpeg_buffer), 0, 0, width=orig_w, height=orig_h)

        # Collect all text containers. "sub_order" is always a 2-tuple so the
        # sort keys stay comparable even if containers of different kinds
        # share the same "order" value.
        containers = []
        for p in doc.paragraphs:
            containers.append(
                {
                    "box": p.box,
                    "order": p.order,
                    "sub_order": (0, 0),
                    "direction": p.direction,
                    "type": "paragraph",
                },
            )
        for t in doc.tables:
            for cell in t.cells:
                containers.append(
                    {
                        "box": cell.box,
                        "order": t.order,
                        "sub_order": (cell.row, cell.col),
                        "direction": "horizontal",  # Assuming table text is horizontal
                        "type": "table_cell",
                    },
                )

            if t.caption is not None:
                containers.append(
                    {
                        "box": t.caption.box,
                        "order": t.order,
                        # Sorts the caption ahead of every cell of its table.
                        "sub_order": (-1, -1),
                        "direction": t.caption.direction,
                        "type": "table_caption",
                    },
                )

        for f in doc.figures:
            for para_idx, p in enumerate(f.paragraphs):
                containers.append(
                    {
                        "box": p.box,
                        "order": f.order,
                        "sub_order": (para_idx, 0),
                        "direction": p.direction,
                        "type": "figure_paragraph",
                    },
                )

            if f.caption is not None:
                containers.append(
                    {
                        "box": f.caption.box,
                        "order": f.order,
                        "sub_order": (0, 0),
                        "direction": f.caption.direction,
                        "type": "figure_caption",
                    },
                )

        # Sort containers by reading order (stable, so ties keep insertion order)
        containers.sort(key=lambda item: (item["order"], item["sub_order"]))

        # Compute every word rectangle once per page; it is reused for the
        # containment test, the in-container sort and the final drawing.
        word_boxes = [(word, _poly2rect(word.points)) for word in doc.words]

        ordered_words = []
        for container in containers:
            container_words = [
                (word, box)
                for word, box in word_boxes
                if is_contained(container["box"], box, 0.5)
            ]

            # Sort words within the container
            if container["direction"] == "vertical":
                # Right-to-left column, then top-to-bottom
                container_words.sort(key=lambda wb: (-wb[1][0], wb[1][1]))
            else:
                # Top-to-bottom, then left-to-right
                container_words.sort(key=lambda wb: (wb[1][1], wb[1][0]))
            ordered_words.extend(container_words)

        # Set transparent color for text (re-applied for every page)
        text_color = Color(1, 1, 1, alpha=0)
        c.setFillColor(text_color)

        for word, bbox in ordered_words:
            text = word.content
            direction = word.direction

            x1, y1, x2, y2 = bbox
            bbox_height = y2 - y1
            bbox_width = x2 - x1

            if direction == "vertical":
                text = to_full_width(text)
                font_size = _calc_font_size(text, bbox_width, bbox_height)
            else:
                font_size = _calc_font_size(text, bbox_height, bbox_width)

            if not font_size:
                continue

            c.setFont("MPLUS1p-Medium", font_size)

            if direction == "vertical":
                # Adjust for vertical text rendering: the image origin is
                # top-left while the PDF origin is bottom-left, hence the
                # flip through orig_h.
                base_y = orig_h - y1
                char_height = bbox_height / len(text) if text else 0
                char_x = x1 + (bbox_width - font_size) / 2

                for j, ch in enumerate(text):
                    char_y = base_y - (j * char_height) - char_height / 2

                    c.saveState()
                    c.translate(char_x, char_y + font_size / 2)
                    c.rotate(-90)
                    c.drawString(0, 0, ch)
                    c.restoreState()
            else:
                # Vertically center the text line inside the word box.
                base_y = orig_h - y2 + (bbox_height - font_size) * 0.5
                c.drawString(x1, base_y, text)

        c.showPage()

    c.save()

    with open(output_path, "wb") as f:
        f.write(packet.getvalue())