Skip to content

DocumentAnalyzerSchema

Bases: BaseSchema

ソースコード位置: src/yomitoku/schemas/document_analyzer.py
# Container for the results of one document analysis: preprocessing info,
# paragraphs, tables, words and figures.
# Every field is declared with `Field(...)` (no default value), so each one
# is presumably required at construction time — confirm against `BaseSchema`.
# The `to_*` methods are thin wrappers: each forwards `self`, `out_path` and
# all keyword options to the matching `export_*` helper and returns its result.
class DocumentAnalyzerSchema(BaseSchema):
    # Preprocessing information; the annotation allows None.
    preprocess: Union[PreprocessSchema, None] = Field(
        ..., description="Preprocessing information of the document"
    )
    # Detected paragraphs.
    paragraphs: List[ParagraphSchema] = Field(
        ..., description="List of detected paragraphs"
    )
    # Detected tables.
    tables: List[TableStructureRecognizerSchema] = Field(
        ..., description="List of detected tables"
    )
    # Recognized words.
    words: List[WordPrediction] = Field(..., description="List of recognized words")
    # Detected figures.
    figures: List[FigureSchema] = Field(..., description="List of detected figures")

    def to_html(self, out_path: str, **kwargs: Any) -> str:
        """
        Export the document analysis results to an HTML file.

        This method delegates to the `export_html` function, passing this instance,
        `out_path` and all keyword arguments through unchanged, and returns
        whatever `export_html` returns.

        Args:
            out_path (str): The file path where the HTML output will be saved.
            **kwargs: Additional keyword arguments for `export_html`:

                ignore_line_break (bool, optional):
                    Whether to ignore line breaks in the text content. Defaults to False.

                export_figure (bool, optional):
                    Whether to export figures detected in the document. Defaults to True.

                export_figure_letter (bool, optional):
                    Whether to export individual letters from figures. Defaults to False.

                img (np.ndarray, optional):
                    The input image associated with the document. Required if `export_figure` is True.

                figure_width (int, optional):
                    The width (in pixels) of the exported figures in the HTML. Defaults to 200.

                figure_dir (str, optional):
                    The directory where figures will be saved if `export_figure` is True. Defaults to "figures".

                encoding (str, optional):
                    The encoding to use for the HTML file. Defaults to "utf-8".

        Returns:
            str: The formatted HTML content as a string.
        """
        return export_html(self, out_path, **kwargs)

    def to_markdown(self, out_path: str, **kwargs: Any) -> str:
        """
        Export the document analysis results to a Markdown file.

        This method delegates to the `export_markdown` function, passing this instance,
        `out_path` and all keyword arguments through unchanged, and returns
        whatever `export_markdown` returns.

        Args:
            out_path (str): The file path where the Markdown output will be saved.
            **kwargs: Additional keyword arguments for `export_markdown`:

                ignore_line_break (bool, optional):
                    Whether to ignore line breaks in the text content. Defaults to False.

                export_figure (bool, optional):
                    Whether to export figures detected in the document. Defaults to True.

                export_figure_letter (bool, optional):
                    Whether to export individual letters from figures. Defaults to False.

                img (np.ndarray, optional):
                    The input image associated with the document. Required if `export_figure` is True.

                figure_width (int, optional):
                    The width (in pixels) of the exported figures in the Markdown. Defaults to 200.

                figure_dir (str, optional):
                    The directory where figures will be saved if `export_figure` is True. Defaults to "figures".

                encoding (str, optional):
                    The encoding to use for the Markdown file. Defaults to "utf-8".

        Returns:
            str: The formatted Markdown content as a string.
        """
        return export_markdown(self, out_path, **kwargs)

    def to_csv(self, out_path: str, **kwargs: Any) -> list[dict[str, Any]]:
        """
        Export the document analysis results to a CSV file.

        This method delegates to the `export_csv` function, passing this instance,
        `out_path` and all keyword arguments through unchanged, and returns
        whatever `export_csv` returns.

        Args:
            out_path (str): The file path where the CSV output will be saved.

            **kwargs: Additional keyword arguments for `export_csv`:

                ignore_line_break (bool, optional):
                    Whether to ignore line breaks in the text content. Defaults to False.

                img (np.ndarray, optional):
                    The input image associated with the document. Required if `export_figure` is True.

                export_figure (bool, optional):
                    Whether to export figures detected in the document. Defaults to True.

                export_figure_letter (bool, optional):
                    Whether to export individual letters from figures. Defaults to False.

                figure_dir (str, optional):
                    The directory where figures will be saved if `export_figure` is True. Defaults to "figures".

                encoding (str, optional):
                    The encoding to use for the CSV file. Defaults to "utf-8".

        Returns:
            list[dict[str, Any]]: A list of elements representing the exported data,
                sorted by their order in the document.
        """
        return export_csv(self, out_path, **kwargs)

    def to_json(self, out_path: str, **kwargs: Any) -> "DocumentAnalyzerSchema":
        """
        Export the document analysis results to a JSON file.

        This method delegates to the `export_json` function, passing this instance,
        `out_path` and all keyword arguments through unchanged, and returns
        whatever `export_json` returns.

        Args:
            out_path (str): The file path where the JSON output will be saved.

            **kwargs: Additional keyword arguments for `export_json`:

                ignore_line_break (bool, optional):
                    Whether to ignore line breaks in the text content. Defaults to False.

                img (np.ndarray, optional):
                    The input image associated with the document. Required if `export_figure` is True.

                export_figure (bool, optional):
                    Whether to include figures detected in the document in the JSON output. Defaults to False.

                figure_dir (str, optional):
                    The directory where figures will be saved if `export_figure` is True. Defaults to "figures".

                encoding (str, optional):
                    The encoding to use for the JSON file. Defaults to "utf-8".

        Returns:
            DocumentAnalyzerSchema: The converted document analysis results.
        """
        return export_json(self, out_path, **kwargs)

to_csv(out_path, **kwargs)

Export the document analysis results to a CSV file.

This method uses the export_csv function to convert the document analysis results into a CSV format and save it to the specified file path.

引数:

名前 タイプ デスクリプション デフォルト
out_path str

The file path where the CSV output will be saved.

必須
**kwargs Any

Additional keyword arguments for export_csv:

ignore_line_break (bool, optional): Whether to ignore line breaks in the text content. Defaults to False.

img (np.ndarray, optional): The input image associated with the document. Required if export_figure is True.

export_figure (bool, optional): Whether to export figures detected in the document. Defaults to True.

export_figure_letter (bool, optional): Whether to export individual letters from figures. Defaults to False.

figure_dir (str, optional): The directory where figures will be saved if export_figure is True. Defaults to "figures".

encoding (str, optional): The encoding to use for the CSV file. Defaults to "utf-8".

{}

戻り値:

名前 タイプ デスクリプション
list list[dict[str, Any]]

A list of elements representing the exported data, sorted by their order in the document.

ソースコード位置: src/yomitoku/schemas/document_analyzer.py
def to_csv(self, out_path: str, **kwargs: Any) -> list[dict[str, Any]]:
    """
    Save the analysis results as a CSV file by delegating to `export_csv`.

    The instance, `out_path` and every keyword argument are forwarded to
    `export_csv` unchanged; its return value is handed back to the caller.

    Args:
        out_path (str): Destination path of the CSV file.

        **kwargs: Options passed through to `export_csv`:

            ignore_line_break (bool, optional):
                If True, line breaks in the text content are ignored. Defaults to False.

            img (np.ndarray, optional):
                Input image of the document; needed when `export_figure` is True.

            export_figure (bool, optional):
                Whether detected figures are exported. Defaults to True.

            export_figure_letter (bool, optional):
                Whether individual letters inside figures are exported. Defaults to False.

            figure_dir (str, optional):
                Directory that receives the figures when `export_figure` is True.
                Defaults to "figures".

            encoding (str, optional):
                Encoding of the CSV file. Defaults to "utf-8".

    Returns:
        list[dict[str, Any]]: The exported elements, sorted by their order
            in the document.
    """
    exported_rows = export_csv(self, out_path, **kwargs)
    return exported_rows

to_html(out_path, **kwargs)

Export the document analysis results to an HTML file.

This method uses the export_html function to convert the document analysis results into an HTML format and save it to the specified file path.

引数:

名前 タイプ デスクリプション デフォルト
out_path str

The file path where the HTML output will be saved.

必須
**kwargs Any

Additional keyword arguments for export_html:

ignore_line_break (bool, optional): Whether to ignore line breaks in the text content. Defaults to False.

export_figure (bool, optional): Whether to export figures detected in the document. Defaults to True.

export_figure_letter (bool, optional): Whether to export individual letters from figures. Defaults to False.

img (np.ndarray, optional): The input image associated with the document. Required if export_figure is True.

figure_width (int, optional): The width (in pixels) of the exported figures in the HTML. Defaults to 200.

figure_dir (str, optional): The directory where figures will be saved if export_figure is True. Defaults to "figures".

encoding (str, optional): The encoding to use for the HTML file. Defaults to "utf-8".

{}

戻り値:

名前 タイプ デスクリプション
str str

The formatted HTML content as a string.

ソースコード位置: src/yomitoku/schemas/document_analyzer.py
def to_html(self, out_path: str, **kwargs: Any) -> str:
    """
    Save the analysis results as an HTML file by delegating to `export_html`.

    The instance, `out_path` and every keyword argument are forwarded to
    `export_html` unchanged; its return value is handed back to the caller.

    Args:
        out_path (str): Destination path of the HTML file.
        **kwargs: Options passed through to `export_html`:

            ignore_line_break (bool, optional):
                If True, line breaks in the text content are ignored. Defaults to False.

            export_figure (bool, optional):
                Whether detected figures are exported. Defaults to True.

            export_figure_letter (bool, optional):
                Whether individual letters inside figures are exported. Defaults to False.

            img (np.ndarray, optional):
                Input image of the document; needed when `export_figure` is True.

            figure_width (int, optional):
                Width in pixels of the figures embedded in the HTML. Defaults to 200.

            figure_dir (str, optional):
                Directory that receives the figures when `export_figure` is True.
                Defaults to "figures".

            encoding (str, optional):
                Encoding of the HTML file. Defaults to "utf-8".

    Returns:
        str: The formatted HTML content.
    """
    html_content = export_html(self, out_path, **kwargs)
    return html_content

to_json(out_path, **kwargs)

Export the document analysis results to a JSON file.

This method uses the export_json function to convert the document analysis results into a JSON format and save it to the specified file path.

引数:

名前 タイプ デスクリプション デフォルト
out_path str

The file path where the JSON output will be saved.

必須
**kwargs Any

Additional keyword arguments for export_json:

ignore_line_break (bool, optional): Whether to ignore line breaks in the text content. Defaults to False.

img (np.ndarray, optional): The input image associated with the document. Required if export_figure is True.

export_figure (bool, optional): Whether to include figures detected in the document in the JSON output. Defaults to False.

figure_dir (str, optional): The directory where figures will be saved if export_figure is True. Defaults to "figures".

encoding (str, optional): The encoding to use for the JSON file. Defaults to "utf-8".

{}

戻り値:

名前 タイプ デスクリプション
DocumentAnalyzerSchema DocumentAnalyzerSchema

The converted document analysis results.

ソースコード位置: src/yomitoku/schemas/document_analyzer.py
def to_json(self, out_path: str, **kwargs: Any) -> "DocumentAnalyzerSchema":
    """
    Save the analysis results as a JSON file by delegating to `export_json`.

    The instance, `out_path` and every keyword argument are forwarded to
    `export_json` unchanged; its return value is handed back to the caller.

    Args:
        out_path (str): Destination path of the JSON file.

        **kwargs: Options passed through to `export_json`:

            ignore_line_break (bool, optional):
                If True, line breaks in the text content are ignored. Defaults to False.

            img (np.ndarray, optional):
                Input image of the document; needed when `export_figure` is True.

            export_figure (bool, optional):
                Whether detected figures are included in the JSON output. Defaults to False.

            figure_dir (str, optional):
                Directory that receives the figures when `export_figure` is True.
                Defaults to "figures".

            encoding (str, optional):
                Encoding of the JSON file. Defaults to "utf-8".

    Returns:
        DocumentAnalyzerSchema: The converted document analysis results.
    """
    converted = export_json(self, out_path, **kwargs)
    return converted

to_markdown(out_path, **kwargs)

Export the document analysis results to a Markdown file.

This method uses the export_markdown function to convert the document analysis results into a Markdown format and save it to the specified file path.

引数:

名前 タイプ デスクリプション デフォルト
out_path str

The file path where the Markdown output will be saved.

必須
**kwargs Any

Additional keyword arguments for export_markdown:

ignore_line_break (bool, optional): Whether to ignore line breaks in the text content. Defaults to False.

export_figure (bool, optional): Whether to export figures detected in the document. Defaults to True.

export_figure_letter (bool, optional): Whether to export individual letters from figures. Defaults to False.

img (np.ndarray, optional): The input image associated with the document. Required if export_figure is True.

figure_width (int, optional): The width (in pixels) of the exported figures in the Markdown. Defaults to 200.

figure_dir (str, optional): The directory where figures will be saved if export_figure is True. Defaults to "figures".

encoding (str, optional): The encoding to use for the Markdown file. Defaults to "utf-8".

{}

戻り値:

名前 タイプ デスクリプション
str str

The formatted Markdown content as a string.

ソースコード位置: src/yomitoku/schemas/document_analyzer.py
def to_markdown(self, out_path: str, **kwargs: Any) -> str:
    """
    Save the analysis results as a Markdown file by delegating to `export_markdown`.

    The instance, `out_path` and every keyword argument are forwarded to
    `export_markdown` unchanged; its return value is handed back to the caller.

    Args:
        out_path (str): Destination path of the Markdown file.
        **kwargs: Options passed through to `export_markdown`:

            ignore_line_break (bool, optional):
                If True, line breaks in the text content are ignored. Defaults to False.

            export_figure (bool, optional):
                Whether detected figures are exported. Defaults to True.

            export_figure_letter (bool, optional):
                Whether individual letters inside figures are exported. Defaults to False.

            img (np.ndarray, optional):
                Input image of the document; needed when `export_figure` is True.

            figure_width (int, optional):
                Width in pixels of the figures embedded in the Markdown. Defaults to 200.

            figure_dir (str, optional):
                Directory that receives the figures when `export_figure` is True.
                Defaults to "figures".

            encoding (str, optional):
                Encoding of the Markdown file. Defaults to "utf-8".

    Returns:
        str: The formatted Markdown content.
    """
    markdown_content = export_markdown(self, out_path, **kwargs)
    return markdown_content