DocumentAnalyzerSchema¶

Bases: BaseSchema

ソースコード位置： src/yomitoku/schemas/document_analyzer.py

class DocumentAnalyzerSchema(BaseSchema):
    # Container for the results of analysing a single document. The fields
    # hold the detected components; each `to_*` method hands this object to
    # the matching `export_*` helper and returns whatever that helper returns.
    # NOTE: documented with comments on purpose; no class docstring is added
    # here so the model's metadata stays exactly as it was.

    preprocess: Union[PreprocessSchema, None] = Field(
        ...,
        description="Preprocessing information of the document",
    )
    paragraphs: List[ParagraphSchema] = Field(
        ...,
        description="List of detected paragraphs",
    )
    tables: List[TableStructureRecognizerSchema] = Field(
        ...,
        description="List of detected tables",
    )
    words: List[WordPrediction] = Field(
        ...,
        description="List of recognized words",
    )
    figures: List[FigureSchema] = Field(
        ...,
        description="List of detected figures",
    )

    def to_html(self, out_path: str, **kwargs: Any) -> str:
        """
        Write the analysis results to an HTML file.

        Delegates to `export_html`, passing this object, the output path and
        every extra keyword argument through unchanged.

        Args:
            out_path (str): Destination path of the HTML file.
            **kwargs: Options forwarded to `export_html`:

                ignore_line_break (bool, optional):
                    If True, line breaks in the text content are ignored.
                    Defaults to False.

                export_figure (bool, optional):
                    If True, figures detected in the document are exported.
                    Defaults to True.

                export_figure_letter (bool, optional):
                    If True, individual letters inside figures are exported.
                    Defaults to False.

                img (np.ndarray, optional):
                    Input image of the document. Required when
                    `export_figure` is True.

                figure_width (int, optional):
                    Width, in pixels, of exported figures in the HTML.
                    Defaults to 200.

                figure_dir (str, optional):
                    Directory that receives the figures when `export_figure`
                    is True. Defaults to "figures".

                encoding (str, optional):
                    Encoding of the HTML file. Defaults to "utf-8".

        Returns:
            str: The formatted HTML content.
        """
        html = export_html(self, out_path, **kwargs)
        return html

    def to_markdown(self, out_path: str, **kwargs: Any) -> str:
        """
        Write the analysis results to a Markdown file.

        Delegates to `export_markdown`, passing this object, the output path
        and every extra keyword argument through unchanged.

        Args:
            out_path (str): Destination path of the Markdown file.
            **kwargs: Options forwarded to `export_markdown`:

                ignore_line_break (bool, optional):
                    If True, line breaks in the text content are ignored.
                    Defaults to False.

                export_figure (bool, optional):
                    If True, figures detected in the document are exported.
                    Defaults to True.

                export_figure_letter (bool, optional):
                    If True, individual letters inside figures are exported.
                    Defaults to False.

                img (np.ndarray, optional):
                    Input image of the document. Required when
                    `export_figure` is True.

                figure_width (int, optional):
                    Width, in pixels, of exported figures in the Markdown.
                    Defaults to 200.

                figure_dir (str, optional):
                    Directory that receives the figures when `export_figure`
                    is True. Defaults to "figures".

                encoding (str, optional):
                    Encoding of the Markdown file. Defaults to "utf-8".

        Returns:
            str: The formatted Markdown content.
        """
        markdown = export_markdown(self, out_path, **kwargs)
        return markdown

    def to_csv(self, out_path: str, **kwargs: Any) -> list[dict[str, Any]]:
        """
        Write the analysis results to a CSV file.

        Delegates to `export_csv`, passing this object, the output path and
        every extra keyword argument through unchanged.

        Args:
            out_path (str): Destination path of the CSV file.
            **kwargs: Options forwarded to `export_csv`:

                ignore_line_break (bool, optional):
                    If True, line breaks in the text content are ignored.
                    Defaults to False.

                img (np.ndarray, optional):
                    Input image of the document. Required when
                    `export_figure` is True.

                export_figure (bool, optional):
                    If True, figures detected in the document are exported.
                    Defaults to True.

                export_figure_letter (bool, optional):
                    If True, individual letters inside figures are exported.
                    Defaults to False.

                figure_dir (str, optional):
                    Directory that receives the figures when `export_figure`
                    is True. Defaults to "figures".

                encoding (str, optional):
                    Encoding of the CSV file. Defaults to "utf-8".

        Returns:
            list: A list of elements representing the exported data,
                sorted by their order in the document.
        """
        rows = export_csv(self, out_path, **kwargs)
        return rows

    def to_json(self, out_path: str, **kwargs: Any) -> "DocumentAnalyzerSchema":
        """
        Write the analysis results to a JSON file.

        Delegates to `export_json`, passing this object, the output path and
        every extra keyword argument through unchanged.

        Args:
            out_path (str): Destination path of the JSON file.
            **kwargs: Options forwarded to `export_json`:

                ignore_line_break (bool, optional):
                    If True, line breaks in the text content are ignored.
                    Defaults to False.

                img (np.ndarray, optional):
                    Input image of the document. Required when
                    `export_figure` is True.

                export_figure (bool, optional):
                    If True, figures detected in the document are included in
                    the JSON output. Defaults to False.

                figure_dir (str, optional):
                    Directory that receives the figures when `export_figure`
                    is True. Defaults to "figures".

                encoding (str, optional):
                    Encoding of the JSON file. Defaults to "utf-8".

        Returns:
            DocumentAnalyzerSchema: The converted document analysis results.
        """
        converted = export_json(self, out_path, **kwargs)
        return converted

`to_csv(out_path, **kwargs)` ¶

Export the document analysis results to a CSV file.

This method uses the export_csv function to convert the document analysis results into a CSV format and save it to the specified file path.

引数：

名前	タイプ	デスクリプション	デフォルト
`out_path`	`str`	The file path where the CSV output will be saved.	必須
`**kwargs`	`Any`	Additional keyword arguments for `export_csv`: ignore_line_break (bool, optional): Whether to ignore line breaks in the text content. Defaults to False. img (np.ndarray, optional): The input image associated with the document. Required if `export_figure` is True. export_figure (bool, optional): Whether to export figures detected in the document. Defaults to True. export_figure_letter (bool, optional): Whether to export individual letters from figures. Defaults to False. figure_dir (str, optional): The directory where figures will be saved if `export_figure` is True. Defaults to "figures". encoding (str, optional): The encoding to use for the CSV file. Defaults to "utf-8".	`{}`

戻り値：

名前	タイプ	デスクリプション
`list`	`list[dict[str, Any]]`	A list of elements representing the exported data, sorted by their order in the document.

ソースコード位置： src/yomitoku/schemas/document_analyzer.py

def to_csv(self, out_path: str, **kwargs: Any) -> list[dict[str, Any]]:
    """
    Write the analysis results to a CSV file.

    Delegates to `export_csv`, passing this object, the output path and
    every extra keyword argument through unchanged.

    Args:
        out_path (str): Destination path of the CSV file.
        **kwargs: Options forwarded to `export_csv`:

            ignore_line_break (bool, optional):
                If True, line breaks in the text content are ignored.
                Defaults to False.

            img (np.ndarray, optional):
                Input image of the document. Required when
                `export_figure` is True.

            export_figure (bool, optional):
                If True, figures detected in the document are exported.
                Defaults to True.

            export_figure_letter (bool, optional):
                If True, individual letters inside figures are exported.
                Defaults to False.

            figure_dir (str, optional):
                Directory that receives the figures when `export_figure`
                is True. Defaults to "figures".

            encoding (str, optional):
                Encoding of the CSV file. Defaults to "utf-8".

    Returns:
        list: A list of elements representing the exported data,
            sorted by their order in the document.
    """
    rows = export_csv(self, out_path, **kwargs)
    return rows

`to_html(out_path, **kwargs)` ¶

Export the document analysis results to an HTML file.

This method uses the export_html function to convert the document analysis results into an HTML format and save it to the specified file path.

引数：

名前	タイプ	デスクリプション	デフォルト
`out_path`	`str`	The file path where the HTML output will be saved.	必須
`**kwargs`	`Any`	Additional keyword arguments for `export_html`: ignore_line_break (bool, optional): Whether to ignore line breaks in the text content. Defaults to False. export_figure (bool, optional): Whether to export figures detected in the document. Defaults to True. export_figure_letter (bool, optional): Whether to export individual letters from figures. Defaults to False. img (np.ndarray, optional): The input image associated with the document. Required if `export_figure` is True. figure_width (int, optional): The width (in pixels) of the exported figures in the HTML. Defaults to 200. figure_dir (str, optional): The directory where figures will be saved if `export_figure` is True. Defaults to "figures". encoding (str, optional): The encoding to use for the HTML file. Defaults to "utf-8".	`{}`

戻り値：

名前	タイプ	デスクリプション
`str`	`str`	The formatted HTML content as a string.

ソースコード位置： src/yomitoku/schemas/document_analyzer.py

def to_html(self, out_path: str, **kwargs: Any) -> str:
    """
    Write the analysis results to an HTML file.

    Delegates to `export_html`, passing this object, the output path and
    every extra keyword argument through unchanged.

    Args:
        out_path (str): Destination path of the HTML file.
        **kwargs: Options forwarded to `export_html`:

            ignore_line_break (bool, optional):
                If True, line breaks in the text content are ignored.
                Defaults to False.

            export_figure (bool, optional):
                If True, figures detected in the document are exported.
                Defaults to True.

            export_figure_letter (bool, optional):
                If True, individual letters inside figures are exported.
                Defaults to False.

            img (np.ndarray, optional):
                Input image of the document. Required when
                `export_figure` is True.

            figure_width (int, optional):
                Width, in pixels, of exported figures in the HTML.
                Defaults to 200.

            figure_dir (str, optional):
                Directory that receives the figures when `export_figure`
                is True. Defaults to "figures".

            encoding (str, optional):
                Encoding of the HTML file. Defaults to "utf-8".

    Returns:
        str: The formatted HTML content.
    """
    html = export_html(self, out_path, **kwargs)
    return html

`to_json(out_path, **kwargs)` ¶

Export the document analysis results to a JSON file.

This method uses the export_json function to convert the document analysis results into a JSON format and save it to the specified file path.

引数：

名前	タイプ	デスクリプション	デフォルト
`out_path`	`str`	The file path where the JSON output will be saved.	必須
`**kwargs`	`Any`	Additional keyword arguments for `export_json`: ignore_line_break (bool, optional): Whether to ignore line breaks in the text content. Defaults to False. img (np.ndarray, optional): The input image associated with the document. Required if `export_figure` is True. export_figure (bool, optional): Whether to include figures detected in the document in the JSON output. Defaults to False. figure_dir (str, optional): The directory where figures will be saved if `export_figure` is True. Defaults to "figures". encoding (str, optional): The encoding to use for the JSON file. Defaults to "utf-8".	`{}`

戻り値：

名前	タイプ	デスクリプション
`DocumentAnalyzerSchema`	`DocumentAnalyzerSchema`	The converted document analysis results.

ソースコード位置： src/yomitoku/schemas/document_analyzer.py

def to_json(self, out_path: str, **kwargs: Any) -> "DocumentAnalyzerSchema":
    """
    Write the analysis results to a JSON file.

    Delegates to `export_json`, passing this object, the output path and
    every extra keyword argument through unchanged.

    Args:
        out_path (str): Destination path of the JSON file.
        **kwargs: Options forwarded to `export_json`:

            ignore_line_break (bool, optional):
                If True, line breaks in the text content are ignored.
                Defaults to False.

            img (np.ndarray, optional):
                Input image of the document. Required when
                `export_figure` is True.

            export_figure (bool, optional):
                If True, figures detected in the document are included in
                the JSON output. Defaults to False.

            figure_dir (str, optional):
                Directory that receives the figures when `export_figure`
                is True. Defaults to "figures".

            encoding (str, optional):
                Encoding of the JSON file. Defaults to "utf-8".

    Returns:
        DocumentAnalyzerSchema: The converted document analysis results.
    """
    converted = export_json(self, out_path, **kwargs)
    return converted

`to_markdown(out_path, **kwargs)` ¶

Export the document analysis results to a Markdown file.

This method uses the export_markdown function to convert the document analysis results into a Markdown format and save it to the specified file path.

引数：

名前	タイプ	デスクリプション	デフォルト
`out_path`	`str`	The file path where the Markdown output will be saved.	必須
`**kwargs`	`Any`	Additional keyword arguments for `export_markdown`: ignore_line_break (bool, optional): Whether to ignore line breaks in the text content. Defaults to False. export_figure (bool, optional): Whether to export figures detected in the document. Defaults to True. export_figure_letter (bool, optional): Whether to export individual letters from figures. Defaults to False. img (np.ndarray, optional): The input image associated with the document. Required if `export_figure` is True. figure_width (int, optional): The width (in pixels) of the exported figures in the Markdown. Defaults to 200. figure_dir (str, optional): The directory where figures will be saved if `export_figure` is True. Defaults to "figures". encoding (str, optional): The encoding to use for the Markdown file. Defaults to "utf-8".	`{}`

戻り値：

名前	タイプ	デスクリプション
`str`	`str`	The formatted Markdown content as a string.

ソースコード位置： src/yomitoku/schemas/document_analyzer.py

def to_markdown(self, out_path: str, **kwargs: Any) -> str:
    """
    Write the analysis results to a Markdown file.

    Delegates to `export_markdown`, passing this object, the output path
    and every extra keyword argument through unchanged.

    Args:
        out_path (str): Destination path of the Markdown file.
        **kwargs: Options forwarded to `export_markdown`:

            ignore_line_break (bool, optional):
                If True, line breaks in the text content are ignored.
                Defaults to False.

            export_figure (bool, optional):
                If True, figures detected in the document are exported.
                Defaults to True.

            export_figure_letter (bool, optional):
                If True, individual letters inside figures are exported.
                Defaults to False.

            img (np.ndarray, optional):
                Input image of the document. Required when
                `export_figure` is True.

            figure_width (int, optional):
                Width, in pixels, of exported figures in the Markdown.
                Defaults to 200.

            figure_dir (str, optional):
                Directory that receives the figures when `export_figure`
                is True. Defaults to "figures".

            encoding (str, optional):
                Encoding of the Markdown file. Defaults to "utf-8".

    Returns:
        str: The formatted Markdown content.
    """
    markdown = export_markdown(self, out_path, **kwargs)
    return markdown

DocumentAnalyzerSchema¶

to_csv(out_path, **kwargs) ¶

to_html(out_path, **kwargs) ¶

to_json(out_path, **kwargs) ¶

to_markdown(out_path, **kwargs) ¶

`to_csv(out_path, **kwargs)` ¶

`to_html(out_path, **kwargs)` ¶

`to_json(out_path, **kwargs)` ¶

`to_markdown(out_path, **kwargs)` ¶