>>> import json
>>> # Parse a PDF document with default OCR mode
>>> _ = session.sql("CREATE OR REPLACE TEMP STAGE mystage ENCRYPTION = (TYPE = 'SNOWFLAKE_SSE')").collect()
>>> _ = session.file.put("tests/resources/doc.pdf", "@mystage", auto_compress=False)
>>> from snowflake.snowpark.functions import col, to_file
>>> df = session.create_dataframe([["@mystage/doc.pdf"]], schema=["file_path"]) # staged file path
>>> result_df = df.ai.parse_document(
... input_column=to_file(col("file_path")),
... output_column="parsed",
... )
>>> result_df.columns
['FILE_PATH', 'PARSED']
>>> result = json.loads(result_df.collect()[0]["PARSED"])
>>> "Sample PDF" in result["content"] and result["metadata"]["pageCount"] == 3
True
>>> # Parse with LAYOUT mode to extract tables and structure
>>> _ = session.file.put("tests/resources/invoice.pdf", "@mystage", auto_compress=False)
>>> df = session.create_dataframe([["@mystage/invoice.pdf"]], schema=["file_path"])
>>> result_df = df.ai.parse_document(
... input_column=to_file(col("file_path")),
... output_column="parsed",
... mode='LAYOUT',
... )
>>> result = json.loads(result_df.collect()[0]["PARSED"])
>>> "| Customer Name |" in result["content"] and "| Country |" in result["content"]
True
>>> # Parse with page splitting for long documents (PDF only)
>>> df = session.create_dataframe([["@mystage/doc.pdf"]], schema=["file_path"])
>>> result_df = df.ai.parse_document(
... input_column=to_file(col("file_path")),
... output_column="parsed",
... page_split=True,
... )
>>> result = json.loads(result_df.collect()[0]["PARSED"])
>>> len(result["pages"]) == 3 and result["pages"][0]["index"] == 0
True