Multimodal Inputs

Table Extraction

import instructor from openai import OpenAI from pydantic import BaseModel, Field import pandas as pd from typing import Annotated, Any from io import StringIO from pydantic import ( BaseModel, BeforeValidator, PlainSerializer, InstanceOf, WithJsonSchema, ) client = instructor.from_openai(OpenAI(), mode=instructor.Mode.MD_JSON) def to_markdown(df: pd.DataFrame) -> str: """Convert a dataframe to markdown format.""" return df.to_markdown() def md_to_df(data: Any) -> Any: """Convert markdown table to a pandas dataframe.""" if isinstance(data, str): return ( pd.read_csv( StringIO(data), sep="|", index_col=1, ) .dropna(axis=1, how="all") .iloc[1:] .map(lambda x: x.strip()) ) return data MarkdownDataFrame = Annotated[ InstanceOf[pd.DataFrame], BeforeValidator(md_to_df), PlainSerializer(to_markdown), WithJsonSchema( { "type": "string", "description": "The markdown representation of the table", } ), ] class Table(BaseModel): caption: str = Field(description="A descriptive caption for the table") dataframe: MarkdownDataFrame = Field( description="The table data as a markdown table" ) def extract_table_from_image(image_path_or_url: str) -> Table: """Extract a table from an image and return it as a structured object.""" if image_path_or_url.startswith(("http://", "https://")): image = instructor.Image.from_url(image_path_or_url) else: image = instructor.Image.from_path(image_path_or_url) return client.chat.completions.create( model="gpt-4-vision-preview", response_model=Table, max_tokens=1800, messages=[ { "role": "user", "content": [ { "type": "text", "text": "Extract the table from this image with a descriptive caption.", }, image, ], } ], ) def analyze_table_data(image_path: str): """Extract and analyze a table from an image.""" table = extract_table_from_image(image_path) print(f"Table Caption: {table.caption}") print("\nExtracted Table:") print(table.dataframe) # Perform data analysis if it's a pandas DataFrame if isinstance(table.dataframe, pd.DataFrame): print("\nData Analysis:") print(f"- Rows: {len(table.dataframe)}") print(f"- Columns: {len(table.dataframe.columns)}") # Basic statistics if numeric columns exist numeric_cols = table.dataframe.select_dtypes(include=["number"]).columns if len(numeric_cols) > 0: print("\nNumeric Column Statistics:") for col in numeric_cols: col_data = table.dataframe[col] print( f"- {col}: Min={col_data.min()}, Max={col_data.max()}, Mean={col_data.mean():.2f}" ) return table.dataframe return None def extract_multiple_tables(image_path_or_url: str) -> list[Table]: """Extract all tables from an image.""" if image_path_or_url.startswith(("http://", "https://")): image = instructor.Image.from_url(image_path_or_url) else: image = instructor.Image.from_path(image_path_or_url) tables = client.chat.completions.create_iterable( model="gpt-4-vision-preview", response_model=Table, max_tokens=1800, messages=[ { "role": "user", "content": [ { "type": "text", "text": "Extract all tables from this image. Each table should be separate and have its own caption.", }, image, ], } ], ) return list(tables) def analyze_multiple_tables(image_path: str): """Extract and analyze all tables from an image.""" tables = extract_multiple_tables(image_path) print(f"Found {len(tables)} tables in the image.") for i, table in enumerate(tables): print(f"\n--- Table {i+1}: {table.caption} ---") print(table.dataframe) if isinstance(table.dataframe, pd.DataFrame): yield table.dataframe

Instructor makes it easy to extract structured tables from images and convert them to pandas dataframes for analysis.

import instructor
from openai import OpenAI
from pydantic import BaseModel, Field
import pandas as pd
from typing import Annotated, Any
from io import StringIO
from pydantic import (
    BaseModel,
    BeforeValidator,
    PlainSerializer,
    InstanceOf,
    WithJsonSchema,
)

Initialize the client with instructor (MD_JSON mode works well for tables)

client = instructor.from_openai(OpenAI(), mode=instructor.Mode.MD_JSON)

Define functions to convert between dataframe and markdown

def to_markdown(df: pd.DataFrame) -> str:
    """Convert a dataframe to markdown format."""
    return df.to_markdown()


def md_to_df(data: Any) -> Any:
    """Convert markdown table to a pandas dataframe."""
    if isinstance(data, str):
        return (
            pd.read_csv(
                StringIO(data),
                sep="|",
                index_col=1,
            )
            .dropna(axis=1, how="all")
            .iloc[1:]
            .map(lambda x: x.strip())
        )
    return data

Define a custom type for markdown dataframes

MarkdownDataFrame = Annotated[
    InstanceOf[pd.DataFrame],
    BeforeValidator(md_to_df),
    PlainSerializer(to_markdown),
    WithJsonSchema(
        {
            "type": "string",
            "description": "The markdown representation of the table",
        }
    ),
]

Define the table model

class Table(BaseModel):
    caption: str = Field(description="A descriptive caption for the table")
    dataframe: MarkdownDataFrame = Field(
        description="The table data as a markdown table"
    )

Function to extract tables from images
Create an image object or use autodetect

def extract_table_from_image(image_path_or_url: str) -> Table:
    """Extract a table from an image and return it as a structured object."""
    if image_path_or_url.startswith(("http://", "https://")):
        image = instructor.Image.from_url(image_path_or_url)
    else:
        image = instructor.Image.from_path(image_path_or_url)

    return client.chat.completions.create(
        model="gpt-4-vision-preview",
        response_model=Table,
        max_tokens=1800,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Extract the table from this image with a descriptive caption.",
                    },
                    image,
                ],
            }
        ],
    )

Example usage

def analyze_table_data(image_path: str):
    """Extract and analyze a table from an image."""
    table = extract_table_from_image(image_path)

    print(f"Table Caption: {table.caption}")
    print("\nExtracted Table:")
    print(table.dataframe)

    # Perform data analysis if it's a pandas DataFrame
    if isinstance(table.dataframe, pd.DataFrame):
        print("\nData Analysis:")
        print(f"- Rows: {len(table.dataframe)}")
        print(f"- Columns: {len(table.dataframe.columns)}")

        # Basic statistics if numeric columns exist
        numeric_cols = table.dataframe.select_dtypes(include=["number"]).columns
        if len(numeric_cols) > 0:
            print("\nNumeric Column Statistics:")
            for col in numeric_cols:
                col_data = table.dataframe[col]
                print(
                    f"- {col}: Min={col_data.min()}, Max={col_data.max()}, Mean={col_data.mean():.2f}"
                )

        return table.dataframe

    return None

This would be called as:# df = analyze_table_data("path/to/table_image.jpg")# After this, you can use pandas operations on the dataframe# For multiple tables in a single image, you can use the iterable response:

def extract_multiple_tables(image_path_or_url: str) -> list[Table]:
    """Extract all tables from an image."""
    if image_path_or_url.startswith(("http://", "https://")):
        image = instructor.Image.from_url(image_path_or_url)
    else:
        image = instructor.Image.from_path(image_path_or_url)

    tables = client.chat.completions.create_iterable(
        model="gpt-4-vision-preview",
        response_model=Table,
        max_tokens=1800,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Extract all tables from this image. Each table should be separate and have its own caption.",
                    },
                    image,
                ],
            }
        ],
    )

    return list(tables)

Process multiple tables from one image
Convert to dataframe and return list of dataframes

def analyze_multiple_tables(image_path: str):
    """Extract and analyze all tables from an image."""
    tables = extract_multiple_tables(image_path)

    print(f"Found {len(tables)} tables in the image.")

    for i, table in enumerate(tables):
        print(f"\n--- Table {i+1}: {table.caption} ---")
        print(table.dataframe)

        if isinstance(table.dataframe, pd.DataFrame):
            yield table.dataframe

Running the Example

First, install Instructor and any dependencies

$ pip install instructor pydantic

Run the Python script

$ python table-extraction.py

Further Information

Documentation link 1