Multimodal Inputs

PDF Extraction

import instructor from openai import OpenAI from pydantic import BaseModel, Field from typing import List import tempfile from pdf2image import convert_from_path client = instructor.from_openai(OpenAI()) class Section(BaseModel): title: str = Field(description="Section title") content: str = Field(description="Section content") class Document(BaseModel): title: str = Field(description="Document title") author: str = Field(description="Document author") sections: List[Section] = Field(description="Document sections") summary: str = Field(description="Brief document summary") def extract_from_pdf_page(pdf_path: str, page_number: int = 0) -> Document: """Extract structured information from a PDF page.""" # Convert the PDF page to an image images = convert_from_path(pdf_path, first_page=page_number+1, last_page=page_number+1) # Save the image to a temporary file with tempfile.NamedTemporaryFile(suffix='.jpg', delete=False) as temp: temp_path = temp.name images[0].save(temp_path, 'JPEG') # Create an Image object image = instructor.Image.from_path(temp_path) # Extract information using vision capabilities return client.chat.completions.create( model="gpt-4-vision-preview", response_model=Document, messages=[ { "role": "system", "content": "Extract structured information from this document page." }, { "role": "user", "content": [ "Extract the complete document structure from this page:", image ] } ] ) def process_pdf_document(pdf_path: str, max_pages: int = 5) -> List[Document]: """Process multiple pages from a PDF document.""" results = [] # Get the number of pages from pypdf import PdfReader reader = PdfReader(pdf_path) num_pages = len(reader.pages) actual_pages = min(num_pages, max_pages) # Process each page for i in range(actual_pages): page_result = extract_from_pdf_page(pdf_path, i) results.append(page_result) print(f"Processed page {i+1}/{actual_pages}") return results from typing import List, Optional from pydantic import BaseModel, Field import instructor from openai import OpenAI from pdf2image import convert_from_path import tempfile client = instructor.from_openai(OpenAI()) class LineItem(BaseModel): description: str = Field(description="Description of the item or service") quantity: float = Field(description="Quantity of the item") unit_price: float = Field(description="Price per unit") amount: float = Field(description="Total amount for this line") class Invoice(BaseModel): invoice_number: str = Field(description="Invoice identifier") date: str = Field(description="Invoice date") vendor: str = Field(description="Name of the vendor/seller") customer: str = Field(description="Name of the customer/buyer") items: List[LineItem] = Field(description="Line items in the invoice") subtotal: float = Field(description="Sum of all items before tax") tax: Optional[float] = Field(None, description="Tax amount") total: float = Field(description="Total invoice amount") def extract_invoice(pdf_path: str) -> Invoice: """Extract structured invoice data from a PDF.""" # Convert first page to image images = convert_from_path(pdf_path, first_page=1, last_page=1) # Save to temp file with tempfile.NamedTemporaryFile(suffix='.jpg', delete=False) as temp: temp_path = temp.name images[0].save(temp_path, 'JPEG') # Create image object image = instructor.Image.from_path(temp_path) # Extract invoice data return client.chat.completions.create( model="gpt-4-vision-preview", response_model=Invoice, messages=[ { "role": "system", "content": "You are an invoice processing assistant that extracts structured data from invoice images." }, { "role": "user", "content": [ "Extract complete invoice details from this document:", image ] } ] )

Instructor provides support for working with PDF documents through a combination of vision capabilities and structured extraction.

import instructor
from openai import OpenAI
from pydantic import BaseModel, Field
from typing import List
import tempfile
from pdf2image import convert_from_path

Initialize the client with instructor

client = instructor.from_openai(OpenAI())

Define models for document extraction

class Section(BaseModel):
    title: str = Field(description="Section title")
    content: str = Field(description="Section content")

class Document(BaseModel):
    title: str = Field(description="Document title")
    author: str = Field(description="Document author")
    sections: List[Section] = Field(description="Document sections")
    summary: str = Field(description="Brief document summary")

Extract content from a PDF page as an image

def extract_from_pdf_page(pdf_path: str, page_number: int = 0) -> Document:
    """Extract structured information from a PDF page."""
    # Convert the PDF page to an image
    images = convert_from_path(pdf_path, first_page=page_number+1, last_page=page_number+1)

    # Save the image to a temporary file
    with tempfile.NamedTemporaryFile(suffix='.jpg', delete=False) as temp:
        temp_path = temp.name
        images[0].save(temp_path, 'JPEG')

    # Create an Image object
    image = instructor.Image.from_path(temp_path)

    # Extract information using vision capabilities
    return client.chat.completions.create(
        model="gpt-4-vision-preview",
        response_model=Document,
        messages=[
            {
                "role": "system",
                "content": "Extract structured information from this document page."
            },
            {
                "role": "user",
                "content": [
                    "Extract the complete document structure from this page:",
                    image
                ]
            }
        ]
    )

Process multiple pages

def process_pdf_document(pdf_path: str, max_pages: int = 5) -> List[Document]:
    """Process multiple pages from a PDF document."""
    results = []

    # Get the number of pages
    from pypdf import PdfReader
    reader = PdfReader(pdf_path)
    num_pages = len(reader.pages)
    actual_pages = min(num_pages, max_pages)

    # Process each page
    for i in range(actual_pages):
        page_result = extract_from_pdf_page(pdf_path, i)
        results.append(page_result)
        print(f"Processed page {i+1}/{actual_pages}")

    return results

For more specific document types, you can create specialized models:

from typing import List, Optional
from pydantic import BaseModel, Field
import instructor
from openai import OpenAI
from pdf2image import convert_from_path
import tempfile

Initialize the client

client = instructor.from_openai(OpenAI())

Define a model for invoice extraction

class LineItem(BaseModel):
    description: str = Field(description="Description of the item or service")
    quantity: float = Field(description="Quantity of the item")
    unit_price: float = Field(description="Price per unit")
    amount: float = Field(description="Total amount for this line")

class Invoice(BaseModel):
    invoice_number: str = Field(description="Invoice identifier")
    date: str = Field(description="Invoice date")
    vendor: str = Field(description="Name of the vendor/seller")
    customer: str = Field(description="Name of the customer/buyer")
    items: List[LineItem] = Field(description="Line items in the invoice")
    subtotal: float = Field(description="Sum of all items before tax")
    tax: Optional[float] = Field(None, description="Tax amount")
    total: float = Field(description="Total invoice amount")

Extract invoice from PDF

def extract_invoice(pdf_path: str) -> Invoice:
    """Extract structured invoice data from a PDF."""
    # Convert first page to image
    images = convert_from_path(pdf_path, first_page=1, last_page=1)

    # Save to temp file
    with tempfile.NamedTemporaryFile(suffix='.jpg', delete=False) as temp:
        temp_path = temp.name
        images[0].save(temp_path, 'JPEG')

    # Create image object
    image = instructor.Image.from_path(temp_path)

    # Extract invoice data
    return client.chat.completions.create(
        model="gpt-4-vision-preview",
        response_model=Invoice,
        messages=[
            {
                "role": "system",
                "content": "You are an invoice processing assistant that extracts structured data from invoice images."
            },
            {
                "role": "user",
                "content": [
                    "Extract complete invoice details from this document:",
                    image
                ]
            }
        ]
    )

Running the Example

First, install Instructor and any dependencies

$ pip install instructor pydantic

Run the Python script

$ python pdf-extraction.py

Further Information

Documentation link 1