Audio Extraction
Instructor supports extracting structured data from audio files using the `Audio` class, making it easy to process speech and audio content.
import instructor
from openai import OpenAI
from pydantic import BaseModel, Field
from instructor.multimodal import Audio
Initialize the client with instructor
client = instructor.from_openai(OpenAI())
Define a model for audio transcription
class AudioTranscription(BaseModel):
text: str = Field(description="Full transcription of the audio")
speaker: str = Field(description="Identity of the speaker if known")
language: str = Field(description="Language spoken in the audio")
confidence: float = Field(description="Confidence score for the transcription", ge=0.0, le=1.0)
Extract transcription from audio
def transcribe_audio(audio_path: str) -> AudioTranscription:
"""Extract structured transcription from an audio file."""
# Load the audio using Instructor's Audio class
audio = Audio.from_path(audio_path)
return client.chat.completions.create(
model="gpt-4o-audio-preview", # Audio-capable model
response_model=AudioTranscription,
messages=[
{
"role": "user",
"content": [
"Transcribe this audio file and identify the speaker and language:",
audio # The Audio object is handled automatically
]
}
]
)
For more specific information extraction from audio:
from typing import List, Optional
from pydantic import BaseModel, Field
import instructor
from openai import OpenAI
from instructor.multimodal import Audio
Initialize the client with instructor
client = instructor.from_openai(OpenAI())
Define a model for person information
class Person(BaseModel):
name: str = Field(description="Person's full name")
age: int = Field(description="Person's age in years")
occupation: Optional[str] = Field(None, description="Person's job or profession if mentioned")
Define a model for meeting information
class MeetingPoint(BaseModel):
topic: str = Field(description="Topic discussed")
decision: Optional[str] = Field(None, description="Decision made on this topic")
action_items: List[str] = Field(default_factory=list, description="Action items related to this topic")
class Meeting(BaseModel):
title: str = Field(description="Meeting title or purpose")
date: Optional[str] = Field(None, description="Meeting date if mentioned")
participants: List[str] = Field(description="Names of meeting participants")
key_points: List[MeetingPoint] = Field(description="Key discussion points and decisions")
summary: str = Field(description="Brief summary of the meeting")
Extract structured information from audio
def extract_meeting_info(audio_path: str) -> Meeting:
"""Extract structured meeting information from audio recording."""
audio = Audio.from_path(audio_path)
return client.chat.completions.create(
model="gpt-4o-audio-preview",
response_model=Meeting,
messages=[
{
"role": "system",
"content": "Extract detailed meeting information from this audio recording."
},
{
"role": "user",
"content": [
"Extract the complete meeting details from this recording:",
audio
]
}
]
)
Extract person information
def extract_person_from_audio(audio_path: str) -> Person:
"""Extract structured person information from audio."""
audio = Audio.from_path(audio_path)
return client.chat.completions.create(
model="gpt-4o-audio-preview",
response_model=Person,
messages=[
{
"role": "user",
"content": [
"Extract the person's name, age, and occupation from this audio:",
audio
]
}
]
)
Running the Example
First, install Instructor and any dependencies
$ pip install instructor pydantic
Run the Python script
$ python audio-extraction.py