Skip to content
Open
8 changes: 8 additions & 0 deletions ak-py/src/agentkernel/framework/openai/openai.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,14 @@ async def run(self, agent: Any, session: Session, requests: list[AgentRequest])
# Multimodal case with images/files. When using multimodal inputs, OpenAI cannot handle session. So these inputs are not saved in the context
reply = (await Runner.run(agent.agent, message_content, session=None)).final_output

# Manually save the multimodal conversation to session for future reference
if session:
openai_session = session.get("openai") or session.set("openai", OpenAISession())
# Add user message
await openai_session.add_items([{"role": "user", "content": prompt}])
# Add assistant response
await openai_session.add_items([{"role": "assistant", "content": reply}])
Comment on lines 169 to +178
Copy link

Copilot AI Dec 26, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The manual session management code directly accesses and manipulates the OpenAI session's internal state using add_items. This approach bypasses the normal session management flow and could lead to inconsistencies. The comment mentions "OpenAI cannot handle session" for multimodal inputs, but then manually adds items to the session anyway. This workaround needs better documentation explaining why it's necessary and what the limitations are, or consider using a more robust approach through the OpenAI SDK's proper APIs.

Suggested change
# Multimodal case with images/files. When using multimodal inputs, OpenAI cannot handle session. So these inputs are not saved in the context
reply = (await Runner.run(agent.agent, message_content, session=None)).final_output
# Manually save the multimodal conversation to session for future reference
if session:
openai_session = session.get("openai") or session.set("openai", OpenAISession())
# Add user message
await openai_session.add_items([{"role": "user", "content": prompt}])
# Add assistant response
await openai_session.add_items([{"role": "assistant", "content": reply}])
# Multimodal case with images/files.
# NOTE: When using multimodal inputs, the underlying OpenAI SDK cannot reliably handle
# session state in the same way as text-only interactions. To avoid inconsistent behavior,
# we invoke the agent with `session=None` here, and then (best-effort) persist a simplified
# text representation of the interaction into the session for downstream consumers that
# rely on conversational history.
#
# Limitations:
# - Only the textual `prompt` and the assistant `reply` are stored; non-text multimodal
# content (e.g., images, files) is not preserved in the session history.
# - The stored context may therefore diverge from the true multimodal context seen by
# the model at generation time.
reply = (await Runner.run(agent.agent, message_content, session=None)).final_output
# Manually save the multimodal conversation to the OpenAI session for future reference,
# using the same session helper that is used in the text-only path.
if session:
openai_session = self._session(session)
if isinstance(openai_session, SessionABC):
# Add user message (text-only prompt)
await openai_session.add_items([{"role": "user", "content": prompt}])
# Add assistant response
await openai_session.add_items([{"role": "assistant", "content": reply}])

Copilot uses AI. Check for mistakes.

return AgentReplyText(text=str(reply), prompt=prompt)
except Exception as e:
return AgentReplyText(text=f"Error during agent execution: {str(e)}")
Expand Down
36 changes: 31 additions & 5 deletions ak-py/src/agentkernel/integration/instagram/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,29 +33,30 @@ This requires an **Instagram User Access Token** (starts with `IGAA...`).
### Configuration Steps

1. **Create a Meta App**

- Go to https://developers.facebook.com/apps
- Create a new app and select "Business" type
- Add **Instagram API** product (with Business Login)

2. **Set Up Business Login for Instagram**

- In **Use Cases**, select "Instagram Business"
- Click "API setup with Instagram login"
- Add required permissions:
- `instagram_business_basic`
- `instagram_business_manage_messages`

3. **Generate Access Token**

- Go to "Generate access tokens" section
- Add your Instagram Professional account
- Generate a token with the required permissions
- The token will start with `IGAA...`

4. **Get Your Credentials**

- **Access Token**: Generated from the step above (starts with `IGAA...`)
- **App Secret**: App > App Settings > Basic (for webhook signature verification)
- **Verify Token**: Create your own secure random string for webhook verification

5. **Configure Webhook**

- In "Configure webhooks" section, enter:
- **Callback URL**: Your public HTTPS endpoint + `/instagram/webhook`
- **Verify Token**: Your chosen verify token
Expand Down Expand Up @@ -114,6 +115,30 @@ RESTAPI.run([handler])
- `messaging_reads`: Read receipts (logged only)
- `messaging_reactions`: Message reactions (logged only)

## Multi-Modal Support (Images & Files)

The Instagram integration can receive images and files sent by users and pass them to the agent for analysis:

**Supported File Types:**

- **Images**: JPEG, PNG, GIF, WebP
- **Documents**: PDF, Word (.docx), Excel (.xlsx), PowerPoint (.pptx), Text files
- **Media**: Audio and video files

**How It Works:**

1. User sends message with attachment (image or file)
2. Handler downloads attachment from Instagram
3. File is validated against size limit (default 2 MB, configurable)
4. File is base64-encoded for transmission to AI agent
5. Agent analyzes and responds with insights

**File Size Limits:**

- Default maximum: **2 MB per file**
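- The handler checks the limit against the raw downloaded file size, before base64 encoding; encoding then increases the payload passed to the agent by ~33% (a 2 MB file becomes ~2.7 MB)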
- Configurable via `api.max_file_size` in config.yaml

## Character Limits

Instagram DM messages have a 1000 character limit. Long responses are automatically split into multiple messages.
Expand All @@ -123,6 +148,7 @@ Instagram DM messages have a 1000 character limit. Long responses are automatica
### 401 Unauthorized / Cannot Parse Access Token

This is the most common issue. It occurs when:

- Using a Facebook Page Access Token instead of Instagram User Access Token
- Token has expired (tokens are valid for 60 days)
- Token doesn't have required permissions
Expand Down Expand Up @@ -150,4 +176,4 @@ This is the most common issue. It occurs when:
## Resources

- [Instagram API with Instagram Login](https://developers.facebook.com/docs/instagram-platform/instagram-api-with-instagram-login)
- [Webhook Setup Guide](https://developers.facebook.com/docs/instagram-platform/webhooks)
- [Webhook Setup Guide](https://developers.facebook.com/docs/instagram-platform/webhooks)
117 changes: 106 additions & 11 deletions ak-py/src/agentkernel/integration/instagram/instagram_chat.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
import base64
import hashlib
import hmac
import logging
import mimetypes
Copy link

Copilot AI Dec 26, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The mimetypes module is imported but never used in this file. Consider removing this unused import.

Suggested change
import mimetypes

Copilot uses AI. Check for mistakes.
import traceback

import httpx
from fastapi import APIRouter, HTTPException, Request

from ...api import RESTRequestHandler
from ...core import AgentService, Config
from ...core.model import AgentRequestFile, AgentRequestImage, AgentRequestText


class AgentInstagramRequestHandler(RESTRequestHandler):
Expand Down Expand Up @@ -35,6 +38,7 @@ def __init__(self):
self._app_secret = Config.get().instagram.app_secret
self._instagram_account_id = Config.get().instagram.instagram_account_id
self._api_version = Config.get().instagram.api_version or "v21.0"
self._max_file_size = Config.get().api.max_file_size
# Use graph.instagram.com for Business Login for Instagram (without Facebook)
self._base_url = f"https://graph.instagram.com/{self._api_version}"
if not all([self._access_token, self._verify_token]):
Expand Down Expand Up @@ -151,7 +155,8 @@ async def _handle_message(self, messaging_event: dict):
sender_id = messaging_event.get("sender", {}).get("id")
message = messaging_event.get("message", {})
message_id = message.get("mid")
message_text = message.get("text")
message_text = message.get("text", "").strip()
attachments = message.get("attachments", [])

if not sender_id or not message_id:
self._log.warning("Message missing required fields (sender/mid)")
Expand All @@ -162,13 +167,15 @@ async def _handle_message(self, messaging_event: dict):
self._log.debug(f"Skipping echo message {message_id}")
return

# Skip messages with attachments that don't have text
if not message_text:
self._log.warning("Message has no text content")
# Skip if no text and no attachments
if not message_text and not attachments:
self._log.warning("Message has no text content or attachments")
return

self._log.debug(f"Processing message {message_id} from {sender_id}: {message_text}")
await self._process_agent_message(sender_id, message_text)
self._log.debug(
f"Processing message {message_id} from {sender_id}: text='{message_text}', attachments={len(attachments)}"
)
await self._process_agent_message(sender_id, message_text, attachments)

async def _handle_postback(self, messaging_event: dict):
"""
Expand All @@ -195,12 +202,13 @@ async def _handle_postback(self, messaging_event: dict):
self._log.debug(f"Processing postback from {sender_id}: {message_text}")
await self._process_agent_message(sender_id, message_text)

async def _process_agent_message(self, sender_id: str, message_text: str):
async def _process_agent_message(self, sender_id: str, message_text: str, attachments: list = None):
"""
Process a message through the agent and send the response.

:param sender_id: Instagram-scoped user ID
:param message_text: The message text to process
:param attachments: Optional list of attachments
"""
service = AgentService()
session_id = sender_id # Use sender_id as session_id to maintain conversation context
Expand All @@ -220,13 +228,35 @@ async def _process_agent_message(self, sender_id: str, message_text: str):
await self._send_typing_indicator(sender_id, False)
return

# Build requests list with text and attachments
requests = []

# Add text if present
if message_text:
requests.append(AgentRequestText(text=message_text))

# Process attachments (images and files)
if attachments:
for attachment in attachments:
await self._process_attachment(attachment, requests)

# Run the agent
result = await service.run(message_text)
if requests:
# Use run_multi for multimodal requests
if len(requests) > 1 or any(isinstance(r, (AgentRequestFile, AgentRequestImage)) for r in requests):
result = await service.run_multi(requests=requests)
else:
result = await service.run(message_text) if message_text else None
Comment on lines +246 to +249
Copy link

Copilot AI Dec 26, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The logic on lines 246-249 has the same issue as in the Messenger integration. When there's only one request and it's an AgentRequestFile or AgentRequestImage (no text), the code will try to call service.run(message_text) where message_text is an empty string. Consider using run_multi for all cases where attachments are present.

Copilot uses AI. Check for mistakes.
else:
result = None

if hasattr(result, "raw"):
response_text = str(result.raw)
if result:
if hasattr(result, "raw"):
response_text = str(result.raw)
else:
response_text = str(result)
else:
response_text = str(result)
response_text = "Sorry, I could not process your message."

self._log.debug(f"Agent response: {response_text}")

Expand All @@ -239,6 +269,71 @@ async def _process_agent_message(self, sender_id: str, message_text: str):
await self._send_typing_indicator(sender_id, False)
await self._send_message(sender_id, "Sorry, there was an error processing your request.")

async def _process_attachment(self, attachment: dict, requests: list) -> None:
    """
    Process an Instagram attachment (image or file).

    Downloads the attachment from its payload URL, rejects it if it is larger
    than ``self._max_file_size``, base64-encodes the content and appends either
    an ``AgentRequestImage`` or an ``AgentRequestFile`` to ``requests``.

    This is best-effort: a missing URL, an oversized file, or any error while
    downloading/encoding is logged and the attachment is skipped, leaving
    ``requests`` unchanged.

    :param attachment: Attachment object from message
    :param requests: List to append the processed request to (mutated in place)
    """
    # Local import keeps this method self-contained; only the URL path parser is needed.
    from urllib.parse import urlsplit

    attachment_type = attachment.get("type")
    # ``or {}`` also covers an explicit ``"payload": null`` in the webhook body,
    # which ``.get("payload", {})`` would pass through as None.
    payload = attachment.get("payload") or {}
    url = payload.get("url")

    if not url:
        self._log.warning(f"Attachment has no URL: {attachment}")
        return

    try:
        # Download the attachment
        # NOTE(review): the whole body is buffered in memory before the size check
        # below; consider a streamed download if very large attachments are a concern.
        async with httpx.AsyncClient() as client:
            response = await client.get(url, timeout=10.0)
            response.raise_for_status()
            file_data = response.content

        # Check file size (raw bytes, before base64 expansion)
        if len(file_data) > self._max_file_size:
            self._log.warning(
                f"Attachment size ({len(file_data) / (1024 * 1024):.2f} MB) exceeds maximum allowed size of {self._max_file_size / (1024 * 1024):.2f} MB"
            )
            return

        # Encode to base64
        file_data_base64 = base64.b64encode(file_data).decode("utf-8")

        # Get MIME type. The Content-Type header may carry parameters
        # (e.g. "image/jpeg; charset=binary"); keep only the bare media type.
        content_type = response.headers.get("content-type") or ""
        mime_type = content_type.split(";", 1)[0].strip().lower() or "application/octet-stream"

        # Extract filename from the URL *path* only, so that "/" or "?" characters
        # inside the query string or fragment cannot be mistaken for the file name.
        filename = urlsplit(url).path.rsplit("/", 1)[-1] or f"attachment_{len(requests)}"

        self._log.debug(
            f"Downloaded {attachment_type} attachment: {filename} (size: {len(file_data)} bytes, type: {mime_type})"
        )

        # Classify based on attachment type and MIME type
        if attachment_type == "image" or mime_type.startswith("image/"):
            self._log.debug(f"Adding image: {filename}")
            requests.append(
                AgentRequestImage(
                    image_data=file_data_base64,
                    name=filename,
                    mime_type=mime_type,
                )
            )
        else:
            self._log.debug(f"Adding file: {filename}")
            requests.append(
                AgentRequestFile(
                    file_data=file_data_base64,
                    name=filename,
                    mime_type=mime_type,
                )
            )

    except Exception as e:
        # Deliberate best-effort boundary: one bad attachment must not abort the
        # whole message, so log with traceback and move on.
        self._log.error(f"Error processing attachment: {e}\n{traceback.format_exc()}")

async def _send_message(self, recipient_id: str, text: str):
"""
Send an Instagram message using the Instagram Messaging API.
Expand Down
Loading
Loading