yaalalabs · pulinduvidmal · Dec 19, 2025 · Dec 23, 2025 · Dec 23, 2025 · Dec 23, 2025
diff --git a/ak-py/src/agentkernel/framework/openai/openai.py b/ak-py/src/agentkernel/framework/openai/openai.py
@@ -169,6 +169,14 @@ async def run(self, agent: Any, session: Session, requests: list[AgentRequest])
                 # Multimodal case with images/files. When using multimodal inputs, OpenAI cannot handle session. So these inputs are not saved in the context
                 reply = (await Runner.run(agent.agent, message_content, session=None)).final_output
 
+                # Manually save the multimodal conversation to session for future reference
+                if session:
+                    openai_session = session.get("openai") or session.set("openai", OpenAISession())
+                    # Add user message
+                    await openai_session.add_items([{"role": "user", "content": prompt}])
+                    # Add assistant response
+                    await openai_session.add_items([{"role": "assistant", "content": reply}])
-                # Multimodal case with images/files. When using multimodal inputs, OpenAI cannot handle session. So these inputs are not saved in the context
-                reply = (await Runner.run(agent.agent, message_content, session=None)).final_output
-
-                # Manually save the multimodal conversation to session for future reference
-                if session:
-                    openai_session = session.get("openai") or session.set("openai", OpenAISession())
-                    # Add user message
-                    await openai_session.add_items([{"role": "user", "content": prompt}])
-                    # Add assistant response
-                    await openai_session.add_items([{"role": "assistant", "content": reply}])
+                # Multimodal case with images/files.
+                # NOTE: When using multimodal inputs, the underlying OpenAI SDK cannot reliably handle
+                # session state in the same way as text-only interactions. To avoid inconsistent behavior,
+                # we invoke the agent with `session=None` here, and then (best-effort) persist a simplified
+                # text representation of the interaction into the session for downstream consumers that
+                # rely on conversational history.
+                #
+                # Limitations:
+                # - Only the textual `prompt` and the assistant `reply` are stored; non-text multimodal
+                #   content (e.g., images, files) is not preserved in the session history.
+                # - The stored context may therefore diverge from the true multimodal context seen by
+                #   the model at generation time.
+                reply = (await Runner.run(agent.agent, message_content, session=None)).final_output
+
+                # Manually save the multimodal conversation to the OpenAI session for future reference,
+                # using the same session helper that is used in the text-only path.
+                if session:
+                    openai_session = self._session(session)
+                    if isinstance(openai_session, SessionABC):
+                        # Add user message (text-only prompt)
+                        await openai_session.add_items([{"role": "user", "content": prompt}])
+                        # Add assistant response
+                        await openai_session.add_items([{"role": "assistant", "content": reply}])
-                # Multimodal case with images/files. When using multimodal inputs, OpenAI cannot handle session. So these inputs are not saved in the context
-                reply = (await Runner.run(agent.agent, message_content, session=None)).final_output
-
-                # Manually save the multimodal conversation to session for future reference
-                if session:
-                    openai_session = session.get("openai") or session.set("openai", OpenAISession())
-                    # Add user message
-                    await openai_session.add_items([{"role": "user", "content": prompt}])
-                    # Add assistant response
-                    await openai_session.add_items([{"role": "assistant", "content": reply}])
+                # Multimodal case with images/files.
+                # NOTE: When using multimodal inputs, the underlying OpenAI SDK cannot reliably handle
+                # session state in the same way as text-only interactions. To avoid inconsistent behavior,
+                # we invoke the agent with `session=None` here, and then (best-effort) persist a simplified
+                # text representation of the interaction into the session for downstream consumers that
+                # rely on conversational history.
+                #
+                # Limitations:
+                # - Only the textual `prompt` and the assistant `reply` are stored; non-text multimodal
+                #   content (e.g., images, files) is not preserved in the session history.
+                # - The stored context may therefore diverge from the true multimodal context seen by
+                #   the model at generation time.
+                reply = (await Runner.run(agent.agent, message_content, session=None)).final_output
+
+                # Manually save the multimodal conversation to the OpenAI session for future reference,
+                # using the same session helper that is used in the text-only path.
+                if session:
+                    openai_session = self._session(session)
+                    if isinstance(openai_session, SessionABC):
+                        # Add user message (text-only prompt)
+                        await openai_session.add_items([{"role": "user", "content": prompt}])
+                        # Add assistant response
+                        await openai_session.add_items([{"role": "assistant", "content": reply}])
+
             return AgentReplyText(text=str(reply), prompt=prompt)
         except Exception as e:
             return AgentReplyText(text=f"Error during agent execution: {str(e)}")

diff --git a/ak-py/src/agentkernel/integration/instagram/README.md b/ak-py/src/agentkernel/integration/instagram/README.md
@@ -33,29 +33,30 @@ This requires an **Instagram User Access Token** (starts with `IGAA...`).
 ### Configuration Steps
 
 1. **Create a Meta App**
+
    - Go to https://developers.facebook.com/apps
    - Create a new app and select "Business" type
    - Add **Instagram API** product (with Business Login)
-
 2. **Set Up Business Login for Instagram**
+
    - In **Use Cases**, select "Instagram Business"
    - Click "API setup with Instagram login"
    - Add required permissions:
      - `instagram_business_basic`
      - `instagram_business_manage_messages`
-
 3. **Generate Access Token**
+
    - Go to "Generate access tokens" section
    - Add your Instagram Professional account
    - Generate a token with the required permissions
    - The token will start with `IGAA...`
-
 4. **Get Your Credentials**
+
    - **Access Token**: Generated from the step above (starts with `IGAA...`)
    - **App Secret**: App > App Settings > Basic (for webhook signature verification)
    - **Verify Token**: Create your own secure random string for webhook verification
-
 5. **Configure Webhook**
+
    - In "Configure webhooks" section, enter:
      - **Callback URL**: Your public HTTPS endpoint + `/instagram/webhook`
      - **Verify Token**: Your chosen verify token
@@ -114,6 +115,30 @@ RESTAPI.run([handler])
 - `messaging_reads`: Read receipts (logged only)
 - `messaging_reactions`: Message reactions (logged only)
 
+## Multi-Modal Support (Images & Files)
+
+The Instagram integration supports receiving and analyzing images and files that users send as message attachments:
+
+**Supported File Types:**
+
+- **Images**: JPEG, PNG, GIF, WebP
+- **Documents**: PDF, Word (.docx), Excel (.xlsx), PowerPoint (.pptx), Text files
+- **Media**: Audio and video files
+
+**How It Works:**
+
+1. The user sends a message with an attachment (image or file)
+2. The handler downloads the attachment from Instagram
+3. The file is validated against the size limit (default 2 MB, configurable)
+4. The file is base64-encoded for transmission to the AI agent
+5. The agent analyzes the content and responds with insights
+
+**File Size Limits:**
+
+- Default maximum: **2 MB per file**
+- The limit is checked against the raw downloaded size; base64 encoding then increases the payload passed to the agent by ~33% (a 2 MB file becomes roughly 2.7 MB)
+- Configurable via `api.max_file_size` in config.yaml
+
 ## Character Limits
 
 Instagram DM messages have a 1000 character limit. Long responses are automatically split into multiple messages.
@@ -123,6 +148,7 @@ Instagram DM messages have a 1000 character limit. Long responses are automatica
 ### 401 Unauthorized / Cannot Parse Access Token
 
 This is the most common issue. It occurs when:
+
 - Using a Facebook Page Access Token instead of Instagram User Access Token
 - Token has expired (tokens are valid for 60 days)
 - Token doesn't have required permissions
@@ -150,4 +176,4 @@ This is the most common issue. It occurs when:
 ## Resources
 
 - [Instagram API with Instagram Login](https://developers.facebook.com/docs/instagram-platform/instagram-api-with-instagram-login)
-- [Webhook Setup Guide](https://developers.facebook.com/docs/instagram-platform/webhooks)
+- [Webhook Setup Guide](https://developers.facebook.com/docs/instagram-platform/webhooks)
diff --git a/ak-py/src/agentkernel/integration/instagram/instagram_chat.py b/ak-py/src/agentkernel/integration/instagram/instagram_chat.py
@@ -1,13 +1,16 @@
+import base64
 import hashlib
 import hmac
 import logging
+import mimetypes
-import mimetypes
-import mimetypes
 import traceback
 
 import httpx
 from fastapi import APIRouter, HTTPException, Request
 
 from ...api import RESTRequestHandler
 from ...core import AgentService, Config
+from ...core.model import AgentRequestFile, AgentRequestImage, AgentRequestText
 
 
 class AgentInstagramRequestHandler(RESTRequestHandler):
@@ -35,6 +38,7 @@ def __init__(self):
         self._app_secret = Config.get().instagram.app_secret
         self._instagram_account_id = Config.get().instagram.instagram_account_id
         self._api_version = Config.get().instagram.api_version or "v21.0"
+        self._max_file_size = Config.get().api.max_file_size
         # Use graph.instagram.com for Business Login for Instagram (without Facebook)
         self._base_url = f"https://graph.instagram.com/{self._api_version}"
         if not all([self._access_token, self._verify_token]):
@@ -151,7 +155,8 @@ async def _handle_message(self, messaging_event: dict):
         sender_id = messaging_event.get("sender", {}).get("id")
         message = messaging_event.get("message", {})
         message_id = message.get("mid")
-        message_text = message.get("text")
+        message_text = message.get("text", "").strip()
+        attachments = message.get("attachments", [])
 
         if not sender_id or not message_id:
             self._log.warning("Message missing required fields (sender/mid)")
@@ -162,13 +167,15 @@ async def _handle_message(self, messaging_event: dict):
             self._log.debug(f"Skipping echo message {message_id}")
             return
 
-        # Skip messages with attachments that don't have text
-        if not message_text:
-            self._log.warning("Message has no text content")
+        # Skip if no text and no attachments
+        if not message_text and not attachments:
+            self._log.warning("Message has no text content or attachments")
             return
 
-        self._log.debug(f"Processing message {message_id} from {sender_id}: {message_text}")
-        await self._process_agent_message(sender_id, message_text)
+        self._log.debug(
+            f"Processing message {message_id} from {sender_id}: text='{message_text}', attachments={len(attachments)}"
+        )
+        await self._process_agent_message(sender_id, message_text, attachments)
 
     async def _handle_postback(self, messaging_event: dict):
         """
@@ -195,12 +202,13 @@ async def _handle_postback(self, messaging_event: dict):
         self._log.debug(f"Processing postback from {sender_id}: {message_text}")
         await self._process_agent_message(sender_id, message_text)
 
-    async def _process_agent_message(self, sender_id: str, message_text: str):
+    async def _process_agent_message(self, sender_id: str, message_text: str, attachments: list = None):
         """
         Process a message through the agent and send the response.
 
         :param sender_id: Instagram-scoped user ID
         :param message_text: The message text to process
+        :param attachments: Optional list of attachments
         """
         service = AgentService()
         session_id = sender_id  # Use sender_id as session_id to maintain conversation context
@@ -220,13 +228,35 @@ async def _process_agent_message(self, sender_id: str, message_text: str):
                 await self._send_typing_indicator(sender_id, False)
                 return
 
+            # Build requests list with text and attachments
+            requests = []
+
+            # Add text if present
+            if message_text:
+                requests.append(AgentRequestText(text=message_text))
+
+            # Process attachments (images and files)
+            if attachments:
+                for attachment in attachments:
+                    await self._process_attachment(attachment, requests)
+
             # Run the agent
-            result = await service.run(message_text)
+            if requests:
+                # Use run_multi for multimodal requests
+                if len(requests) > 1 or any(isinstance(r, (AgentRequestFile, AgentRequestImage)) for r in requests):
+                    result = await service.run_multi(requests=requests)
+                else:
+                    result = await service.run(message_text) if message_text else None
+            else:
+                result = None
 
-            if hasattr(result, "raw"):
-                response_text = str(result.raw)
+            if result:
+                if hasattr(result, "raw"):
+                    response_text = str(result.raw)
+                else:
+                    response_text = str(result)
             else:
-                response_text = str(result)
+                response_text = "Sorry, I could not process your message."
 
             self._log.debug(f"Agent response: {response_text}")
 
@@ -239,6 +269,71 @@ async def _process_agent_message(self, sender_id: str, message_text: str):
             await self._send_typing_indicator(sender_id, False)
             await self._send_message(sender_id, "Sorry, there was an error processing your request.")
 
+    async def _process_attachment(self, attachment: dict, requests: list):
+        """
+        Process an Instagram attachment (image or file).
+
+        :param attachment: Attachment object from message
+        :param requests: List to append the processed request to
+        """
+        attachment_type = attachment.get("type")
+        payload = attachment.get("payload", {})
+        url = payload.get("url")
+
+        if not url:
+            self._log.warning(f"Attachment has no URL: {attachment}")
+            return
+
+        try:
+            # Download the attachment
+            async with httpx.AsyncClient() as client:
+                response = await client.get(url, timeout=10.0)
+                response.raise_for_status()
+                file_data = response.content
+
+            # Check file size
+            if len(file_data) > self._max_file_size:
+                self._log.warning(
+                    f"Attachment size ({len(file_data) / (1024 * 1024):.2f} MB) exceeds maximum allowed size of {self._max_file_size / (1024 * 1024):.2f} MB"
+                )
+                return
+
+            # Encode to base64
+            file_data_base64 = base64.b64encode(file_data).decode("utf-8")
+
+            # Get MIME type
+            mime_type = response.headers.get("content-type", "application/octet-stream")
+
+            # Extract filename from URL if available
+            filename = url.split("/")[-1].split("?")[0] or f"attachment_{len(requests)}"
+
+            self._log.debug(
+                f"Downloaded {attachment_type} attachment: {filename} (size: {len(file_data)} bytes, type: {mime_type})"
+            )
+
+            # Classify based on attachment type and MIME type
+            if attachment_type == "image" or (mime_type and mime_type.startswith("image/")):
+                self._log.debug(f"Adding image: {filename}")
+                requests.append(
+                    AgentRequestImage(
+                        image_data=file_data_base64,
+                        name=filename,
+                        mime_type=mime_type,
+                    )
+                )
+            else:
+                self._log.debug(f"Adding file: {filename}")
+                requests.append(
+                    AgentRequestFile(
+                        file_data=file_data_base64,
+                        name=filename,
+                        mime_type=mime_type,
+                    )
+                )
+
+        except Exception as e:
+            self._log.error(f"Error processing attachment: {e}\n{traceback.format_exc()}")
+
     async def _send_message(self, recipient_id: str, text: str):
         """
         Send an Instagram message using the Instagram Messaging API.