diff --git a/.gitmodules b/.gitmodules
index c6602c8..65f0d07 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -10,3 +10,6 @@
 [submodule "esp32-s3-box-3/components/esp-protocols"]
 	path = esp32-s3-box-3/components/esp-protocols
 	url = https://github.com/espressif/esp-protocols.git
+[submodule "esp32-s3-box-3/esp-adf"]
+	path = esp32-s3-box-3/esp-adf
+	url = https://github.com/espressif/esp-adf.git
diff --git a/esp32-s3-box-3/CMakeLists.txt b/esp32-s3-box-3/CMakeLists.txt
index fbba559..ae931c7 100644
--- a/esp32-s3-box-3/CMakeLists.txt
+++ b/esp32-s3-box-3/CMakeLists.txt
@@ -7,6 +7,11 @@ if(NOT IDF_TARGET STREQUAL linux)
 
   add_compile_definitions(WIFI_SSID="$ENV{WIFI_SSID}")
   add_compile_definitions(WIFI_PASSWORD="$ENV{WIFI_PASSWORD}")
+
+  # We are not using video in this app, so set the max NALU size to very small
+  if(NOT DEFINED ENV{CONFIG_MAX_NALU_SIZE})
+    add_compile_definitions(CONFIG_MAX_NALU_SIZE=32)
+  endif()
 endif()
 
 if(NOT DEFINED ENV{PIPECAT_SMALLWEBRTC_URL})
@@ -20,7 +25,7 @@ endif()
 add_compile_definitions(PIPECAT_SMALLWEBRTC_URL="$ENV{PIPECAT_SMALLWEBRTC_URL}")
 
 set(COMPONENTS src)
-set(EXTRA_COMPONENT_DIRS "src" "components/srtp" "components/peer" "components/esp-libopus")
+set(EXTRA_COMPONENT_DIRS "src" "components/srtp" "components/peer" "components/esp-libopus" "esp-adf/components")
 
 if(IDF_TARGET STREQUAL linux)
   add_compile_definitions(LINUX_BUILD=1)
diff --git a/esp32-s3-box-3/dependencies.lock b/esp32-s3-box-3/dependencies.lock
index e6c6da2..d533402 100644
--- a/esp32-s3-box-3/dependencies.lock
+++ b/esp32-s3-box-3/dependencies.lock
@@ -1,4 +1,10 @@
 dependencies:
+  esp_codec_dev:
+    dependencies: []
+    source:
+      path: /home/khkramer/src/pipecat-esp32/esp32-s3-box-3/esp-adf/components/esp_codec_dev
+      type: local
+    version: 1.3.5
   espressif/button:
     component_hash: f53face2ab21fa0ffaf4cf0f6e513d393f56df6586bb2ad1146120f03f19ee05
     dependencies:
@@ -63,16 +69,16 @@ dependencies:
     targets:
     - esp32s3
     version: 3.0.0~1
-  espressif/esp_codec_dev:
-    component_hash: c71e2d13dad6fc41561590dd88dbc45c79e3f4ef48d5ee3575c60e8b6c8e79d5
+  espressif/esp-dsp:
+    component_hash: 619639efc18cfa361a9e423739b9b0ffc14991effc6c027f955c2f2c3bf1754b
     dependencies:
     - name: idf
       require: private
-      version: '>=4.0'
+      version: '>=4.2'
     source:
-      registry_url: https://components.espressif.com
+      registry_url: https://components.espressif.com/
       type: service
-    version: 1.3.5
+    version: 1.6.0
   espressif/esp_lcd_ili9341:
     component_hash: 31f1b793aa2110dd2ae071c21ccbff0a4eb20d9a4ee40b6294c0dc0ad9552c4e
     dependencies:
@@ -84,7 +90,7 @@ dependencies:
       require: private
       version: 0.*
     source:
-      registry_url: https://components.espressif.com
+      registry_url: https://components.espressif.com/
       type: service
     version: 1.2.0
   espressif/esp_lcd_touch:
@@ -139,6 +145,16 @@ dependencies:
       registry_url: https://components.espressif.com
       type: service
     version: 2.6.0
+  espressif/esp_websocket_client:
+    component_hash: f77326f0e1c38da4e9c97e17c5d649b0dd13027f2645e720e48db269638fd622
+    dependencies:
+    - name: idf
+      require: private
+      version: '>=5.0'
+    source:
+      registry_url: https://components.espressif.com/
+      type: service
+    version: 1.4.0
   espressif/icm42670:
     component_hash: 28b56e174f75c70037f5208aaed6c3789f0d243500d975519584bf9dc8c0836c
     dependencies:
@@ -149,6 +165,36 @@ dependencies:
       registry_url: https://components.espressif.com
       type: service
     version: 2.0.2
+  espressif/jsmn:
+    component_hash: d80350c41bbaa827c98a25b6072df00884e72f54885996fab4a4f0aebce6b6c3
+    dependencies:
+    - name: idf
+      require: private
+      version: '>=4.3'
+    source:
+      registry_url: https://components.espressif.com/
+      type: service
+    version: 1.1.0
+  espressif/nghttp:
+    component_hash: 32e513409ddce0000990ec74b0711f535462726236d72578cd70498e6bd66dc3
+    dependencies:
+    - name: idf
+      require: private
+      version: '>=5.0'
+    source:
+      registry_url: https://components.espressif.com/
+      type: service
+    version: 1.65.0
+  espressif/zlib:
+    component_hash: d901723af51f13fc8e5824f39f32239c847956e8fd951a05266588dc5cfbb9ae
+    dependencies:
+    - name: idf
+      require: private
+      version: '>=4.4'
+    source:
+      registry_url: https://components.espressif.com/
+      type: service
+    version: 1.3.1
   idf:
     source:
       type: idf
@@ -162,7 +208,13 @@ dependencies:
     version: 9.3.0
 direct_dependencies:
 - espressif/esp-box-3
+- espressif/esp-dsp
+- espressif/esp_lcd_ili9341
+- espressif/esp_websocket_client
+- espressif/jsmn
+- espressif/nghttp
+- espressif/zlib
 - idf
-manifest_hash: 55f6a702498de8e87201da504a38c5c958e2094ec832c4f8d6d368369d5d0625
+manifest_hash: 1891dc87058c6eec7aa77f286a1967ebe7c293709db8da5b2f1e79d2e45f57ef
 target: esp32s3
 version: 2.0.0
diff --git a/esp32-s3-box-3/esp-adf b/esp32-s3-box-3/esp-adf
new file mode 160000
index 0000000..1e839af
--- /dev/null
+++ b/esp32-s3-box-3/esp-adf
@@ -0,0 +1 @@
+Subproject commit 1e839afb5a5166aad6028ac2796272f32454dced
diff --git a/esp32-s3-box-3/partitions.csv b/esp32-s3-box-3/partitions.csv
index 03d5d13..6bfa8e4 100644
--- a/esp32-s3-box-3/partitions.csv
+++ b/esp32-s3-box-3/partitions.csv
@@ -2,5 +2,5 @@
 # Name,   Type, SubType, Offset,  Size, Flags
 nvs,      data, nvs,     0x9000,  0x6000,
 phy_init, data, phy,     0xf000,  0x1000,
-factory,  app,  factory, 0x10000, 0x180000,
+factory,  app,  factory, 0x10000, 0x200000,
 
diff --git a/esp32-s3-box-3/sdkconfig.defaults b/esp32-s3-box-3/sdkconfig.defaults
index 236be55..9abc65a 100644
--- a/esp32-s3-box-3/sdkconfig.defaults
+++ b/esp32-s3-box-3/sdkconfig.defaults
@@ -18,6 +18,9 @@ CONFIG_ESP_MAIN_TASK_STACK_SIZE=16384
 # Defaults to partitions.csv
 CONFIG_PARTITION_TABLE_CUSTOM=y
 
+# Set flash size to 16MB for ESP32-S3-Box-3
+CONFIG_ESPTOOLPY_FLASHSIZE_16MB=y
+
 # Set highest CPU Freq
 CONFIG_ESP_DEFAULT_CPU_FREQ_MHZ_240=y
 
@@ -33,3 +36,10 @@ CONFIG_COMPILER_OPTIMIZATION_PERF=y
 CONFIG_COMPILER_OPTIMIZATION_ASSERTIONS_DISABLE=y
 
 CONFIG_CODEC_I2C_BACKWARD_COMPATIBLE=n
+
+# Use C library memory allocator for LVGL to save 64KB of BSS
+# This removes the 64KB work_mem_int buffer from BSS
+CONFIG_LV_USE_BUILTIN_MALLOC=n
+CONFIG_LV_USE_CLIB_MALLOC=y
+CONFIG_BSP_LCD_DRAW_BUF_HEIGHT=10
+CONFIG_BSP_LCD_DRAW_BUF_DOUBLE=n
diff --git a/esp32-s3-box-3/src/CMakeLists.txt b/esp32-s3-box-3/src/CMakeLists.txt
index da4e1e7..cc4ab61 100644
--- a/esp32-s3-box-3/src/CMakeLists.txt
+++ b/esp32-s3-box-3/src/CMakeLists.txt
@@ -7,7 +7,7 @@ if(IDF_TARGET STREQUAL linux)
 else()
 	idf_component_register(
 		SRCS ${COMMON_SRC} "wifi.cpp" "media.cpp" "rtvi.cpp" "rtvi_callbacks.cpp" "screen.cpp"
-		REQUIRES driver esp_wifi nvs_flash peer esp_psram esp-libopus esp_http_client json lvgl)
+		REQUIRES driver esp_wifi nvs_flash peer esp_psram esp-libopus esp_http_client json lvgl esp-sr)
 endif()
 
 idf_component_get_property(lib peer COMPONENT_LIB)
diff --git a/esp32-s3-box-3/src/main.cpp b/esp32-s3-box-3/src/main.cpp
index 8bc08f1..2d18090 100644
--- a/esp32-s3-box-3/src/main.cpp
+++ b/esp32-s3-box-3/src/main.cpp
@@ -1,5 +1,6 @@
 #include "main.h"
 
+#include <inttypes.h>
 #include <esp_event.h>
 #include <esp_log.h>
 #include <peer.h>
@@ -7,27 +8,49 @@
 #ifndef LINUX_BUILD
 #include "nvs_flash.h"
 
+// All operations that modify the webrtc state should use this semaphore to ensure thread safety
+SemaphoreHandle_t webrtcSemaphore = NULL;
+
 extern "C" void app_main(void) {
-  esp_err_t ret = nvs_flash_init();
-  if (ret == ESP_ERR_NVS_NO_FREE_PAGES ||
-      ret == ESP_ERR_NVS_NEW_VERSION_FOUND) {
-    ESP_ERROR_CHECK(nvs_flash_erase());
-    ret = nvs_flash_init();
-  }
-  ESP_ERROR_CHECK(ret);
 
+  ESP_LOGI(LOG_TAG, "[APP] Startup..");
+  ESP_LOGI(LOG_TAG, "[APP] IDF version: %s", esp_get_idf_version());
+  heap_caps_print_heap_info(MALLOC_CAP_INTERNAL);
+
+  ESP_ERROR_CHECK(nvs_flash_init());
   ESP_ERROR_CHECK(esp_event_loop_create_default());
+
+  webrtcSemaphore = xSemaphoreCreateMutex();
+
   pipecat_init_screen();
+  heap_caps_print_heap_info(MALLOC_CAP_INTERNAL);
+
   peer_init();
+  heap_caps_print_heap_info(MALLOC_CAP_INTERNAL);
+
   pipecat_init_audio_capture();
+  heap_caps_print_heap_info(MALLOC_CAP_INTERNAL);
+
   pipecat_init_audio_decoder();
+  heap_caps_print_heap_info(MALLOC_CAP_INTERNAL);
+
+  pipecat_init_aec();
+  heap_caps_print_heap_info(MALLOC_CAP_INTERNAL);
+
   pipecat_init_wifi();
+  heap_caps_print_heap_info(MALLOC_CAP_INTERNAL);
+
   pipecat_init_webrtc();
+  heap_caps_print_heap_info(MALLOC_CAP_INTERNAL);
 
+  ESP_LOGI(LOG_TAG, "Pipecat ESP32 client initialized");
   pipecat_screen_system_log("Pipecat ESP32 client initialized\n");
 
+  ESP_LOGI(LOG_TAG, "Starting webrtc task");
+  pipecat_webrtc_run_task();
+
   while (1) {
-    pipecat_webrtc_loop();
+    // Add some stats printout to the screen here
     vTaskDelay(pdMS_TO_TICKS(TICK_INTERVAL));
   }
 }
@@ -38,6 +61,7 @@ int main(void) {
   pipecat_webrtc();
 
   while (1) {
+    // Todo: test/fix linux build
     pipecat_webrtc_loop();
     vTaskDelay(pdMS_TO_TICKS(TICK_INTERVAL));
   }
diff --git a/esp32-s3-box-3/src/main.h b/esp32-s3-box-3/src/main.h
index 389d7ee..5797b52 100644
--- a/esp32-s3-box-3/src/main.h
+++ b/esp32-s3-box-3/src/main.h
@@ -14,16 +14,19 @@ extern void pipecat_init_audio_decoder();
 extern void pipecat_init_audio_encoder();
 extern void pipecat_send_audio(PeerConnection *peer_connection);
 extern void pipecat_audio_decode(uint8_t *data, size_t size);
+extern void pipecat_init_aec();
 
 // WebRTC / Signalling
 extern void pipecat_init_webrtc();
 extern void pipecat_webrtc_loop();
+extern void pipecat_webrtc_run_task();
 extern void pipecat_http_request(char *offer, char *answer);
 
 // RTVI
 typedef struct {
   void (*on_bot_started_speaking)();
   void (*on_bot_stopped_speaking)();
+
   void (*on_bot_tts_text)(const char *text);
 } rtvi_callbacks_t;
 
diff --git a/esp32-s3-box-3/src/media.cpp b/esp32-s3-box-3/src/media.cpp
index ba95185..989d2af 100644
--- a/esp32-s3-box-3/src/media.cpp
+++ b/esp32-s3-box-3/src/media.cpp
@@ -3,28 +3,32 @@
 
 #include <atomic>
 #include <cstring>
+#include <esp_timer.h>
+#include <freertos/ringbuf.h>
 
 #include "esp_log.h"
 #include "main.h"
 
+#include "esp_afe_aec.h"
+
+
 #define SAMPLE_RATE (16000)
 
 #define OPUS_BUFFER_SIZE 1276  // 1276 bytes is recommended by opus_encode
-#define PCM_BUFFER_SIZE 640
+#define AEC_CHUNK_SIZE 256  // AEC processes 256 samples at a time for 16kHz
+#define OPUS_FRAME_SIZE 320  // Opus frame size in samples
+#define COMMON_BUFFER_SAMPLES 1280  // LCM(256, 320) = 1280 samples
+#define AEC_CHUNKS_TO_BATCH (COMMON_BUFFER_SAMPLES / AEC_CHUNK_SIZE)  // 5 chunks
+#define OPUS_FRAMES_TO_BATCH (COMMON_BUFFER_SAMPLES / OPUS_FRAME_SIZE)  // 4 frames
+#define PCM_BUFFER_SIZE (COMMON_BUFFER_SAMPLES * sizeof(int16_t))  // 2560 bytes
 
 #define OPUS_ENCODER_BITRATE 30000
 #define OPUS_ENCODER_COMPLEXITY 0
 
+// AEC control
+static bool aec_enabled = true;  // Can be set to false to bypass AEC
+
 std::atomic<bool> is_playing = false;
-void set_is_playing(int16_t *in_buf, size_t in_samples) {
-  bool any_set = false;
-  for (size_t i = 0; i < in_samples; i++) {
-    if (in_buf[i] != -1 && in_buf[i] != 0 && in_buf[i] != 1) {
-      any_set = true;
-    }
-  }
-  is_playing = any_set;
-}
 
 esp_codec_dev_handle_t mic_codec_dev = NULL;
 esp_codec_dev_handle_t spk_codec_dev = NULL;
@@ -48,6 +52,39 @@ void pipecat_init_audio_capture() {
 opus_int16 *decoder_buffer = NULL;
 OpusDecoder *opus_decoder = NULL;
 
+// Circular buffer for speaker reference data (not using FreeRTOS ring buffer)
+// This is a sliding window buffer, not a queue - we read recent data for AEC
+static int16_t speaker_circular_buffer[960];   // 3 Opus frames (60ms @ 16kHz)
+static volatile size_t speaker_write_idx = 0;
+static volatile size_t speaker_read_idx = 0;
+static volatile size_t speaker_buffer_count = 0;  // Number of samples in buffer
+static volatile bool speaker_buffer_active = false; // True when we're receiving audio
+static portMUX_TYPE speaker_buffer_mutex = portMUX_INITIALIZER_UNLOCKED;
+static volatile uint64_t last_buffer_read_time = 0;  // Track when buffer was last read
+
+// Function to update playback state and reset buffer when playback stops
+void set_is_playing(int16_t *in_buf, size_t in_samples) {
+  bool any_set = false;
+  for (size_t i = 0; i < in_samples; i++) {
+    if (in_buf[i] != -1 && in_buf[i] != 0 && in_buf[i] != 1) {
+      any_set = true;
+    }
+  }
+  bool was_playing = is_playing.load();
+  is_playing = any_set;
+  
+  // Reset buffer state when playback stops
+  if (was_playing && !any_set) {
+    portENTER_CRITICAL(&speaker_buffer_mutex);
+    speaker_buffer_active = false;
+    speaker_read_idx = speaker_write_idx; // Reset read to match write
+    speaker_buffer_count = 0;
+    last_buffer_read_time = 0;  // Reset read time
+    portEXIT_CRITICAL(&speaker_buffer_mutex);
+    ESP_LOGD(LOG_TAG, "Playback stopped, reset speaker buffer");
+  }
+}
+
 void pipecat_init_audio_decoder() {
   int decoder_error = 0;
   opus_decoder = opus_decoder_create(SAMPLE_RATE, 1, &decoder_error);
@@ -60,12 +97,53 @@ void pipecat_init_audio_decoder() {
 }
 
 void pipecat_audio_decode(uint8_t *data, size_t size) {
-  esp_err_t ret;
-  int decoded_size =
-      opus_decode(opus_decoder, data, size, decoder_buffer, PCM_BUFFER_SIZE, 0);
+  uint64_t now = esp_timer_get_time();
+  
+  // Decode Opus frame to get the size (or use fixed size if predictable)
+  int decoded_size = opus_decode(opus_decoder, data, size, decoder_buffer, PCM_BUFFER_SIZE, 0);
 
   if (decoded_size > 0) {
+    // FIRST: Write to circular buffer for AEC reference (before any other processing)
+    portENTER_CRITICAL(&speaker_buffer_mutex);
+      
+      // Write to circular buffer
+      size_t space_available = 960 - speaker_buffer_count;
+      size_t samples_to_write = decoded_size;
+      
+      // If we don't have enough space, drop oldest samples
+      if (samples_to_write > space_available) {
+        size_t samples_to_drop = samples_to_write - space_available;
+        speaker_read_idx = (speaker_read_idx + samples_to_drop) % 960;
+        speaker_buffer_count -= samples_to_drop;
+      }
+      
+      // Write new samples
+      for (int i = 0; i < samples_to_write; i++) {
+        speaker_circular_buffer[speaker_write_idx] = decoder_buffer[i];
+        speaker_write_idx = (speaker_write_idx + 1) % 960;
+      }
+      speaker_buffer_count += samples_to_write;
+      speaker_buffer_active = true;
+      
+      // Get samples in buffer after write (while still in critical section)
+      size_t samples_in_buffer = speaker_buffer_count;
+      
+      portEXIT_CRITICAL(&speaker_buffer_mutex);
+      
+      // Log timing relative to last read from this buffer
+      uint64_t time_since_last_read = 0;
+      if (last_buffer_read_time > 0 && now > last_buffer_read_time) {
+        time_since_last_read = (now - last_buffer_read_time) / 1000;
+      } else {
+        time_since_last_read = 0;  // First write or invalid timestamp
+      }
+      
+    
+    // THEN: Process for playback
     set_is_playing(decoder_buffer, decoded_size);
+    
+    // Send to speaker
+    esp_err_t ret;
     if ((ret = esp_codec_dev_write(spk_codec_dev, decoder_buffer,
                                    decoded_size * sizeof(uint16_t))) !=
         ESP_OK) {
@@ -78,6 +156,27 @@ OpusEncoder *opus_encoder = NULL;
 uint8_t *encoder_output_buffer = NULL;
 uint8_t *read_buffer = NULL;
 
+// Timing statistics
+static uint64_t total_read_time = 0;
+static uint64_t total_aec_time = 0;
+static uint64_t total_encode_time = 0;
+static uint64_t total_send_time = 0;
+static uint64_t total_wait_time = 0;
+static uint32_t cycle_count = 0;
+static uint32_t aec_skip_count = 0;
+static uint64_t last_frame_send_time = 0;
+
+// Frame rate monitoring
+static uint64_t first_frame_time = 0;
+static uint32_t frames_sent = 0;
+static float current_frame_rate = 0.0;
+static const uint32_t MIN_FRAMES_FOR_MEASUREMENT = 50;  // Measure over 50 frames (~1 second)
+static const float TARGET_FRAME_RATE = 50.0;  // 50 fps (20ms per frame)
+static const float MIN_ACCEPTABLE_FRAME_RATE = 48.0;  // Allow 4% tolerance
+
+// AEC processed buffer
+static int16_t *processed_buffer = NULL;
+
 void pipecat_init_audio_encoder() {
   int encoder_error;
   opus_encoder = opus_encoder_create(SAMPLE_RATE, 1, OPUS_APPLICATION_VOIP,
@@ -100,22 +199,304 @@ void pipecat_init_audio_encoder() {
   read_buffer =
       (uint8_t *)heap_caps_malloc(PCM_BUFFER_SIZE, MALLOC_CAP_DEFAULT);
   encoder_output_buffer = (uint8_t *)malloc(OPUS_BUFFER_SIZE);
+  processed_buffer = (int16_t *)heap_caps_malloc(PCM_BUFFER_SIZE, MALLOC_CAP_DEFAULT);
 }
 
-void pipecat_send_audio(PeerConnection *peer_connection) {
-  if (esp_codec_dev_read(mic_codec_dev, read_buffer, PCM_BUFFER_SIZE) !=
-      ESP_OK) {
-    printf("esp_codec_dev_read failed");
+static int16_t *aec_in_buffer = NULL;
+static int16_t *aec_out_buffer = NULL;
+static afe_aec_handle_t *aec_handle = NULL;
+
+void pipecat_init_aec() {
+  // Try a simpler approach - create AEC without the task configuration struct
+  ESP_LOGI(LOG_TAG, "Initializing AEC");
+  
+  // Speaker circular buffer is now statically allocated
+  ESP_LOGI(LOG_TAG, "Speaker circular buffer ready, size=960 samples");
+  
+  // Create AEC with basic parameters.
+  // Filter_length=4 recommended for ESP32S3, but filter_length=2 and AFE_MODE_LOW_COST seem to be a local
+  // maximum in terms of quality with the current code we have here.
+  aec_handle = afe_aec_create("MR", 2, AFE_TYPE_VC, AFE_MODE_LOW_COST);
+  if (aec_handle == NULL) {
+    ESP_LOGE(LOG_TAG, "Failed to create AEC");
     return;
   }
+  
+  ESP_LOGI(LOG_TAG, "AEC created successfully");
+  
+  int frame_size = afe_aec_get_chunksize(aec_handle);
+  int input_size = frame_size * aec_handle->pcm_config.total_ch_num * sizeof(int16_t);
+  int output_size = frame_size * sizeof(int16_t);
 
-  if (is_playing) {
-    memset(read_buffer, 0, PCM_BUFFER_SIZE);
+  // Allocate buffers in PSRAM instead of internal memory
+  aec_in_buffer = (int16_t *)heap_caps_aligned_calloc(16, 1, input_size, MALLOC_CAP_SPIRAM);
+  aec_out_buffer = (int16_t *)heap_caps_aligned_calloc(16, 1, output_size, MALLOC_CAP_SPIRAM);
+  
+  if (aec_in_buffer == NULL || aec_out_buffer == NULL) {
+    ESP_LOGE(LOG_TAG, "Failed to allocate AEC buffers");
+    // Try internal memory as fallback
+    if (aec_in_buffer) heap_caps_free(aec_in_buffer);
+    if (aec_out_buffer) heap_caps_free(aec_out_buffer);
+    
+    aec_in_buffer = (int16_t *)heap_caps_aligned_calloc(16, 1, input_size, MALLOC_CAP_INTERNAL);
+    aec_out_buffer = (int16_t *)heap_caps_aligned_calloc(16, 1, output_size, MALLOC_CAP_INTERNAL);
+    
+    if (aec_in_buffer == NULL || aec_out_buffer == NULL) {
+      ESP_LOGE(LOG_TAG, "Failed to allocate AEC buffers even in internal memory");
+      return;
+    }
+    ESP_LOGW(LOG_TAG, "AEC buffers allocated in internal memory");
   }
+  
+  ESP_LOGI(LOG_TAG, "AEC frame size: %d, input size: %d, output size: %d", frame_size, input_size, output_size);
+  ESP_LOGI(LOG_TAG, "AEC buffers allocated: in=%p, out=%p", aec_in_buffer, aec_out_buffer);
+}
+
 
-  auto encoded_size = opus_encode(opus_encoder, (const opus_int16 *)read_buffer,
-                                  PCM_BUFFER_SIZE / sizeof(uint16_t),
-                                  encoder_output_buffer, OPUS_BUFFER_SIZE);
-  peer_connection_send_audio(peer_connection, encoder_output_buffer,
-                             encoded_size);
+void pipecat_send_audio(PeerConnection *peer_connection) {
+  uint64_t start_time, end_time;
+  static uint64_t last_call_end_time = 0;
+  uint64_t function_start_time = esp_timer_get_time();
+  
+  
+  // Wait for (960 samples = 3 Opus frames) in our speaker data reference buffer to handle codec burst pattern
+  uint64_t wait_start = esp_timer_get_time();
+  int initial_samples = 0;
+  int wait_iterations = 0;
+  while (true) {
+    portENTER_CRITICAL(&speaker_buffer_mutex);
+    int samples_available = 0;
+    if (speaker_buffer_active) {
+      samples_available = speaker_buffer_count;
+    }
+    portEXIT_CRITICAL(&speaker_buffer_mutex);
+    
+    wait_iterations++;
+    if (initial_samples == 0) {
+      initial_samples = samples_available;
+    }
+    
+    if (samples_available >= 960) {
+      break;  // Buffer has 3 Opus frames - ready to handle burst consumption
+    }
+    
+    // Brief delay before checking again
+    vTaskDelay(1 / portTICK_PERIOD_MS);
+  }
+  uint64_t wait_end = esp_timer_get_time();
+  total_wait_time += (wait_end - wait_start);
+  
+  
+  // Buffers for interleaved processing
+  static int16_t opus_accumulator[640];  // Accumulate AEC output for Opus encoding
+  static int opus_accumulator_pos = 0;
+  int16_t mic_chunk_buffer[256];
+  int aec_chunk_size = 256;  // AEC processes 256 samples at a time
+  
+  // Track timing for AEC skip decision
+  bool skip_last_aec = false;
+  int frame_count = 0;
+  
+  // Track timing - each read blocks for ~16ms (256 samples @ 16kHz)
+  uint64_t loop_start_time = esp_timer_get_time();
+  
+  // Single loop processing 5 chunks of 256 samples each
+  for (int chunk = 0; chunk < 5; chunk++) {
+    uint64_t chunk_start_time = esp_timer_get_time();
+    // 1. Read 256 samples from microphone
+    
+    start_time = esp_timer_get_time();
+    if (esp_codec_dev_read(mic_codec_dev, (uint8_t*)mic_chunk_buffer, 256 * sizeof(int16_t)) != ESP_OK) {
+      ESP_LOGE(LOG_TAG, "esp_codec_dev_read failed");
+      return;
+    }
+    end_time = esp_timer_get_time();
+    total_read_time += (end_time - start_time);
+    
+    
+    // 2. Process with AEC or bypass
+    if (aec_handle != NULL && aec_enabled && !(chunk == 4 && skip_last_aec)) {
+      // Get reference data from speaker circular buffer
+      int16_t ref_chunk[256];
+      int samples_available = 0;
+      bool log_underrun = false;
+      static int underrun_count = 0;
+      static int ref_success_count = 0;
+      
+      portENTER_CRITICAL(&speaker_buffer_mutex);
+        
+        if (speaker_buffer_active) {
+          samples_available = speaker_buffer_count;
+          
+          if (samples_available >= aec_chunk_size) {
+            // We have enough data, read it
+            for (int i = 0; i < aec_chunk_size; i++) {
+              ref_chunk[i] = speaker_circular_buffer[speaker_read_idx];
+              speaker_read_idx = (speaker_read_idx + 1) % 960;
+            }
+            speaker_buffer_count -= aec_chunk_size;
+          } else if (samples_available > 0) {
+            // Not enough data, read what we have and pad with zeros
+            int i;
+            for (i = 0; i < samples_available; i++) {
+              ref_chunk[i] = speaker_circular_buffer[speaker_read_idx];
+              speaker_read_idx = (speaker_read_idx + 1) % 960;
+            }
+            speaker_buffer_count = 0;  // We consumed all available samples
+            for (; i < aec_chunk_size; i++) {
+              ref_chunk[i] = 0;
+            }
+            
+            underrun_count++;
+            // Log every underrun for first 20, then every 10th
+            if (underrun_count <= 20 || underrun_count % 10 == 0) {
+              log_underrun = true;
+            }
+          } else {
+            // No data at all - don't advance read pointer, just use zeros
+            for (int i = 0; i < aec_chunk_size; i++) {
+              ref_chunk[i] = 0;
+            }
+            
+            underrun_count++;
+            // Log every underrun for first 20, then every 10th
+            if (underrun_count <= 20 || underrun_count % 10 == 0) {
+              log_underrun = true;
+            }
+          }
+        } else {
+          // Buffer not active, use zeros
+          for (int i = 0; i < aec_chunk_size; i++) {
+            ref_chunk[i] = 0;
+          }
+        }
+        
+        portEXIT_CRITICAL(&speaker_buffer_mutex);
+        
+        // Update last read time if we read any data
+        // Update last read time if we successfully read data
+        if (speaker_buffer_active && samples_available >= aec_chunk_size) {
+          last_buffer_read_time = esp_timer_get_time();
+        } else if (speaker_buffer_active && samples_available > 0) {
+          // Even partial reads update the time
+          last_buffer_read_time = esp_timer_get_time();
+        }
+        
+        if (log_underrun) {
+          uint64_t time_since_loop_start = (esp_timer_get_time() - loop_start_time) / 1000;
+          // Get consistent state outside critical section
+          size_t log_w_idx, log_r_idx, log_cnt;
+          portENTER_CRITICAL(&speaker_buffer_mutex);
+          log_w_idx = speaker_write_idx;
+          log_r_idx = speaker_read_idx;
+          log_cnt = speaker_buffer_count;
+          portEXIT_CRITICAL(&speaker_buffer_mutex);
+          
+          ESP_LOGW(LOG_TAG, "[UNDERRUN] #%d: chunk=%d, only %d/%d samples, %llums since loop start, count=%d, w=%d r=%d", 
+                   underrun_count, chunk, samples_available, aec_chunk_size, 
+                   time_since_loop_start, log_cnt, log_w_idx, log_r_idx);
+        }
+      
+      // Create interleaved AEC input
+      for (int i = 0; i < aec_chunk_size; i++) {
+        aec_in_buffer[i * 2] = mic_chunk_buffer[i];     // Mic
+        aec_in_buffer[i * 2 + 1] = ref_chunk[i];        // Reference
+      }
+      
+      // Run AEC
+      start_time = esp_timer_get_time();
+      int ret = afe_aec_process(aec_handle, aec_in_buffer, aec_out_buffer);
+      end_time = esp_timer_get_time();
+      total_aec_time += (end_time - start_time);
+      
+      if (ret < 0) {
+        ESP_LOGE(LOG_TAG, "afe_aec_process failed: return value %d", ret);
+        // Fall back - copy mic input directly
+        memcpy(opus_accumulator + opus_accumulator_pos, mic_chunk_buffer, 256 * sizeof(int16_t));
+      } else {
+        // Copy AEC output to accumulator
+        memcpy(opus_accumulator + opus_accumulator_pos, aec_out_buffer, 256 * sizeof(int16_t));
+      }
+      // ---- 
+      // DEBUG: Just copy mic data
+      // memcpy(opus_accumulator + opus_accumulator_pos, mic_chunk_buffer, 256 * sizeof(int16_t));
+      
+    } else {
+      // AEC disabled or skipped - copy mic input directly
+      memcpy(opus_accumulator + opus_accumulator_pos, mic_chunk_buffer, 256 * sizeof(int16_t));
+      
+    }
+    
+    opus_accumulator_pos += 256;
+    
+    // Note: esp_codec_dev_read blocks, providing natural pacing
+    
+    // 3. Encode and send Opus frames when we have enough samples
+    while (opus_accumulator_pos >= 320) {
+      start_time = esp_timer_get_time();
+      int encoded_size = opus_encode(opus_encoder, opus_accumulator, 320,
+                                    encoder_output_buffer, OPUS_BUFFER_SIZE);
+      end_time = esp_timer_get_time();
+      total_encode_time += (end_time - start_time);
+      
+      start_time = esp_timer_get_time();
+      peer_connection_send_audio(peer_connection, encoder_output_buffer, encoded_size);
+      end_time = esp_timer_get_time();
+      total_send_time += (end_time - start_time);
+      
+      frame_count++;
+      
+      // Frame rate tracking
+      uint64_t now = esp_timer_get_time();
+      if (first_frame_time == 0) {
+        first_frame_time = now;
+        frames_sent = 0;
+      }
+      
+      frames_sent++;
+      
+      // Check frame rate after minimum frames
+      if (frames_sent >= MIN_FRAMES_FOR_MEASUREMENT) {
+        uint64_t elapsed_us = now - first_frame_time;
+        float elapsed_seconds = elapsed_us / 1000000.0f;
+        current_frame_rate = frames_sent / elapsed_seconds;
+        
+        if (current_frame_rate < MIN_ACCEPTABLE_FRAME_RATE) {          
+          // We could try to catch up here, somehow. But not sure what makes sense.
+        }
+        
+        // Reset counters for next measurement period
+        first_frame_time = now;
+        frames_sent = 0;
+      }
+            
+      // Shift remaining samples
+      opus_accumulator_pos -= 320;
+      if (opus_accumulator_pos > 0) {
+        memmove(opus_accumulator, opus_accumulator + 320, opus_accumulator_pos * sizeof(int16_t));
+      }
+    }
+  }
+  
+  // Update statistics
+  
+  cycle_count++;
+  if (cycle_count % 50 == 0) {
+    uint64_t avg_wait_time = total_wait_time / 50;
+    uint64_t avg_read_time = total_read_time / 50;
+    uint64_t avg_aec_time = total_aec_time / 50;
+    uint64_t avg_encode_time = total_encode_time / 50;
+    uint64_t avg_send_time = total_send_time / 50;
+    
+    ESP_LOGI(LOG_TAG, "Audio timing stats (avg over 50 cycles): wait=%llu us, read=%llu us, aec=%llu us, encode=%llu us, send=%llu us, aec_skips=%lu, frame_rate=%.1f fps",
+             avg_wait_time, avg_read_time, avg_aec_time, avg_encode_time, avg_send_time, aec_skip_count, current_frame_rate);
+    
+    // Reset counters
+    total_wait_time = 0;
+    total_read_time = 0;
+    total_aec_time = 0;
+    total_encode_time = 0;
+    total_send_time = 0;
+  }
+  
 }
diff --git a/esp32-s3-box-3/src/screen.cpp b/esp32-s3-box-3/src/screen.cpp
index bbd1d9b..b4b7f47 100644
--- a/esp32-s3-box-3/src/screen.cpp
+++ b/esp32-s3-box-3/src/screen.cpp
@@ -77,6 +77,7 @@ static void screen_task(void *pvParameter) {
 }
 
 void pipecat_init_screen() {
+
   bsp_display_start();
 
   bsp_display_backlight_on();
diff --git a/esp32-s3-box-3/src/webrtc.cpp b/esp32-s3-box-3/src/webrtc.cpp
index b777b81..d26d36a 100644
--- a/esp32-s3-box-3/src/webrtc.cpp
+++ b/esp32-s3-box-3/src/webrtc.cpp
@@ -5,20 +5,26 @@
 
 #include <esp_event.h>
 #include <esp_log.h>
+#include <esp_timer.h>
 #include <string.h>
 
 #include "main.h"
 
+extern SemaphoreHandle_t webrtcSemaphore;
+
 static PeerConnection *peer_connection = NULL;
 
 #ifndef LINUX_BUILD
-StaticTask_t task_buffer;
+
+static TaskHandle_t xPcTaskHandle = NULL;
+static TaskHandle_t xSendAudioTaskHandle = NULL;
+
 void pipecat_send_audio_task(void *user_data) {
   pipecat_init_audio_encoder();
 
   while (1) {
     pipecat_send_audio(peer_connection);
-    vTaskDelay(pdMS_TO_TICKS(TICK_INTERVAL));
+    vTaskDelay(pdMS_TO_TICKS(1));
   }
 }
 #endif
@@ -51,13 +57,15 @@ static void pipecat_onconnectionstatechange_task(PeerConnectionState state,
 #ifndef LINUX_BUILD
     esp_restart();
 #endif
-  } else if (state == PEER_CONNECTION_CONNECTED) {
+  } else if (state == PEER_CONNECTION_COMPLETED) {
 #ifndef LINUX_BUILD
-    StackType_t *stack_memory = (StackType_t *)heap_caps_malloc(
-        30000 * sizeof(StackType_t), MALLOC_CAP_SPIRAM);
-    xTaskCreateStaticPinnedToCore(pipecat_send_audio_task, "audio_publisher",
-                                  30000, NULL, 7, stack_memory, &task_buffer,
-                                  0);
+
+    ESP_LOGI(LOG_TAG, "Creating send audio task");
+    // Todo: move this to a separate init block
+    // Why is this stack size so large? Opus encoder?
+    xTaskCreatePinnedToCore(pipecat_send_audio_task, "audio_publisher", 24576, NULL, 7, &xSendAudioTaskHandle, 1);
+    heap_caps_print_heap_info(MALLOC_CAP_INTERNAL);
+
     pipecat_init_rtvi(peer_connection, &pipecat_rtvi_callbacks);
 #endif
   }
@@ -109,3 +117,29 @@ void pipecat_init_webrtc() {
 void pipecat_webrtc_loop() {
   peer_connection_loop(peer_connection);
 }
+
+
+void peer_connection_task(void* arg) {
+  ESP_LOGI(LOG_TAG, "peer_connection_task started");
+
+  while (peer_connection == NULL) {
+    ESP_LOGI(LOG_TAG, "peer_connection_task waiting for peer_connection");
+    vTaskDelay(pdMS_TO_TICKS(1000));
+  }
+
+
+  for (;;) {
+    if (xSemaphoreTake(webrtcSemaphore, portMAX_DELAY)) {
+      peer_connection_loop(peer_connection);
+      xSemaphoreGive(webrtcSemaphore);
+    }
+
+    vTaskDelay(pdMS_TO_TICKS(1));
+  }
+}
+
+void pipecat_webrtc_run_task() {
+  ESP_LOGI(LOG_TAG, "peer_connection_task starting ...");
+
+  xTaskCreatePinnedToCore(peer_connection_task, "peer_connection", 16384, NULL, 5, &xPcTaskHandle, 0);
+}