diff --git a/.gitmodules b/.gitmodules index c6602c8..65f0d07 100644 --- a/.gitmodules +++ b/.gitmodules @@ -10,3 +10,6 @@ [submodule "esp32-s3-box-3/components/esp-protocols"] path = esp32-s3-box-3/components/esp-protocols url = https://github.com/espressif/esp-protocols.git +[submodule "esp32-s3-box-3/esp-adf"] + path = esp32-s3-box-3/esp-adf + url = https://github.com/espressif/esp-adf.git diff --git a/esp32-s3-box-3/CMakeLists.txt b/esp32-s3-box-3/CMakeLists.txt index fbba559..ae931c7 100644 --- a/esp32-s3-box-3/CMakeLists.txt +++ b/esp32-s3-box-3/CMakeLists.txt @@ -7,6 +7,11 @@ if(NOT IDF_TARGET STREQUAL linux) add_compile_definitions(WIFI_SSID="$ENV{WIFI_SSID}") add_compile_definitions(WIFI_PASSWORD="$ENV{WIFI_PASSWORD}") + + # We are not using video in this app, so set the max NALU size to very small + if(NOT DEFINED ENV{CONFIG_MAX_NALU_SIZE}) + add_compile_definitions(CONFIG_MAX_NALU_SIZE=32) + endif() endif() if(NOT DEFINED ENV{PIPECAT_SMALLWEBRTC_URL}) @@ -20,7 +25,7 @@ endif() add_compile_definitions(PIPECAT_SMALLWEBRTC_URL="$ENV{PIPECAT_SMALLWEBRTC_URL}") set(COMPONENTS src) -set(EXTRA_COMPONENT_DIRS "src" "components/srtp" "components/peer" "components/esp-libopus") +set(EXTRA_COMPONENT_DIRS "src" "components/srtp" "components/peer" "components/esp-libopus" "esp-adf/components") if(IDF_TARGET STREQUAL linux) add_compile_definitions(LINUX_BUILD=1) diff --git a/esp32-s3-box-3/dependencies.lock b/esp32-s3-box-3/dependencies.lock index e6c6da2..d533402 100644 --- a/esp32-s3-box-3/dependencies.lock +++ b/esp32-s3-box-3/dependencies.lock @@ -1,4 +1,10 @@ dependencies: + esp_codec_dev: + dependencies: [] + source: + path: /home/khkramer/src/pipecat-esp32/esp32-s3-box-3/esp-adf/components/esp_codec_dev + type: local + version: 1.3.5 espressif/button: component_hash: f53face2ab21fa0ffaf4cf0f6e513d393f56df6586bb2ad1146120f03f19ee05 dependencies: @@ -63,16 +69,16 @@ dependencies: targets: - esp32s3 version: 3.0.0~1 - espressif/esp_codec_dev: - component_hash: c71e2d13dad6fc41561590dd88dbc45c79e3f4ef48d5ee3575c60e8b6c8e79d5 + espressif/esp-dsp: + component_hash: 619639efc18cfa361a9e423739b9b0ffc14991effc6c027f955c2f2c3bf1754b dependencies: - name: idf require: private - version: '>=4.0' + version: '>=4.2' source: - registry_url: https://components.espressif.com + registry_url: https://components.espressif.com/ type: service - version: 1.3.5 + version: 1.6.0 espressif/esp_lcd_ili9341: component_hash: 31f1b793aa2110dd2ae071c21ccbff0a4eb20d9a4ee40b6294c0dc0ad9552c4e dependencies: @@ -84,7 +90,7 @@ dependencies: require: private version: 0.* source: - registry_url: https://components.espressif.com + registry_url: https://components.espressif.com/ type: service version: 1.2.0 espressif/esp_lcd_touch: @@ -139,6 +145,16 @@ dependencies: registry_url: https://components.espressif.com type: service version: 2.6.0 + espressif/esp_websocket_client: + component_hash: f77326f0e1c38da4e9c97e17c5d649b0dd13027f2645e720e48db269638fd622 + dependencies: + - name: idf + require: private + version: '>=5.0' + source: + registry_url: https://components.espressif.com/ + type: service + version: 1.4.0 espressif/icm42670: component_hash: 28b56e174f75c70037f5208aaed6c3789f0d243500d975519584bf9dc8c0836c dependencies: @@ -149,6 +165,36 @@ dependencies: registry_url: https://components.espressif.com type: service version: 2.0.2 + espressif/jsmn: + component_hash: d80350c41bbaa827c98a25b6072df00884e72f54885996fab4a4f0aebce6b6c3 + dependencies: + - name: idf + require: private + version: '>=4.3' + source: + registry_url: https://components.espressif.com/ + type: service + version: 1.1.0 + espressif/nghttp: + component_hash: 32e513409ddce0000990ec74b0711f535462726236d72578cd70498e6bd66dc3 + dependencies: + - name: idf + require: private + version: '>=5.0' + source: + registry_url: https://components.espressif.com/ + type: service + version: 1.65.0 + espressif/zlib: + component_hash: d901723af51f13fc8e5824f39f32239c847956e8fd951a05266588dc5cfbb9ae + dependencies: + - name: idf + require: private + version: '>=4.4' + source: + registry_url: https://components.espressif.com/ + type: service + version: 1.3.1 idf: source: type: idf @@ -162,7 +208,13 @@ dependencies: version: 9.3.0 direct_dependencies: - espressif/esp-box-3 +- espressif/esp-dsp +- espressif/esp_lcd_ili9341 +- espressif/esp_websocket_client +- espressif/jsmn +- espressif/nghttp +- espressif/zlib - idf -manifest_hash: 55f6a702498de8e87201da504a38c5c958e2094ec832c4f8d6d368369d5d0625 +manifest_hash: 1891dc87058c6eec7aa77f286a1967ebe7c293709db8da5b2f1e79d2e45f57ef target: esp32s3 version: 2.0.0 diff --git a/esp32-s3-box-3/esp-adf b/esp32-s3-box-3/esp-adf new file mode 160000 index 0000000..1e839af --- /dev/null +++ b/esp32-s3-box-3/esp-adf @@ -0,0 +1 @@ +Subproject commit 1e839afb5a5166aad6028ac2796272f32454dced diff --git a/esp32-s3-box-3/partitions.csv b/esp32-s3-box-3/partitions.csv index 03d5d13..6bfa8e4 100644 --- a/esp32-s3-box-3/partitions.csv +++ b/esp32-s3-box-3/partitions.csv @@ -2,5 +2,5 @@ # Name, Type, SubType, Offset, Size, Flags nvs, data, nvs, 0x9000, 0x6000, phy_init, data, phy, 0xf000, 0x1000, -factory, app, factory, 0x10000, 0x180000, +factory, app, factory, 0x10000, 0x200000, diff --git a/esp32-s3-box-3/sdkconfig.defaults b/esp32-s3-box-3/sdkconfig.defaults index 236be55..9abc65a 100644 --- a/esp32-s3-box-3/sdkconfig.defaults +++ b/esp32-s3-box-3/sdkconfig.defaults @@ -18,6 +18,9 @@ CONFIG_ESP_MAIN_TASK_STACK_SIZE=16384 # Defaults to partitions.csv CONFIG_PARTITION_TABLE_CUSTOM=y +# Set flash size to 16MB for ESP32-S3-Box-3 +CONFIG_ESPTOOLPY_FLASHSIZE_16MB=y + # Set highest CPU Freq CONFIG_ESP_DEFAULT_CPU_FREQ_MHZ_240=y @@ -33,3 +36,10 @@ CONFIG_COMPILER_OPTIMIZATION_PERF=y CONFIG_COMPILER_OPTIMIZATION_ASSERTIONS_DISABLE=y CONFIG_CODEC_I2C_BACKWARD_COMPATIBLE=n + +# Use C library memory allocator for LVGL to save 64KB of BSS +# This removes the 64KB work_mem_int buffer from BSS +CONFIG_LV_USE_BUILTIN_MALLOC=n +CONFIG_LV_USE_CLIB_MALLOC=y +CONFIG_BSP_LCD_DRAW_BUF_HEIGHT=10 +CONFIG_BSP_LCD_DRAW_BUF_DOUBLE=n diff --git a/esp32-s3-box-3/src/CMakeLists.txt b/esp32-s3-box-3/src/CMakeLists.txt index da4e1e7..cc4ab61 100644 --- a/esp32-s3-box-3/src/CMakeLists.txt +++ b/esp32-s3-box-3/src/CMakeLists.txt @@ -7,7 +7,7 @@ if(IDF_TARGET STREQUAL linux) else() idf_component_register( SRCS ${COMMON_SRC} "wifi.cpp" "media.cpp" "rtvi.cpp" "rtvi_callbacks.cpp" "screen.cpp" - REQUIRES driver esp_wifi nvs_flash peer esp_psram esp-libopus esp_http_client json lvgl) + REQUIRES driver esp_wifi nvs_flash peer esp_psram esp-libopus esp_http_client json lvgl esp-sr) endif() idf_component_get_property(lib peer COMPONENT_LIB) diff --git a/esp32-s3-box-3/src/main.cpp b/esp32-s3-box-3/src/main.cpp index 8bc08f1..2d18090 100644 --- a/esp32-s3-box-3/src/main.cpp +++ b/esp32-s3-box-3/src/main.cpp @@ -1,5 +1,6 @@ #include "main.h" +#include #include #include #include @@ -7,27 +8,49 @@ #ifndef LINUX_BUILD #include "nvs_flash.h" +// All operations that modify the webrtc state should use this semaphore to ensure thread safety +SemaphoreHandle_t webrtcSemaphore = NULL; + extern "C" void app_main(void) { - esp_err_t ret = nvs_flash_init(); - if (ret == ESP_ERR_NVS_NO_FREE_PAGES || - ret == ESP_ERR_NVS_NEW_VERSION_FOUND) { - ESP_ERROR_CHECK(nvs_flash_erase()); - ret = nvs_flash_init(); - } - ESP_ERROR_CHECK(ret); + ESP_LOGI(LOG_TAG, "[APP] Startup.."); + ESP_LOGI(LOG_TAG, "[APP] IDF version: %s", esp_get_idf_version()); + heap_caps_print_heap_info(MALLOC_CAP_INTERNAL); + + ESP_ERROR_CHECK(nvs_flash_init()); ESP_ERROR_CHECK(esp_event_loop_create_default()); + + webrtcSemaphore = xSemaphoreCreateMutex(); + pipecat_init_screen(); + heap_caps_print_heap_info(MALLOC_CAP_INTERNAL); + peer_init(); + heap_caps_print_heap_info(MALLOC_CAP_INTERNAL); + pipecat_init_audio_capture(); + heap_caps_print_heap_info(MALLOC_CAP_INTERNAL); + pipecat_init_audio_decoder(); + heap_caps_print_heap_info(MALLOC_CAP_INTERNAL); + + pipecat_init_aec(); + heap_caps_print_heap_info(MALLOC_CAP_INTERNAL); + pipecat_init_wifi(); + heap_caps_print_heap_info(MALLOC_CAP_INTERNAL); + pipecat_init_webrtc(); + heap_caps_print_heap_info(MALLOC_CAP_INTERNAL); + ESP_LOGI(LOG_TAG, "Pipecat ESP32 client initialized"); pipecat_screen_system_log("Pipecat ESP32 client initialized\n"); + ESP_LOGI(LOG_TAG, "Starting webrtc task"); + pipecat_webrtc_run_task(); + while (1) { - pipecat_webrtc_loop(); + // Add some stats printout to the screen here vTaskDelay(pdMS_TO_TICKS(TICK_INTERVAL)); } } @@ -38,6 +61,7 @@ int main(void) { pipecat_webrtc(); while (1) { + // Todo: test/fix linux build pipecat_webrtc_loop(); vTaskDelay(pdMS_TO_TICKS(TICK_INTERVAL)); } diff --git a/esp32-s3-box-3/src/main.h b/esp32-s3-box-3/src/main.h index 389d7ee..5797b52 100644 --- a/esp32-s3-box-3/src/main.h +++ b/esp32-s3-box-3/src/main.h @@ -14,16 +14,19 @@ extern void pipecat_init_audio_decoder(); extern void pipecat_init_audio_encoder(); extern void pipecat_send_audio(PeerConnection *peer_connection); extern void pipecat_audio_decode(uint8_t *data, size_t size); +extern void pipecat_init_aec(); // WebRTC / Signalling extern void pipecat_init_webrtc(); extern void pipecat_webrtc_loop(); +extern void pipecat_webrtc_run_task(); extern void pipecat_http_request(char *offer, char *answer); // RTVI typedef struct { void (*on_bot_started_speaking)(); void (*on_bot_stopped_speaking)(); + void (*on_bot_tts_text)(const char *text); } rtvi_callbacks_t; diff --git a/esp32-s3-box-3/src/media.cpp b/esp32-s3-box-3/src/media.cpp index ba95185..989d2af 100644 --- a/esp32-s3-box-3/src/media.cpp +++ b/esp32-s3-box-3/src/media.cpp @@ -3,28 +3,32 @@ #include #include +#include +#include #include "esp_log.h" #include "main.h" +#include "esp_afe_aec.h" + + #define SAMPLE_RATE (16000) #define OPUS_BUFFER_SIZE 1276 // 1276 bytes is recommended by opus_encode -#define PCM_BUFFER_SIZE 640 +#define AEC_CHUNK_SIZE 256 // AEC processes 256 samples at a time for 16kHz +#define OPUS_FRAME_SIZE 320 // Opus frame size in samples +#define COMMON_BUFFER_SAMPLES 1280 // LCM(256, 320) = 1280 samples +#define AEC_CHUNKS_TO_BATCH (COMMON_BUFFER_SAMPLES / AEC_CHUNK_SIZE) // 5 chunks +#define OPUS_FRAMES_TO_BATCH (COMMON_BUFFER_SAMPLES / OPUS_FRAME_SIZE) // 4 frames +#define PCM_BUFFER_SIZE (COMMON_BUFFER_SAMPLES * sizeof(int16_t)) // 2560 bytes #define OPUS_ENCODER_BITRATE 30000 #define OPUS_ENCODER_COMPLEXITY 0 +// AEC control +static bool aec_enabled = true; // Can be set to false to bypass AEC + std::atomic is_playing = false; -void set_is_playing(int16_t *in_buf, size_t in_samples) { - bool any_set = false; - for (size_t i = 0; i < in_samples; i++) { - if (in_buf[i] != -1 && in_buf[i] != 0 && in_buf[i] != 1) { - any_set = true; - } - } - is_playing = any_set; -} esp_codec_dev_handle_t mic_codec_dev = NULL; esp_codec_dev_handle_t spk_codec_dev = NULL; @@ -48,6 +52,39 @@ void pipecat_init_audio_capture() { opus_int16 *decoder_buffer = NULL; OpusDecoder *opus_decoder = NULL; +// Circular buffer for speaker reference data (not using FreeRTOS ring buffer) +// This is a sliding window buffer, not a queue - we read recent data for AEC +static int16_t speaker_circular_buffer[960]; // 3 Opus frames (60ms @ 16kHz) +static volatile size_t speaker_write_idx = 0; +static volatile size_t speaker_read_idx = 0; +static volatile size_t speaker_buffer_count = 0; // Number of samples in buffer +static volatile bool speaker_buffer_active = false; // True when we're receiving audio +static portMUX_TYPE speaker_buffer_mutex = portMUX_INITIALIZER_UNLOCKED; +static volatile uint64_t last_buffer_read_time = 0; // Track when buffer was last read + +// Function to update playback state and reset buffer when playback stops +void set_is_playing(int16_t *in_buf, size_t in_samples) { + bool any_set = false; + for (size_t i = 0; i < in_samples; i++) { + if (in_buf[i] != -1 && in_buf[i] != 0 && in_buf[i] != 1) { + any_set = true; + } + } + bool was_playing = is_playing.load(); + is_playing = any_set; + + // Reset buffer state when playback stops + if (was_playing && !any_set) { + portENTER_CRITICAL(&speaker_buffer_mutex); + speaker_buffer_active = false; + speaker_read_idx = speaker_write_idx; // Reset read to match write + speaker_buffer_count = 0; + last_buffer_read_time = 0; // Reset read time + portEXIT_CRITICAL(&speaker_buffer_mutex); + ESP_LOGD(LOG_TAG, "Playback stopped, reset speaker buffer"); + } +} + void pipecat_init_audio_decoder() { int decoder_error = 0; opus_decoder = opus_decoder_create(SAMPLE_RATE, 1, &decoder_error); @@ -60,12 +97,53 @@ void pipecat_init_audio_decoder() { } void pipecat_audio_decode(uint8_t *data, size_t size) { - esp_err_t ret; - int decoded_size = - opus_decode(opus_decoder, data, size, decoder_buffer, PCM_BUFFER_SIZE, 0); + uint64_t now = esp_timer_get_time(); + + // Decode Opus frame to get the size (or use fixed size if predictable) + int decoded_size = opus_decode(opus_decoder, data, size, decoder_buffer, PCM_BUFFER_SIZE, 0); if (decoded_size > 0) { + // FIRST: Write to circular buffer for AEC reference (before any other processing) + portENTER_CRITICAL(&speaker_buffer_mutex); + + // Write to circular buffer + size_t space_available = 960 - speaker_buffer_count; + size_t samples_to_write = decoded_size; + + // If we don't have enough space, drop oldest samples + if (samples_to_write > space_available) { + size_t samples_to_drop = samples_to_write - space_available; + speaker_read_idx = (speaker_read_idx + samples_to_drop) % 960; + speaker_buffer_count -= samples_to_drop; + } + + // Write new samples + for (int i = 0; i < samples_to_write; i++) { + speaker_circular_buffer[speaker_write_idx] = decoder_buffer[i]; + speaker_write_idx = (speaker_write_idx + 1) % 960; + } + speaker_buffer_count += samples_to_write; + speaker_buffer_active = true; + + // Get samples in buffer after write (while still in critical section) + size_t samples_in_buffer = speaker_buffer_count; + + portEXIT_CRITICAL(&speaker_buffer_mutex); + + // Log timing relative to last read from this buffer + uint64_t time_since_last_read = 0; + if (last_buffer_read_time > 0 && now > last_buffer_read_time) { + time_since_last_read = (now - last_buffer_read_time) / 1000; + } else { + time_since_last_read = 0; // First write or invalid timestamp + } + + + // THEN: Process for playback set_is_playing(decoder_buffer, decoded_size); + + // Send to speaker + esp_err_t ret; if ((ret = esp_codec_dev_write(spk_codec_dev, decoder_buffer, decoded_size * sizeof(uint16_t))) != ESP_OK) { @@ -78,6 +156,27 @@ OpusEncoder *opus_encoder = NULL; uint8_t *encoder_output_buffer = NULL; uint8_t *read_buffer = NULL; +// Timing statistics +static uint64_t total_read_time = 0; +static uint64_t total_aec_time = 0; +static uint64_t total_encode_time = 0; +static uint64_t total_send_time = 0; +static uint64_t total_wait_time = 0; +static uint32_t cycle_count = 0; +static uint32_t aec_skip_count = 0; +static uint64_t last_frame_send_time = 0; + +// Frame rate monitoring +static uint64_t first_frame_time = 0; +static uint32_t frames_sent = 0; +static float current_frame_rate = 0.0; +static const uint32_t MIN_FRAMES_FOR_MEASUREMENT = 50; // Measure over 50 frames (~1 second) +static const float TARGET_FRAME_RATE = 50.0; // 50 fps (20ms per frame) +static const float MIN_ACCEPTABLE_FRAME_RATE = 48.0; // Allow 4% tolerance + +// AEC processed buffer +static int16_t *processed_buffer = NULL; + void pipecat_init_audio_encoder() { int encoder_error; opus_encoder = opus_encoder_create(SAMPLE_RATE, 1, OPUS_APPLICATION_VOIP, @@ -100,22 +199,304 @@ void pipecat_init_audio_encoder() { read_buffer = (uint8_t *)heap_caps_malloc(PCM_BUFFER_SIZE, MALLOC_CAP_DEFAULT); encoder_output_buffer = (uint8_t *)malloc(OPUS_BUFFER_SIZE); + processed_buffer = (int16_t *)heap_caps_malloc(PCM_BUFFER_SIZE, MALLOC_CAP_DEFAULT); } -void pipecat_send_audio(PeerConnection *peer_connection) { - if (esp_codec_dev_read(mic_codec_dev, read_buffer, PCM_BUFFER_SIZE) != - ESP_OK) { - printf("esp_codec_dev_read failed"); +static int16_t *aec_in_buffer = NULL; +static int16_t *aec_out_buffer = NULL; +static afe_aec_handle_t *aec_handle = NULL; + +void pipecat_init_aec() { + // Try a simpler approach - create AEC without the task configuration struct + ESP_LOGI(LOG_TAG, "Initializing AEC"); + + // Speaker circular buffer is now statically allocated + ESP_LOGI(LOG_TAG, "Speaker circular buffer ready, size=960 samples"); + + // Create AEC with basic parameters. + // Filter_length=4 recommended for ESP32S3, but filter_length=2 and AFE_MODE_LOW_COST seem to be a local + // maximum in terms of quality with the current code we have here. + aec_handle = afe_aec_create("MR", 2, AFE_TYPE_VC, AFE_MODE_LOW_COST); + if (aec_handle == NULL) { + ESP_LOGE(LOG_TAG, "Failed to create AEC"); return; } + + ESP_LOGI(LOG_TAG, "AEC created successfully"); + + int frame_size = afe_aec_get_chunksize(aec_handle); + int input_size = frame_size * aec_handle->pcm_config.total_ch_num * sizeof(int16_t); + int output_size = frame_size * sizeof(int16_t); - if (is_playing) { - memset(read_buffer, 0, PCM_BUFFER_SIZE); + // Allocate buffers in PSRAM instead of internal memory + aec_in_buffer = (int16_t *)heap_caps_aligned_calloc(16, 1, input_size, MALLOC_CAP_SPIRAM); + aec_out_buffer = (int16_t *)heap_caps_aligned_calloc(16, 1, output_size, MALLOC_CAP_SPIRAM); + + if (aec_in_buffer == NULL || aec_out_buffer == NULL) { + ESP_LOGE(LOG_TAG, "Failed to allocate AEC buffers"); + // Try internal memory as fallback + if (aec_in_buffer) heap_caps_free(aec_in_buffer); + if (aec_out_buffer) heap_caps_free(aec_out_buffer); + + aec_in_buffer = (int16_t *)heap_caps_aligned_calloc(16, 1, input_size, MALLOC_CAP_INTERNAL); + aec_out_buffer = (int16_t *)heap_caps_aligned_calloc(16, 1, output_size, MALLOC_CAP_INTERNAL); + + if (aec_in_buffer == NULL || aec_out_buffer == NULL) { + ESP_LOGE(LOG_TAG, "Failed to allocate AEC buffers even in internal memory"); + return; + } + ESP_LOGW(LOG_TAG, "AEC buffers allocated in internal memory"); } + + ESP_LOGI(LOG_TAG, "AEC frame size: %d, input size: %d, output size: %d", frame_size, input_size, output_size); + ESP_LOGI(LOG_TAG, "AEC buffers allocated: in=%p, out=%p", aec_in_buffer, aec_out_buffer); +} + - auto encoded_size = opus_encode(opus_encoder, (const opus_int16 *)read_buffer, - PCM_BUFFER_SIZE / sizeof(uint16_t), - encoder_output_buffer, OPUS_BUFFER_SIZE); - peer_connection_send_audio(peer_connection, encoder_output_buffer, - encoded_size); +void pipecat_send_audio(PeerConnection *peer_connection) { + uint64_t start_time, end_time; + static uint64_t last_call_end_time = 0; + uint64_t function_start_time = esp_timer_get_time(); + + + // Wait for (960 samples = 3 Opus frames) in our speaker data reference buffer to handle codec burst pattern + uint64_t wait_start = esp_timer_get_time(); + int initial_samples = 0; + int wait_iterations = 0; + while (true) { + portENTER_CRITICAL(&speaker_buffer_mutex); + int samples_available = 0; + if (speaker_buffer_active) { + samples_available = speaker_buffer_count; + } + portEXIT_CRITICAL(&speaker_buffer_mutex); + + wait_iterations++; + if (initial_samples == 0) { + initial_samples = samples_available; + } + + if (samples_available >= 960) { + break; // Buffer has 3 Opus frames - ready to handle burst consumption + } + + // Brief delay before checking again + vTaskDelay(1 / portTICK_PERIOD_MS); + } + uint64_t wait_end = esp_timer_get_time(); + total_wait_time += (wait_end - wait_start); + + + // Buffers for interleaved processing + static int16_t opus_accumulator[640]; // Accumulate AEC output for Opus encoding + static int opus_accumulator_pos = 0; + int16_t mic_chunk_buffer[256]; + int aec_chunk_size = 256; // AEC processes 256 samples at a time + + // Track timing for AEC skip decision + bool skip_last_aec = false; + int frame_count = 0; + + // Track timing - each read blocks for ~16ms (256 samples @ 16kHz) + uint64_t loop_start_time = esp_timer_get_time(); + + // Single loop processing 5 chunks of 256 samples each + for (int chunk = 0; chunk < 5; chunk++) { + uint64_t chunk_start_time = esp_timer_get_time(); + // 1. Read 256 samples from microphone + + start_time = esp_timer_get_time(); + if (esp_codec_dev_read(mic_codec_dev, (uint8_t*)mic_chunk_buffer, 256 * sizeof(int16_t)) != ESP_OK) { + ESP_LOGE(LOG_TAG, "esp_codec_dev_read failed"); + return; + } + end_time = esp_timer_get_time(); + total_read_time += (end_time - start_time); + + + // 2. Process with AEC or bypass + if (aec_handle != NULL && aec_enabled && !(chunk == 4 && skip_last_aec)) { + // Get reference data from speaker circular buffer + int16_t ref_chunk[256]; + int samples_available = 0; + bool log_underrun = false; + static int underrun_count = 0; + static int ref_success_count = 0; + + portENTER_CRITICAL(&speaker_buffer_mutex); + + if (speaker_buffer_active) { + samples_available = speaker_buffer_count; + + if (samples_available >= aec_chunk_size) { + // We have enough data, read it + for (int i = 0; i < aec_chunk_size; i++) { + ref_chunk[i] = speaker_circular_buffer[speaker_read_idx]; + speaker_read_idx = (speaker_read_idx + 1) % 960; + } + speaker_buffer_count -= aec_chunk_size; + } else if (samples_available > 0) { + // Not enough data, read what we have and pad with zeros + int i; + for (i = 0; i < samples_available; i++) { + ref_chunk[i] = speaker_circular_buffer[speaker_read_idx]; + speaker_read_idx = (speaker_read_idx + 1) % 960; + } + speaker_buffer_count = 0; // We consumed all available samples + for (; i < aec_chunk_size; i++) { + ref_chunk[i] = 0; + } + + underrun_count++; + // Log every underrun for first 20, then every 10th + if (underrun_count <= 20 || underrun_count % 10 == 0) { + log_underrun = true; + } + } else { + // No data at all - don't advance read pointer, just use zeros + for (int i = 0; i < aec_chunk_size; i++) { + ref_chunk[i] = 0; + } + + underrun_count++; + // Log every underrun for first 20, then every 10th + if (underrun_count <= 20 || underrun_count % 10 == 0) { + log_underrun = true; + } + } + } else { + // Buffer not active, use zeros + for (int i = 0; i < aec_chunk_size; i++) { + ref_chunk[i] = 0; + } + } + + portEXIT_CRITICAL(&speaker_buffer_mutex); + + // Update last read time if we read any data + // Update last read time if we successfully read data + if (speaker_buffer_active && samples_available >= aec_chunk_size) { + last_buffer_read_time = esp_timer_get_time(); + } else if (speaker_buffer_active && samples_available > 0) { + // Even partial reads update the time + last_buffer_read_time = esp_timer_get_time(); + } + + if (log_underrun) { + uint64_t time_since_loop_start = (esp_timer_get_time() - loop_start_time) / 1000; + // Get consistent state outside critical section + size_t log_w_idx, log_r_idx, log_cnt; + portENTER_CRITICAL(&speaker_buffer_mutex); + log_w_idx = speaker_write_idx; + log_r_idx = speaker_read_idx; + log_cnt = speaker_buffer_count; + portEXIT_CRITICAL(&speaker_buffer_mutex); + + ESP_LOGW(LOG_TAG, "[UNDERRUN] #%d: chunk=%d, only %d/%d samples, %llums since loop start, count=%d, w=%d r=%d", + underrun_count, chunk, samples_available, aec_chunk_size, + time_since_loop_start, log_cnt, log_w_idx, log_r_idx); + } + + // Create interleaved AEC input + for (int i = 0; i < aec_chunk_size; i++) { + aec_in_buffer[i * 2] = mic_chunk_buffer[i]; // Mic + aec_in_buffer[i * 2 + 1] = ref_chunk[i]; // Reference + } + + // Run AEC + start_time = esp_timer_get_time(); + int ret = afe_aec_process(aec_handle, aec_in_buffer, aec_out_buffer); + end_time = esp_timer_get_time(); + total_aec_time += (end_time - start_time); + + if (ret < 0) { + ESP_LOGE(LOG_TAG, "afe_aec_process failed: return value %d", ret); + // Fall back - copy mic input directly + memcpy(opus_accumulator + opus_accumulator_pos, mic_chunk_buffer, 256 * sizeof(int16_t)); + } else { + // Copy AEC output to accumulator + memcpy(opus_accumulator + opus_accumulator_pos, aec_out_buffer, 256 * sizeof(int16_t)); + } + // ---- + // DEBUG: Just copy mic data + // memcpy(opus_accumulator + opus_accumulator_pos, mic_chunk_buffer, 256 * sizeof(int16_t)); + + } else { + // AEC disabled or skipped - copy mic input directly + memcpy(opus_accumulator + opus_accumulator_pos, mic_chunk_buffer, 256 * sizeof(int16_t)); + + } + + opus_accumulator_pos += 256; + + // Note: esp_codec_dev_read blocks, providing natural pacing + + // 3. Encode and send Opus frames when we have enough samples + while (opus_accumulator_pos >= 320) { + start_time = esp_timer_get_time(); + int encoded_size = opus_encode(opus_encoder, opus_accumulator, 320, + encoder_output_buffer, OPUS_BUFFER_SIZE); + end_time = esp_timer_get_time(); + total_encode_time += (end_time - start_time); + + start_time = esp_timer_get_time(); + peer_connection_send_audio(peer_connection, encoder_output_buffer, encoded_size); + end_time = esp_timer_get_time(); + total_send_time += (end_time - start_time); + + frame_count++; + + // Frame rate tracking + uint64_t now = esp_timer_get_time(); + if (first_frame_time == 0) { + first_frame_time = now; + frames_sent = 0; + } + + frames_sent++; + + // Check frame rate after minimum frames + if (frames_sent >= MIN_FRAMES_FOR_MEASUREMENT) { + uint64_t elapsed_us = now - first_frame_time; + float elapsed_seconds = elapsed_us / 1000000.0f; + current_frame_rate = frames_sent / elapsed_seconds; + + if (current_frame_rate < MIN_ACCEPTABLE_FRAME_RATE) { + // We could try to catch up here, somehow. But not sure what makes sense. + } + + // Reset counters for next measurement period + first_frame_time = now; + frames_sent = 0; + } + + // Shift remaining samples + opus_accumulator_pos -= 320; + if (opus_accumulator_pos > 0) { + memmove(opus_accumulator, opus_accumulator + 320, opus_accumulator_pos * sizeof(int16_t)); + } + } + } + + // Update statistics + + cycle_count++; + if (cycle_count % 50 == 0) { + uint64_t avg_wait_time = total_wait_time / 50; + uint64_t avg_read_time = total_read_time / 50; + uint64_t avg_aec_time = total_aec_time / 50; + uint64_t avg_encode_time = total_encode_time / 50; + uint64_t avg_send_time = total_send_time / 50; + + ESP_LOGI(LOG_TAG, "Audio timing stats (avg over 50 cycles): wait=%llu us, read=%llu us, aec=%llu us, encode=%llu us, send=%llu us, aec_skips=%lu, frame_rate=%.1f fps", + avg_wait_time, avg_read_time, avg_aec_time, avg_encode_time, avg_send_time, aec_skip_count, current_frame_rate); + + // Reset counters + total_wait_time = 0; + total_read_time = 0; + total_aec_time = 0; + total_encode_time = 0; + total_send_time = 0; + } + } diff --git a/esp32-s3-box-3/src/screen.cpp b/esp32-s3-box-3/src/screen.cpp index bbd1d9b..b4b7f47 100644 --- a/esp32-s3-box-3/src/screen.cpp +++ b/esp32-s3-box-3/src/screen.cpp @@ -77,6 +77,7 @@ static void screen_task(void *pvParameter) { } void pipecat_init_screen() { + bsp_display_start(); bsp_display_backlight_on(); diff --git a/esp32-s3-box-3/src/webrtc.cpp b/esp32-s3-box-3/src/webrtc.cpp index b777b81..d26d36a 100644 --- a/esp32-s3-box-3/src/webrtc.cpp +++ b/esp32-s3-box-3/src/webrtc.cpp @@ -5,20 +5,26 @@ #include #include +#include #include #include "main.h" +extern SemaphoreHandle_t webrtcSemaphore; + static PeerConnection *peer_connection = NULL; #ifndef LINUX_BUILD -StaticTask_t task_buffer; + +static TaskHandle_t xPcTaskHandle = NULL; +static TaskHandle_t xSendAudioTaskHandle = NULL; + void pipecat_send_audio_task(void *user_data) { pipecat_init_audio_encoder(); while (1) { pipecat_send_audio(peer_connection); - vTaskDelay(pdMS_TO_TICKS(TICK_INTERVAL)); + vTaskDelay(pdMS_TO_TICKS(1)); } } #endif @@ -51,13 +57,15 @@ static void pipecat_onconnectionstatechange_task(PeerConnectionState state, #ifndef LINUX_BUILD esp_restart(); #endif - } else if (state == PEER_CONNECTION_CONNECTED) { + } else if (state == PEER_CONNECTION_COMPLETED) { #ifndef LINUX_BUILD - StackType_t *stack_memory = (StackType_t *)heap_caps_malloc( - 30000 * sizeof(StackType_t), MALLOC_CAP_SPIRAM); - xTaskCreateStaticPinnedToCore(pipecat_send_audio_task, "audio_publisher", - 30000, NULL, 7, stack_memory, &task_buffer, - 0); + + ESP_LOGI(LOG_TAG, "Creating send audio task"); + // Todo: move this to a separate init block + // Why is this stack size so large? Opus encoder? + xTaskCreatePinnedToCore(pipecat_send_audio_task, "audio_publisher", 24576, NULL, 7, &xSendAudioTaskHandle, 1); + heap_caps_print_heap_info(MALLOC_CAP_INTERNAL); + pipecat_init_rtvi(peer_connection, &pipecat_rtvi_callbacks); #endif } @@ -109,3 +117,29 @@ void pipecat_init_webrtc() { void pipecat_webrtc_loop() { peer_connection_loop(peer_connection); } + + +void peer_connection_task(void* arg) { + ESP_LOGI(LOG_TAG, "peer_connection_task started"); + + while (peer_connection == NULL) { + ESP_LOGI(LOG_TAG, "peer_connection_task waiting for peer_connection"); + vTaskDelay(pdMS_TO_TICKS(1000)); + } + + + for (;;) { + if (xSemaphoreTake(webrtcSemaphore, portMAX_DELAY)) { + peer_connection_loop(peer_connection); + xSemaphoreGive(webrtcSemaphore); + } + + vTaskDelay(pdMS_TO_TICKS(1)); + } +} + +void pipecat_webrtc_run_task() { + ESP_LOGI(LOG_TAG, "peer_connection_task starting ..."); + + xTaskCreatePinnedToCore(peer_connection_task, "peer_connection", 16384, NULL, 5, &xPcTaskHandle, 0); +}