diff --git a/config.json b/config.json index 5cafb4c44..f1cd80125 100644 --- a/config.json +++ b/config.json @@ -98,5 +98,6 @@ "ENABLE_SPOTLIGHT": false, "AUTO_CONNECT_RTM": false, "ENABLE_CONVERSATIONAL_AI": false, - "CUSTOMIZE_AGENT": false + "CUSTOMIZE_AGENT": false, + "SONIOX_API_KEY": "" } diff --git a/template/bridge/rtc/webNg/RtcEngine.ts b/template/bridge/rtc/webNg/RtcEngine.ts index 761910582..e45864429 100644 --- a/template/bridge/rtc/webNg/RtcEngine.ts +++ b/template/bridge/rtc/webNg/RtcEngine.ts @@ -32,6 +32,8 @@ import type { Subscription, } from 'react-native-agora/lib/typescript/src/common/RtcEvents'; +import {RecordTranscribe} from '@soniox/speech-to-text-web'; + import {IRtcEngine} from 'react-native-agora'; import {VideoProfile} from '../quality'; import {ChannelProfileType, ClientRoleType} from '../../../agora-rn-uikit'; @@ -222,6 +224,7 @@ export default class RtcEngine { // public AgoraRTC: any; public client: IAgoraRTCClient; public screenClient: any | IAgoraRTCClient; + public eventsMap = new Map([ ['onUserJoined', () => null], ['onUserOffline', () => null], @@ -232,10 +235,13 @@ export default class RtcEngine { ['onNetworkQuality', () => null], ['onActiveSpeaker', () => null], ['onStreamMessage', () => null], + ['onSonioxTranscriptionResult', () => null], ]); + public localStream: LocalStream = {}; public screenStream: ScreenStream = {}; public remoteStreams = new Map(); + public isSonioxPanelOpen = false; private inScreenshare: Boolean = false; private videoProfile: | VideoEncoderConfigurationPreset @@ -255,13 +261,101 @@ export default class RtcEngine { private muteLocalAudioMutex = false; private speakerDeviceId = ''; private usersVolumeLevel = []; + // Create channel profile and set it here initialize(context: RtcEngineContext) { const {appId} = context; logger.log(LogSource.AgoraSDK, 'Log', 'RTC engine initialized'); this.appId = appId; + this.sonioxTranscribers = new 
Map(); + this.customEvents = new Map(); + this.localUserId = null; + } + addCustomListener(eventName: string, callback: (...args: any[]) => void) { + this.customEvents.set(eventName, callback); + } + + removeCustomListener(eventName: string) { + this.customEvents.delete(eventName); + } + + async startSonioxTranscription(uid: UID, apiKey: string, isLocal: boolean) { + let stream: MediaStream | null = null; + + // Select local or remote stream + if (isLocal) { + this.localUserId = uid; + if (!this.localStream?.audio) { + console.log('No local audio stream available', uid); + return; + } else { + stream = new MediaStream([ + this.localStream.audio.getMediaStreamTrack(), + ]); + } + } else { + const remoteAudio = this.remoteStreams.get(uid)?.audio; + if (!remoteAudio) { + console.warn(`No remote audio stream found for UID ${uid}`); + return; + } else { + stream = new MediaStream([remoteAudio.getMediaStreamTrack()]); + } + } + + // Create a new transcriber instance + const transcriber = new RecordTranscribe({apiKey}); + + // Start transcription for the single stream + await transcriber.start({ + model: 'stt-rt-preview', + stream, + languageHints: ['en'], + sampleRate: 48000, + numChannels: 1, + enableLanguageIdentification: false, + enableEndpointDetection: false, + // translation: { + // type: 'one_way', + // source_languages: ['en'], + // target_language: 'hi', + // }, + onPartialResult: results => { + const callback = this.customEvents.get('onSonioxTranscriptionResult'); + if (callback) callback(uid, {uid, ...results}); + }, + onError: (status, message, code) => { + console.error( + `Soniox Transcription Error (${uid}):`, + status, + message, + code, + ); + }, + onStarted: () => { + console.log(`Soniox started transcription for UID: ${uid}`); + }, + onStateChange: ({oldState, newState}) => { + console.log(`Soniox state (${uid}): ${oldState} → ${newState}`); + }, + onFinished: () => { + console.log(` Soniox transcription session finished for UID: ${uid}`); + }, + }); 
+ + // Track this transcriber + this.sonioxTranscribers.set(uid, transcriber); } + + stopSonioxTranscription(): void { + for (const [uid, transcriber] of this.sonioxTranscribers.entries()) { + transcriber.stop(); + console.log(` Stopped Soniox transcription for user UID: ${uid}`); + } + this.sonioxTranscribers.clear(); + } + getLocalVideoStats() { try { logger.log( @@ -771,6 +865,14 @@ export default class RtcEngine { 0, 0, ); + // Only start transcriber if panel is open & not already started + if (this.isSonioxPanelOpen && !this.sonioxTranscribers.has(user.uid)) { + this.startSonioxTranscription( + user.uid, + $config.SONIOX_API_KEY, + false, + ); + } } else { const videoTrack = user.videoTrack; // Play the video @@ -1022,6 +1124,14 @@ export default class RtcEngine { // Release the lock once done this.muteLocalAudioMutex = false; this.isAudioEnabled = !muted; + + // Stop/ Start Local Transcriber on local mute/unmute + const transcriber = this.sonioxTranscribers.get(this.localUserId); + if (muted) { + await transcriber?.stop(); + } else { + await transcriber?.start(transcriber._audioOptions); + } // Unpublish only after when the user has joined the call if (!muted && !this.isAudioPublished && this.isJoined) { logger.log( diff --git a/template/package.json b/template/package.json index 5d5d8136d..9099b9d72 100644 --- a/template/package.json +++ b/template/package.json @@ -56,6 +56,7 @@ "@react-native-async-storage/async-storage": "1.19.2", "@react-native-community/checkbox": "0.5.16", "@react-native-community/clipboard": "1.5.1", + "@soniox/speech-to-text-web": "^1.1.4", "@splinetool/runtime": "^1.9.69", "@supersami/rn-foreground-service": "^1.1.1", "add": "^2.0.6", diff --git a/template/src/pages/video-call/VideoCallScreen.tsx b/template/src/pages/video-call/VideoCallScreen.tsx index 4093cc44e..74d55119b 100644 --- a/template/src/pages/video-call/VideoCallScreen.tsx +++ b/template/src/pages/video-call/VideoCallScreen.tsx @@ -55,6 +55,7 @@ import {useIsRecordingBot} 
from '../../subComponents/recording/useIsRecordingBot import {ToolbarPresetProps} from '../../atoms/ToolbarPreset'; import CustomSidePanelView from '../../components/CustomSidePanel'; import {useControlPermissionMatrix} from '../../components/controls/useControlPermissionMatrix'; +import SonixCaptionContainer from '../../subComponents/caption/SonixCaptionContainer'; const VideoCallScreen = () => { useFindActiveSpeaker(); @@ -454,6 +455,7 @@ const VideoCallScreen = () => { /> ) : ( <> + {isCaptionON ? : <>} {isCaptionON ? : <>} { + const date = new Date(timestamp); + return date.toLocaleTimeString([], { + hour: '2-digit', + minute: '2-digit', + hour12: true, + }); +}; + +const SonixCaptionContainer = () => { + const {RtcEngineUnsafe} = useRtc(); + const {defaultContent, activeUids} = useContent(); + const localUid = useLocalUid(); + const {captionFeed, setCaptionFeed} = useCaption(); + const scrollRef = React.useRef(null); + const queueRef = React.useRef(new PQueue({concurrency: 1})); + const [autoScroll, setAutoScroll] = useState(true); + + // in-progress captions per speaker now + const activeCaptionsRef = useRef({}); + + const engine = RtcEngineUnsafe; + + useEffect(() => { + engine.isSonioxPanelOpen = true; + + engine.addCustomListener( + 'onSonioxTranscriptionResult', + sonixCaptionCallback, + ); + + activeUids.map(uid => { + engine.startSonioxTranscription( + uid, + $config.SONIOX_API_KEY, + uid === localUid, + ); + }); + + return () => { + engine.isSonioxPanelOpen = false; + engine.stopSonioxTranscription(); + }; + }, []); + + const sonixCaptionCallback = (uid, transcript) => { + const queueCallback = () => { + console.log('sonix transcript =>', uid, transcript); + + const finalText = transcript.tokens + .filter(t => t.is_final) + .map(t => t.text) + .join(''); + const nonFinalText = transcript.tokens + .filter(t => !t.is_final) + .map(t => t.text) + .join(''); + + // merge into in-progress buffer + const active = activeCaptionsRef.current[uid] || { + uid, + 
text: '', + nonFinal: '', + time: Date.now(), + }; + + if (finalText) { + active.text = (active.text + ' ' + finalText).trim(); + } + active.nonFinal = nonFinalText; + active.time = Date.now(); + activeCaptionsRef.current[uid] = active; + + // If fully finalized, commit to feed + remove from active buffer + if (!nonFinalText && finalText) { + setCaptionFeed(prev => [...prev, {...active, nonFinal: ''}]); + delete activeCaptionsRef.current[uid]; + } else { + // partial update: force rerender by setting dummy feed (not needed in your hook-based context) + setCaptionFeed(prev => [...prev]); // triggers UI refresh + } + }; + + queueRef.current.add(queueCallback); + }; + + const handleScroll = event => { + const {layoutMeasurement, contentOffset, contentSize} = event.nativeEvent; + const isAtBottom = + layoutMeasurement.height + contentOffset.y >= contentSize.height - 20; + setAutoScroll(isAtBottom); + }; + + return ( + { + if (autoScroll) { + scrollRef.current?.scrollToEnd({animated: true}); + } + }}> + {/* Show committed lines */} + {captionFeed.map((entry, index) => ( + + + {defaultContent[entry.uid]?.name} ({formatTime(entry.time)}) : + + {entry.text} + + ))} + + {/* Show all active speakers */} + {Object.values(activeCaptionsRef.current) + .filter(entry => entry.text || entry.nonFinal) + .map((entry, index) => ( + + + {defaultContent[entry.uid]?.name} ({formatTime(entry.time)}) : + + {entry.text} + {entry.nonFinal && ( + {entry.nonFinal} + )} + + ))} + + ); +}; + +export default SonixCaptionContainer; + +const styles = StyleSheet.create({ + scrollContainer: { + maxHeight: CAPTION_CONTAINER_HEIGHT, + height: CAPTION_CONTAINER_HEIGHT, + backgroundColor: '#815f46', + borderRadius: ThemeConfig.BorderRadius.small, + marginTop: $config.ICON_TEXT ? 
8 : 0, + overflowY: 'scroll', + }, + container: { + padding: 12, + flexGrow: 1, + }, + captionLine: { + flexDirection: 'row', + flexWrap: 'wrap', + marginBottom: 4, + flexShrink: 1, + lineHeight: 24, + }, + uid: { + color: 'orange', + fontWeight: 'bold', + fontSize: 18, + lineHeight: 24, + }, + content: { + color: 'white', + fontSize: 18, + flexShrink: 1, + lineHeight: 24, + }, + live: { + color: 'skyblue', + fontSize: 18, + lineHeight: 24, + }, +}); diff --git a/template/src/subComponents/caption/useCaption.tsx b/template/src/subComponents/caption/useCaption.tsx index a924d4d2c..70602f845 100644 --- a/template/src/subComponents/caption/useCaption.tsx +++ b/template/src/subComponents/caption/useCaption.tsx @@ -50,6 +50,10 @@ export const CaptionContext = React.createContext<{ activeSpeakerRef: React.MutableRefObject; prevSpeakerRef: React.MutableRefObject; + sonixCaptions: Object; + setSonixCaptions: React.Dispatch>; + captionFeed: Object; + setCaptionFeed: React.Dispatch>; }>({ isCaptionON: false, setIsCaptionON: () => {}, @@ -69,6 +73,10 @@ export const CaptionContext = React.createContext<{ setIsSTTListenerAdded: () => {}, activeSpeakerRef: {current: ''}, prevSpeakerRef: {current: ''}, + sonixCaptions: {}, + setSonixCaptions: () => {}, + captionFeed: {}, + setCaptionFeed: () => {}, }); interface CaptionProviderProps { @@ -98,6 +106,17 @@ const CaptionProvider: React.FC = ({ const activeSpeakerRef = React.useRef(''); const prevSpeakerRef = React.useRef(''); + const [sonixCaptions, setSonixCaptions] = React.useState< + Record + >({}); + const [captionFeed, setCaptionFeed] = React.useState< + { + uid: string; + text: string; // finalized text + nonFinal?: string; // optional, shows live tokens + time: number; + }[] + >([]); return ( = ({ setIsSTTListenerAdded, activeSpeakerRef, prevSpeakerRef, + sonixCaptions, + setSonixCaptions, + captionFeed, + setCaptionFeed, }}> {children}