TMElyralab · JiiT-Tech · Mar 14, 2025 · Mar 14, 2025 · Apr 22, 2025 · Apr 28, 2025
diff --git a/.gitignore b/.gitignore
@@ -5,6 +5,21 @@
 *.pyc
 .ipynb_checkpoints
 results/
+data/audio/*.wav
+data/video/*.mp4
+data/video/*.mov
+data/video/ma/*.mp4
+avatars.zip
+avatars/
+avator_1_skipImage.zip
+avator_1_skipImage/
+cuda_11.7.0_515.43.04_linux.run
+eaudio --start
+ffmpeg-7.0.2-amd64-static/
+ffmpeg-release-amd64-static.tar.xz
+musetalk.zip
+scripts.zip
+venv_PhotonSync/
 models/
 **/__pycache__/
 *.py[cod]
@@ -15,4 +30,4 @@ ffmprobe*
 ffplay*
 debug
 exp_out
-.gradio
+.gradio
diff --git a/PhotonSync_Diagram.md b/PhotonSync_Diagram.md
@@ -0,0 +1,52 @@
+graph TD
+    %% Define Styles
+    classDef process fill:#cde4ff,stroke:#6699ff,stroke-width:2px;
+    classDef data fill:#e6ffcc,stroke:#99cc66,stroke-width:2px,rx:10px,ry:10px;
+    classDef component fill:#fff2cc,stroke:#ffcc66,stroke-width:2px;
+    classDef io fill:#f2f2f2,stroke:#333,stroke-width:2px,stroke-dasharray: 5 5;
+    classDef hardware fill:#e0e0e0,stroke:#666,stroke-width:2px,rx:5px,ry:5px;
+    classDef title fill:#ffffff,stroke:#ffffff,font-weight:bold,font-size:18px;
+
+    %% One-Time Preparation Phase
+    prep_title("One-Time Preparation<br>(一次性素材准备)"):::title
+    prep_video(Input Video/Images <br> 输入视频/图像) ==> prep_frames(Extract Frames <br> 提取帧)
+    prep_frames --> prep_landmark(Get Face BBox & Landmarks <br> 获取人脸框和关键点)
+    prep_landmark -- Face Coords (人脸坐标) --> avatar_data
+    prep_landmark -- Cropped Face (裁剪的人脸) --> prep_vae(VAE Encoder <br> VAE编码器)
+    prep_landmark -- Full Frame (完整帧) --> prep_parse(Face Parsing <br> 人脸解析)
+    prep_vae --> prep_latents(Latent Vectors <br> 潜向量)
+    prep_parse --> prep_masks(Blending Masks <br> 融合蒙版)
+    prep_latents & prep_masks --> avatar_data(<b>Avatar Data Storage</b> <br> <b>虚拟人数据存储</b><br>Frames, Coords, Latents, Masks)
+
+    %% Sender Application Phase
+    sender_title("PhotonSync Sender Application<br>(发送端应用)"):::title
+    photon_gpt[PhotonGPT Audio Input <br> PhotonGPT音频输入] --> audio_proc(Audio Feature Extraction <br> 音频特征提取<br><i>Whisper</i>)
+    photon_gpt --> audio_enc(Audio Encoding <br> 音频编码<br><i>GStreamerAudio / opusenc</i>)
+
+    audio_proc -- Audio Features (音频特征) --> rt_unet
+    avatar_data -- Pre-calculated Latents (预计算潜向量) --> rt_unet
+
+    rt_unet(<b>UNet Inference</b><br><b>UNet推理</b><br>Generate Lip-Synced Latents<br>生成口型同步的潜向量) --> rt_vae(<b>VAE Decoder</b><br><b>VAE解码器</b><br>Latents to Image Frame<br>潜向量转图像帧)
+    rt_vae -- Generated Face Frame (生成的面部帧) --> rt_blend
+
+    avatar_data -- Original Frame, Mask, Coords (原始帧、蒙版、坐标) --> rt_blend
+    rt_blend(<b>Real-time Blending</b><br><b>实时融合</b><br>Combine face and background<br>合并面部与背景) -- Final Video Frame (最终视频帧) --> video_enc(Video Encoding<br>视频编码<br><i>GStreamerPipeline / nvh264enc</i>)
+
+    video_enc -- H.264 RTP Stream --> network((Network <br> 网络))
+    audio_enc -- Opus RTP Stream --> network
+
+    %% Receiver Phase
+    receiver_title("Holobot Receiver<br>(接收端)"):::title
+    network -- Video Stream (视频流) --> vid_receiver(Video UDP Source <br> 视频UDP源)
+    network -- Audio Stream (音频流) --> aud_receiver(Audio UDP Source <br> 音频UDP源)
+
+    vid_receiver --> vid_jitter(Video Jitter Buffer<br>视频抖动缓冲)
+    vid_jitter --> vid_depay(Video RTP Depayload<br>视频RTP解包)
+    vid_depay --> vid_parse(H.264 Parse<br>H.264解析)
+    vid_parse --> vid_dec(NVDEC Decode<br>NVDEC解码<br><i>GPU</i>)
+    vid_dec --> vid_sink(Video Sink<br>视频接收器<br><i>d3d11videosink</i>)
+
+    aud_receiver --> aud_jitter(Audio Jitter Buffer<br>音频抖动缓冲)
+    aud_jitter --> aud_depay(Audio RTP Depayload<br>音频RTP解包)
+    aud_depay --> aud_parse(Opus Parse<br>Opus解析)
+    aud_parse --> aud_dec(Opus Decode<br>Opus解码) --> aud_sink(Audio Sink<br>音频接收器<br><i>wasapisink</i>)
diff --git a/Photonsync.ps1 b/Photonsync.ps1
@@ -0,0 +1,2 @@
+python -m scripts.realtime_stream_gst_15 --inference_config configs/inference/realtime.yaml  --skip_save_images
+
diff --git a/configs/inference/realtime-stable.yaml b/configs/inference/realtime-stable.yaml
@@ -0,0 +1,7 @@
+avator_1:
+ preparation: False
+ bbox_shift: 5
+ video_path: "data/video/sun.mp4"
+ audio_clips:
+     audio_0: "data/audio/sun.wav"
+     audio_1: "data/audio/yongen.wav"
diff --git a/configs/inference/realtime.yaml b/configs/inference/realtime.yaml
@@ -1,10 +1,8 @@
-avator_1:
- preparation: True # your can set it to False if you want to use the existing avator, it will save time
- bbox_shift: 5
- video_path: "data/video/yongen.mp4"
+avatar_3:
+ preparation: False
+ bbox_shift: 0
+ batch_size: 16  
+ video_path: "data/video/aiden-glasses-processed.mp4"
  audio_clips:
-     audio_0: "data/audio/yongen.wav"
-     audio_1: "data/audio/eng.wav"
-
-
-
+     audio_0: "data/audio/sun.wav"
+     audio_1: "data/audio/yongen.wav"
diff --git a/data/video/musk.png b/data/video/musk.png
diff --git a/data/video/younglook.7z b/data/video/younglook.7z
diff --git a/download-gi-deps.py b/download-gi-deps.py
@@ -0,0 +1,40 @@
+import os
+import urllib.request
+import zipfile
+import shutil
+
+# URLs for missing dependencies
+dll_sources = {
+    "z.dll": "https://github.com/winlibs/zlib/releases/download/zlib-1.3/zlib-1.3-msvc-x64.zip",
+    "intl-8.dll": "https://github.com/mlocati/gettext-iconv-windows/releases/download/v0.21-v1.16/gettext0.21-iconv1.16-static-64.zip"
+}
+
+download_dir = "gtk_deps_download"
+os.makedirs(download_dir, exist_ok=True)
+
+# Download and extract dependencies
+for dll_name, url in dll_sources.items():
+    zip_path = os.path.join(download_dir, f"{dll_name}.zip")
+    print(f"Downloading {url}")
+    urllib.request.urlretrieve(url, zip_path)
+
+    print(f"Extracting {zip_path}")
+    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
+        zip_ref.extractall(download_dir)
+
+# Copy DLLs to GTK bin directory
+gtk_bin = r'C:\gtk\bin'
+for root, dirs, files in os.walk(download_dir):
+    for file in files:
+        if file.lower().endswith('.dll'):
+            source = os.path.join(root, file)
+            dest = os.path.join(gtk_bin, file)
+            print(f"Copying {source} to {dest}")
+            shutil.copy2(source, dest)
+
+            # Also create lib* version
+            lib_dest = os.path.join(gtk_bin, f"lib{file}")
+            print(f"Creating lib version at {lib_dest}")
+            shutil.copy2(source, lib_dest)
+
+print("Done installing dependencies!")
diff --git a/find -dll-dep.py b/find -dll-dep.py
@@ -0,0 +1,36 @@
+import os
+import sys
+import ctypes
+
+# Add the OpenCV bin directory to the DLL search path
+opencv_bin = r"D:\tencent\devel\cv\opencv-4.5.5\build\install\x64\vc16\bin"
+os.add_dll_directory(opencv_bin)
+
+# Add the directory containing the cv2.pyd file to the Python path
+opencv_pyd_dir = r"D:\tencent\devel\cv\opencv-4.5.5\build\lib\python3\Release"
+sys.path.insert(0, opencv_pyd_dir)
+
+# Pre-load only the essential DLLs (skip highgui)
+essential_dlls = [
+    "opencv_core455.dll", 
+    "opencv_imgproc455.dll",
+    "opencv_imgcodecs455.dll",
+    "opencv_videoio455.dll",
+    "opencv_flann455.dll",
+    "opencv_features2d455.dll"
+]
+
+for dll in essential_dlls:
+    try:
+        dll_path = os.path.join(opencv_bin, dll)
+        ctypes.CDLL(dll_path)
+        print(f"Successfully pre-loaded {dll}")
+    except Exception as e:
+        print(f"Failed to load {dll}: {e}")
+
+# Now try importing cv2
+try:
+    import cv2
+    print(f"\nSuccess! OpenCV version: {cv2.__version__}")
+except ImportError as e:
+    print(f"\nStill failed: {e}")
diff --git a/fix_gtk.py b/fix_gtk.py
@@ -0,0 +1,71 @@
+import os
+import sys
+import ctypes
+from ctypes import windll
+import glob
+import platform
+
+def inspect_gtk_installation():
+    print(f"\n#### GTK Installation Info ####")
+    gtk_bin = r'C:\gtk\bin'
+    gtk_lib = r'C:\gtk\lib'
+
+    # Add both directories to PATH
+    os.environ['PATH'] = f"{gtk_bin};{gtk_lib};{os.environ['PATH']}"
+    print(f"Python version: {platform.python_version()}")
+
+    # Set additional environment variables
+    os.environ['GI_TYPELIB_PATH'] = r'C:\gtk\lib\girepository-1.0'
+    print(f"GI_TYPELIB_PATH: {os.environ.get('GI_TYPELIB_PATH', 'Not set')}")
+
+    # Find all DLLs in GTK directories
+    bin_dlls = glob.glob(os.path.join(gtk_bin, "*.dll"))
+    lib_dlls = glob.glob(os.path.join(gtk_lib, "*.dll"))
+    print(f"Found {len(bin_dlls)} DLLs in {gtk_bin}")
+    print(f"Found {len(lib_dlls)} DLLs in {gtk_lib}")
+
+    # Critical DLLs that need to be loaded in the right order
+    critical_dlls = [
+        os.path.join(gtk_bin, "glib-2.0-0.dll"),
+        os.path.join(gtk_bin, "gobject-2.0-0.dll"),
+        os.path.join(gtk_bin, "gmodule-2.0-0.dll"),
+        os.path.join(gtk_bin, "girepository-1.0-1.dll"),
+        os.path.join(gtk_bin, "gio-2.0-0.dll"),
+        os.path.join(gtk_bin, "ffi-8.dll"),
+        os.path.join(gtk_bin, "z.dll"),
+        os.path.join(gtk_bin, "libintl-8.dll")
+    ]
+
+    # Try to load critical DLLs first
+    print("\n#### Loading Critical DLLs ####")
+    for dll in critical_dlls:
+        if os.path.exists(dll):
+            try:
+                windll.LoadLibrary(dll)
+                print(f"✓ Loaded {os.path.basename(dll)}")
+            except Exception as e:
+                print(f"✗ Failed loading {os.path.basename(dll)}: {e}")
+        else:
+            print(f"! Missing {os.path.basename(dll)}")
+
+    # Try importing gi
+    print("\n#### Testing PyGObject Import ####")
+    try:
+        import gi
+        print(f"✓ Successfully imported gi ({gi.__file__})")
+        return True
+    except ImportError as e:
+        print(f"✗ Failed to import gi: {e}")
+        return False
+
+if __name__ == "__main__":
+    success = inspect_gtk_installation()
+
+    if not success:
+        print("\n#### Troubleshooting Tips ####")
+        print("1. Your PyGObject installation might not be compatible with your GTK installation.")
+        print("2. Try reinstalling PyGObject with:")
+        print("   pip uninstall pygobject")
+        print("   pip install pygobject")
+        print("3. If that doesn't work, try installing from an alternate source:")
+        print("   pip install --no-binary :all: pygobject")
diff --git a/holobot-receiver.ps1 b/holobot-receiver.ps1
@@ -0,0 +1,44 @@
+<#
+.SYNOPSIS
+    Launches the GStreamer pipeline to decode and play the MuseTalk real-time A/V stream.
+
+.DESCRIPTION
+    This script uses a "zero-copy" video pipeline. The video frame is decoded on the GPU
+    and rendered directly to the screen using d3d11videosink without ever being copied
+    to system RAM, providing the lowest possible latency and highest performance.
+
+.NOTES
+    - Requires GStreamer 1.0 (with msvc_x86_64 and nvcodec packages) to be installed and in the system's PATH.
+    - Run this script from a PowerShell terminal.
+    - To stop the stream, press Ctrl+C.
+#>
+
+# --- Configuration ---
+# UDP ports the receiver listens on; they are substituted into the pipeline below.
+$videoPort = 5000
+$audioPort = 5001
+
+# --- User Feedback ---
+Write-Host "🚀 Launching GStreamer Decoder (Zero-Copy Video Pipeline)..." -ForegroundColor Green
+Write-Host "   - Listening for VIDEO on UDP port: $videoPort"
+Write-Host "   - Listening for AUDIO on UDP port: $audioPort"
+Write-Host "   (Press Ctrl+C to stop the stream)"
+Write-Host ""
+
+
+# --- Launch GStreamer Pipeline ---
+# A single gst-launch-1.0 invocation carries two independent branches. Every
+# line but the last ends in a backtick (PowerShell line continuation), so no
+# comment or blank line may be inserted inside the command. The line holding
+# only a backtick separates the branches while keeping the command continued.
+#
+# Video: udpsrc (RTP/H.264, payload 96) -> rtpjitterbuffer (200 ms) -> queue ->
+#        rtph264depay -> h264parse -> nvh264dec -> d3d11videosink.
+#        max-lateness is given in nanoseconds (200000000 ns = 200 ms).
+# Audio: udpsrc (RTP/Opus, payload 97) -> rtpjitterbuffer (375 ms, with
+#        drop-on-latency) -> rtpopusdepay -> opusdec (plc=true enables packet
+#        loss concealment) -> audioconvert -> audioresample -> wasapisink.
+# NOTE(review): the two jitter buffers use different latencies (200 ms vs
+#        375 ms) -- presumably a deliberate A/V offset; confirm.
+# NOTE(review): nvh264dec is an NVDEC/CUDA decoder while the sink is
+#        Direct3D 11; verify that the negotiated caps really avoid a copy
+#        through system memory, as the "zero-copy" description above claims.
+gst-launch-1.0 -v `
+    udpsrc port=$videoPort caps="application/x-rtp, media=video, clock-rate=90000, encoding-name=H264, payload=96" `
+        ! rtpjitterbuffer latency=200 `
+        ! queue `
+        ! rtph264depay `
+        ! h264parse `
+        ! nvh264dec `
+        ! d3d11videosink sync=true qos=true max-lateness=200000000 `
+`
+    udpsrc port=$audioPort caps="application/x-rtp, media=audio, clock-rate=48000, encoding-name=OPUS, payload=97" `
+        ! rtpjitterbuffer latency=375 drop-on-latency=true `
+        ! rtpopusdepay `
+        ! opusdec plc=true `
+        ! audioconvert `
+        ! audioresample `
+        ! wasapisink sync=true
diff --git a/musetalk/utils/blending.py b/musetalk/utils/blending.py
diff --git a/musetalk/utils/face_parsing/resnet.py b/musetalk/utils/face_parsing/resnet.py
@@ -80,7 +80,7 @@ def forward(self, x):
         return feat8, feat16, feat32
 
     def init_weight(self, model_path):
-        state_dict = torch.load(model_path) #modelzoo.load_url(resnet18_url)
+        state_dict = torch.load(model_path, weights_only=False) #modelzoo.load_url(resnet18_url)
         self_state_dict = self.state_dict()
         for k, v in state_dict.items():
             if 'fc' in k: continue

diff --git a/requirements-pip.txt b/requirements-pip.txt
diff --git a/requirements.txt b/requirements.txt
@@ -1,8 +1,8 @@
 diffusers==0.30.2
 accelerate==0.28.0
-numpy==1.23.5
-tensorflow==2.12.0
-tensorboard==2.12.0
+numpy
+tensorflow
+tensorboard
 opencv-python==4.9.0.80
 soundfile==0.12.1
 transformers==4.39.2

diff --git a/scripts/.env b/scripts/.env
@@ -0,0 +1,27 @@
+# .env file for realtime_stream_sync watcher
+
+# --- Required ---
+# Full path to the audio file that PhotonGPT writes (the variable name says WAV, but the example below points to an .opus file)
+#WATCHED_WAV_FILE_PATH=E:\devel\NanoAR\HoloBot\PhotonGPT\latest_response.opus
+
+# Path to the avatar configuration YAML file used by realtime_stream_sync
+AVATAR_CONFIG_PATH=configs/inference/realtime.yaml # Or your actual path
+
+# The specific Avatar ID from the config file to use for lipsyncing
+AVATAR_ID_TO_USE=avatar_3 # Replace with your actual avatar ID from the YAML
+
+# --- Optional (Defaults will be used if not set) ---
+# Target FPS for the GStreamer output (should match video source)
+TARGET_FPS=25
+
+# Frame skipping parameters - lower values = more aggressive
+FRAME_SKIP_THRESHOLD=1000
+OVERLOAD_MULTIPLIER=10.0
+MAX_FRAMES_TO_SKIP=0
+
+#FRAME_SKIP_THRESHOLD=2
+#OVERLOAD_MULTIPLIER=1.2
+#MAX_FRAMES_TO_SKIP=1
+
+STREAM_PIPE_PATH=D:\photon_audio.opus
+GSTREAMER_LAUNCH_PATH=C:\gstreamer\1.0\msvc_x86_64\bin\gst-launch-1.0.exe
diff --git a/scripts/Downloads - Shortcut.lnk b/scripts/Downloads - Shortcut.lnk
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		python -m scripts.realtime_stream_gst_15 --inference_config configs/inference/realtime.yaml --skip_save_images