Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,5 @@ ffmprobe*
ffplay*
debug
exp_out
.gradio
.gradio
venv/
6 changes: 4 additions & 2 deletions musetalk/utils/blending.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,8 @@ def get_image(image, face, face_box, upper_boundary_ratio=0.5, expand=1.5, mode=


# Apply a Gaussian blur to the mask so that its edges blend more smoothly
blur_kernel_size = int(0.05 * ori_shape[0] // 2 * 2) + 1 # 计算模糊核大小
# Optimized blur kernel at 0.15 - balances smooth edges without affecting lip stability
blur_kernel_size = int(0.15 * ori_shape[0] // 2 * 2) + 1 # 计算模糊核大小
mask_array = cv2.GaussianBlur(np.array(modified_mask_image), (blur_kernel_size, blur_kernel_size), 0) # 高斯模糊
#mask_array = np.array(modified_mask_image)
mask_image = Image.fromarray(mask_array) # 将模糊后的掩码转换回 PIL 图像
Expand Down Expand Up @@ -131,6 +132,7 @@ def get_image_prepare_material(image, face_box, upper_boundary_ratio=0.5, expand
modified_mask_image = Image.new('L', ori_shape, 0)
modified_mask_image.paste(mask_image.crop((0, top_boundary, width, height)), (0, top_boundary))

blur_kernel_size = int(0.1 * ori_shape[0] // 2 * 2) + 1
# Optimized blur kernel at 0.15 - balances smooth edges without affecting lip stability
blur_kernel_size = int(0.15 * ori_shape[0] // 2 * 2) + 1
mask_array = cv2.GaussianBlur(np.array(modified_mask_image), (blur_kernel_size, blur_kernel_size), 0)
return mask_array, crop_box
35 changes: 28 additions & 7 deletions scripts/realtime_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,9 +181,9 @@ def prepare_material(self):
latents = vae.get_latents_for_unet(resized_crop_frame)
input_latent_list.append(latents)

self.frame_list_cycle = frame_list + frame_list[::-1]
self.coord_list_cycle = coord_list + coord_list[::-1]
self.input_latent_list_cycle = input_latent_list + input_latent_list[::-1]
self.frame_list_cycle = frame_list
self.coord_list_cycle = coord_list
self.input_latent_list_cycle = input_latent_list
self.mask_coords_list_cycle = []
self.mask_list_cycle = []

Expand Down Expand Up @@ -211,6 +211,9 @@ def prepare_material(self):

def process_frames(self, res_frame_queue, video_len, skip_save_images):
print(video_len)
prev_combine_frame = None # Store previous frame for temporal smoothing
temporal_alpha = 0.3 # Smoothing factor: 0.3 means 30% previous + 70% current

while True:
if self.idx >= video_len - 1:
break
Expand All @@ -224,13 +227,30 @@ def process_frames(self, res_frame_queue, video_len, skip_save_images):
ori_frame = copy.deepcopy(self.frame_list_cycle[self.idx % (len(self.frame_list_cycle))])
x1, y1, x2, y2 = bbox
try:
res_frame = cv2.resize(res_frame.astype(np.uint8), (x2 - x1, y2 - y1))
# Use LANCZOS4 for higher quality upscaling of lip-sync region
res_frame = cv2.resize(res_frame.astype(np.uint8), (x2 - x1, y2 - y1), interpolation=cv2.INTER_LANCZOS4)
except:
continue
mask = self.mask_list_cycle[self.idx % (len(self.mask_list_cycle))]
mask_crop_box = self.mask_coords_list_cycle[self.idx % (len(self.mask_coords_list_cycle))]
combine_frame = get_image_blending(ori_frame,res_frame,bbox,mask,mask_crop_box)

# Apply sharpening to the lip-sync region to match rest of video
# Extract the blended region and apply unsharp mask
lip_region = combine_frame[y1:y2, x1:x2]
gaussian = cv2.GaussianBlur(lip_region, (0, 0), 2.0)
sharpened_lip = cv2.addWeighted(lip_region, 1.5, gaussian, -0.5, 0)
combine_frame[y1:y2, x1:x2] = sharpened_lip

# Apply temporal smoothing to reduce stuttering/jitter
if prev_combine_frame is not None:
combine_frame = cv2.addWeighted(
prev_combine_frame, temporal_alpha,
combine_frame, 1 - temporal_alpha,
0
)
prev_combine_frame = combine_frame.copy()

if skip_save_images is False:
cv2.imwrite(f"{self.avatar_path}/tmp/{str(self.idx).zfill(8)}.png", combine_frame)
self.idx = self.idx + 1
Expand Down Expand Up @@ -292,13 +312,14 @@ def inference(self, audio_path, out_vid_name, fps, skip_save_images):
time.time() - start_time))

if out_vid_name is not None and args.skip_save_images is False:
# optional
cmd_img2video = f"ffmpeg -y -v warning -r {fps} -f image2 -i {self.avatar_path}/tmp/%08d.png -vcodec libx264 -vf format=yuv420p -crf 18 {self.avatar_path}/temp.mp4"
# optional - using CRF 15 for higher quality lip-sync preservation
cmd_img2video = f"ffmpeg -y -v warning -r {fps} -f image2 -i {self.avatar_path}/tmp/%08d.png -vcodec libx264 -vf format=yuv420p -crf 15 {self.avatar_path}/temp.mp4"
print(cmd_img2video)
os.system(cmd_img2video)

output_vid = os.path.join(self.video_out_path, out_vid_name + ".mp4") # on
cmd_combine_audio = f"ffmpeg -y -v warning -i {audio_path} -i {self.avatar_path}/temp.mp4 {output_vid}"
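# Mux the audio and pin the output frame rate to {fps} so it matches the generated frames.
# NOTE(review): this re-encodes temp.mp4 (already libx264 at CRF 15) a second time at CRF 18;
# a second lossy pass cannot add quality - confirm the re-encode is intended.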
cmd_combine_audio = f"ffmpeg -y -v warning -i {audio_path} -i {self.avatar_path}/temp.mp4 -c:v libx264 -preset slow -crf 18 -r {fps} -c:a aac -b:a 192k {output_vid}"
print(cmd_combine_audio)
os.system(cmd_combine_audio)

Expand Down