From 405145f8e5f5db1440d3f38671b2d294cfad4e39 Mon Sep 17 00:00:00 2001 From: Dencel-CleverAI Date: Fri, 19 Dec 2025 19:38:58 +0100 Subject: [PATCH 1/7] Changed order of spatial and temporal upscaling --- wgp.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/wgp.py b/wgp.py index 02d8e6687..d12686bc5 100644 --- a/wgp.py +++ b/wgp.py @@ -4580,17 +4580,16 @@ def edit_video( sample = sample.float().div_(127.5).sub_(1.).permute(-1,0,1,2) frames_count = sample.shape[1] + if len(spatial_upsampling) > 0: + sample = perform_spatial_upsampling(sample, spatial_upsampling ) + configs["spatial_upsampling"] = spatial_upsampling + output_fps = round(fps) if len(temporal_upsampling) > 0: sample, previous_last_frame, output_fps = perform_temporal_upsampling(sample, None, temporal_upsampling, fps) configs["temporal_upsampling"] = temporal_upsampling frames_count = sample.shape[1] - - if len(spatial_upsampling) > 0: - sample = perform_spatial_upsampling(sample, spatial_upsampling ) - configs["spatial_upsampling"] = spatial_upsampling - if film_grain_intensity > 0: from postprocessing.film_grain import add_film_grain sample = add_film_grain(sample, film_grain_intensity, film_grain_saturation) @@ -5860,16 +5859,18 @@ def set_header_text(txt): if len(temporal_upsampling) > 0 or len(spatial_upsampling) > 0 and not "vae2" in spatial_upsampling: send_cmd("progress", [0, get_latest_status(state,"Upsampling")]) + + if len(spatial_upsampling) > 0: + sample = perform_spatial_upsampling(sample, spatial_upsampling) output_fps = fps if len(temporal_upsampling) > 0: sample, previous_last_frame, output_fps = perform_temporal_upsampling(sample, previous_last_frame if sliding_window and window_no > 1 else None, temporal_upsampling, fps) - - if len(spatial_upsampling) > 0: - sample = perform_spatial_upsampling(sample, spatial_upsampling ) + if film_grain_intensity> 0: from postprocessing.film_grain import add_film_grain sample = add_film_grain(sample, film_grain_intensity, film_grain_saturation) + if sliding_window : if frames_already_processed == None: frames_already_processed = sample From a295706d7008a6375441bb0e0e7ec3beb5d0f955 Mon Sep 17 00:00:00 2001 From: Dencel-CleverAI Date: Sat, 20 Dec 2025 17:29:13 +0100 Subject: [PATCH 2/7] Added 20 FPS for Model Default --- wgp.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/wgp.py b/wgp.py index d12686bc5..3ffcbefaf 100644 --- a/wgp.py +++ b/wgp.py @@ -4483,7 +4483,7 @@ def upsample_frames(frame): return resize_lanczos(frame, h, w, method).unsqueeze(1) sample = torch.cat(process_images_multithread(upsample_frames, frames_to_upsample, "upsample", wrap_in_list = False, max_workers=get_default_workers(), in_place=True), dim=1) frames_to_upsample = None - return sample + return sample def any_audio_track(model_type): base_model_type = get_base_model_type(model_type) @@ -4581,14 +4581,14 @@ def edit_video( frames_count = sample.shape[1] if len(spatial_upsampling) > 0: - sample = perform_spatial_upsampling(sample, spatial_upsampling ) + sample = perform_spatial_upsampling(sample, spatial_upsampling) configs["spatial_upsampling"] = spatial_upsampling output_fps = round(fps) if len(temporal_upsampling) > 0: sample, previous_last_frame, output_fps = perform_temporal_upsampling(sample, None, temporal_upsampling, fps) configs["temporal_upsampling"] = temporal_upsampling - frames_count = sample.shape[1] + frames_count = sample.shape[1] if film_grain_intensity > 0: from postprocessing.film_grain import add_film_grain @@ -4604,8 +4604,8 @@ def edit_video( tmp_path = None any_change = False if sample != None: - video_path =get_available_filename(save_path, video_source, "_tmp") if any_mmaudio or has_already_audio else get_available_filename(save_path, video_source, "_post") - save_video( tensor=sample[None], save_file=video_path, fps=output_fps, nrow=1, normalize=True, value_range=(-1, 1), codec_type= server_config.get("video_output_codec", None), container=server_config.get("video_container", "mp4")) + video_path =get_available_filename(save_path, video_source, "_tmp") if any_mmaudio or has_already_audio else get_available_filename(save_path, video_source, "_post") + save_video(tensor=sample[None], save_file=video_path, fps=output_fps, nrow=1, normalize=True, value_range=(-1, 1), codec_type= server_config.get("video_output_codec", None), container=server_config.get("video_container", "mp4")) if any_mmaudio or has_already_audio: tmp_path = video_path any_change = True @@ -5837,7 +5837,6 @@ def set_header_text(txt): else: pre_video_guide = sample[:, -reuse_frames:].clone() - if prefix_video != None and window_no == 1: # remove source video overlapped frames at the beginning of the generation sample = torch.cat([ prefix_video[:, :-source_video_overlap_frames_count], sample], dim = 1) @@ -5856,21 +5855,21 @@ def set_header_text(txt): full_generated_audio = generated_audio if full_generated_audio is None else np.concatenate([full_generated_audio, generated_audio], axis=0) output_new_audio_data = full_generated_audio - if len(temporal_upsampling) > 0 or len(spatial_upsampling) > 0 and not "vae2" in spatial_upsampling: send_cmd("progress", [0, get_latest_status(state,"Upsampling")]) if len(spatial_upsampling) > 0: + # h_before, w_before = sample.shape[-2:] sample = perform_spatial_upsampling(sample, spatial_upsampling) output_fps = fps if len(temporal_upsampling) > 0: sample, previous_last_frame, output_fps = perform_temporal_upsampling(sample, previous_last_frame if sliding_window and window_no > 1 else None, temporal_upsampling, fps) - + if film_grain_intensity> 0: from postprocessing.film_grain import add_film_grain - sample = add_film_grain(sample, film_grain_intensity, film_grain_saturation) - + sample = add_film_grain(sample, film_grain_intensity, film_grain_saturation) + if sliding_window : if frames_already_processed == None: frames_already_processed = sample @@ -9271,7 +9270,8 @@ def gen_upsampling_dropdowns(temporal_upsampling, spatial_upsampling , film_grai force_fps_choices += [("Source Video fps", "source")] force_fps_choices += [ ("15", "15"), - ("16", "16"), + ("16", "16"), + ("20", "20"), ("23", "23"), ("24", "24"), ("25", "25"), From 801587f6d6f383a856c1566f5ae5638d5c9a6952 Mon Sep 17 00:00:00 2001 From: Dencel-CleverAI Date: Thu, 25 Dec 2025 20:07:24 +0100 Subject: [PATCH 3/7] Merge video after upsampling --- wgp.py | 89 ++++++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 84 insertions(+), 5 deletions(-) diff --git a/wgp.py b/wgp.py index 3ffcbefaf..d0a74a138 100644 --- a/wgp.py +++ b/wgp.py @@ -4996,6 +4996,48 @@ def remove_temp_filenames(temp_filenames_list): if temp_filename!= None and os.path.isfile(temp_filename): os.remove(temp_filename) + # --- CONTINUING VIDEO + UPSAMPLING VALIDATION CHECK --- + # Before starting generation, ensure that if we are continuing a video ("V" or "L") + # and using upsampling, the final result matches the source video resolution/FPS. + do_upsampling = len(temporal_upsampling) > 0 or len(spatial_upsampling) > 0 + if video_source and any_letters(image_prompt_type, "VL") and do_upsampling: + # 1. Determine Upscaling Multipliers + s_mult = 1.0 + if "lanczos1.5" in spatial_upsampling: s_mult = 1.5 + elif "lanczos2" in spatial_upsampling or "vae2" in spatial_upsampling: s_mult = 2.0 + elif "vae1" in spatial_upsampling: s_mult = 0.5 + + t_mult = 1 + if "rife2" in temporal_upsampling: t_mult = 2 + elif "rife4" in temporal_upsampling: t_mult = 4 + + # 2. Calculate Expected Target Properties (Model Resolution * Multiplier) + base_w, base_h = resolution.split("x") + base_w, base_h = int(base_w), int(base_h) + # Note: Model gen usually snaps to block_size (16), assume passed resolution is close to model output + expected_w = base_w * s_mult + expected_h = base_h * s_mult + + base_model_type_check = get_base_model_type(model_type) + base_fps = get_computed_fps(force_fps, base_model_type_check, video_guide, video_source) + expected_fps = base_fps * t_mult + + # 3. Get Source Properties + src_fps, src_w, src_h, _ = get_video_info(video_source) + + # 4. Compare with tolerance (16px for dims, 0.1 for FPS) + err_msg = [] + if abs(src_w - expected_w) > 16 or abs(src_h - expected_h) > 16: + err_msg.append(f"Resolution Mismatch: Source is {src_w}x{src_h}, but upscaled result will be approx {int(expected_w)}x{int(expected_h)}.") + + if abs(src_fps - expected_fps) > 0.1: + err_msg.append(f"FPS Mismatch: Source is {src_fps:.2f} fps, but upscaled result will be {expected_fps:.2f} fps.") + + if len(err_msg) > 0: + err_msg.append("Please adjust Resolution, Spatial Upsampling, Force FPS, or Temporal Upsampling settings to match the source.") + raise gr.Error("\n".join(err_msg)) + # ----------------------------------- + global wan_model, offloadobj, reload_needed gen = get_gen_info(state) torch.set_grad_enabled(False) @@ -5068,7 +5110,7 @@ def remove_temp_filenames(temp_filenames_list): return width, height = resolution.split("x") - width, height = int(width) // block_size * block_size, int(height) // block_size * block_size + width, height = int(width) // block_size * block_size, int(height) // block_size * block_size default_image_size = (height, width) if slg_switch == 0: @@ -5823,6 +5865,7 @@ def set_header_text(txt): # sample =torch.load("output.pt") if gen.get("extra_windows",0) > 0: sliding_window = True + if sliding_window : # guide_start_frame = guide_end_frame guide_start_frame += current_video_length @@ -5838,8 +5881,12 @@ def set_header_text(txt): pre_video_guide = sample[:, -reuse_frames:].clone() if prefix_video != None and window_no == 1: - # remove source video overlapped frames at the beginning of the generation - sample = torch.cat([ prefix_video[:, :-source_video_overlap_frames_count], sample], dim = 1) + # Only concatenate low-res prefix if NO continue video upscaling is performed. + # If continue video upscaling is active, we concat high-res original frames LATER. + if not (any_letters(image_prompt_type, "VL") and do_upsampling): + # remove prefix video overlapped frames at the beginning of the generation + sample = torch.cat([ prefix_video[:, :-source_video_overlap_frames_count], sample], dim = 1) + guide_start_frame -= source_video_overlap_frames_count if generated_audio is not None: generated_audio = truncate_audio(generated_audio, source_video_overlap_frames_count, 0, fps, audio_sampling_rate) @@ -5855,17 +5902,49 @@ def set_header_text(txt): full_generated_audio = generated_audio if full_generated_audio is None else np.concatenate([full_generated_audio, generated_audio], axis=0) output_new_audio_data = full_generated_audio - if len(temporal_upsampling) > 0 or len(spatial_upsampling) > 0 and not "vae2" in spatial_upsampling: + if do_upsampling and not "vae2" in spatial_upsampling: send_cmd("progress", [0, get_latest_status(state,"Upsampling")]) if len(spatial_upsampling) > 0: - # h_before, w_before = sample.shape[-2:] sample = perform_spatial_upsampling(sample, spatial_upsampling) output_fps = fps if len(temporal_upsampling) > 0: sample, previous_last_frame, output_fps = perform_temporal_upsampling(sample, previous_last_frame if sliding_window and window_no > 1 else None, temporal_upsampling, fps) + # --- MERGE WITH ORIGINAL VIDEO SOURCE IF UPSCALED --- + if any_letters(image_prompt_type, "VL") and do_upsampling: + src_fps, src_w, src_h, _ = get_video_info(video_source) + src_video = preprocess_video( + width=src_w, + height=src_h, + video_in=video_source, + max_frames=parsed_keep_frames_video_source, + start_frame=0, + fit_canvas=None, + fit_crop=False, + target_fps=src_fps, + block_size=block_size + ) + src_video = src_video.permute(3, 0, 1, 2).float().div_(127.5).sub_(1.) # c, f, h, w + + # Resize sample to match the source's resolution exactly if they differ + if src_video.shape[-2:] != sample.shape[-2:]: + # Permute to (F, C, H, W) for torch.nn.functional.interpolate + sample = sample.permute(1, 0, 2, 3) + sample = torch.nn.functional.interpolate( + sample, + size=src_video.shape[-2:], + mode='bilinear', + align_corners=False + ) + # Permute back to (C, F, H, W) + sample = sample.permute(1, 0, 2, 3) + + # remove source video overlapped frames and merge with new generated sample + sample = torch.cat([src_video[:, :-source_video_overlap_frames_count], sample], dim = 1) + # ----------------------------------------------- + if film_grain_intensity> 0: from postprocessing.film_grain import add_film_grain sample = add_film_grain(sample, film_grain_intensity, film_grain_saturation) From 2272b413bf819c5619132d2599fac7757b4f8c36 Mon Sep 17 00:00:00 2001 From: Dencel-CleverAI Date: Fri, 26 Dec 2025 03:41:00 +0100 Subject: [PATCH 4/7] Exposed Fit Canvas, refined merging upsampled video --- wgp.py | 74 ++++++++++++++++++++++++++++++++-------------------------- 1 file changed, 41 insertions(+), 33 deletions(-) diff --git a/wgp.py b/wgp.py index d0a74a138..62ca6eb93 100644 --- a/wgp.py +++ b/wgp.py @@ -4575,22 +4575,24 @@ def edit_video( if mode == "edit_postprocessing": if len(temporal_upsampling) > 0 or len(spatial_upsampling) > 0 or film_grain_intensity > 0: - send_cmd("progress", [0, get_latest_status(state,"Upsampling" if len(temporal_upsampling) > 0 or len(spatial_upsampling) > 0 else "Adding Film Grain" )]) sample = get_resampled_video(video_source, 0, max_source_video_frames, fps) sample = sample.float().div_(127.5).sub_(1.).permute(-1,0,1,2) frames_count = sample.shape[1] if len(spatial_upsampling) > 0: + send_cmd("progress", [0, get_latest_status(state,"Spatial Upsampling")]) sample = perform_spatial_upsampling(sample, spatial_upsampling) configs["spatial_upsampling"] = spatial_upsampling output_fps = round(fps) if len(temporal_upsampling) > 0: + send_cmd("progress", [0, get_latest_status(state,"Temporal Upsampling")]) sample, previous_last_frame, output_fps = perform_temporal_upsampling(sample, None, temporal_upsampling, fps) configs["temporal_upsampling"] = temporal_upsampling frames_count = sample.shape[1] if film_grain_intensity > 0: + send_cmd("progress", [0, get_latest_status(state,"Film Grain")]) from postprocessing.film_grain import add_film_grain sample = add_film_grain(sample, film_grain_intensity, film_grain_saturation) configs["film_grain_intensity"] = film_grain_intensity @@ -4598,6 +4600,7 @@ def edit_video( else: output_fps = round(fps) + send_cmd("progress", [0, get_latest_status(state,"Finalizing")]) any_mmaudio = MMAudio_setting != 0 and server_config.get("mmaudio_enabled", 0) != 0 and frames_count >=output_fps if any_mmaudio: download_mmaudio() @@ -5025,17 +5028,12 @@ def remove_temp_filenames(temp_filenames_list): # 3. Get Source Properties src_fps, src_w, src_h, _ = get_video_info(video_source) - # 4. Compare with tolerance (16px for dims, 0.1 for FPS) - err_msg = [] + # 4. Compare with tolerance (16px for resolution, 0.1 for FPS) if abs(src_w - expected_w) > 16 or abs(src_h - expected_h) > 16: - err_msg.append(f"Resolution Mismatch: Source is {src_w}x{src_h}, but upscaled result will be approx {int(expected_w)}x{int(expected_h)}.") + raise gr.Error(f"Resolution Mismatch: Source is {src_w}x{src_h}, but upscaled result will be approx {int(expected_w)}x{int(expected_h)}. Please adjust Resolution or Spatial Upsampling to match the source.") if abs(src_fps - expected_fps) > 0.1: - err_msg.append(f"FPS Mismatch: Source is {src_fps:.2f} fps, but upscaled result will be {expected_fps:.2f} fps.") - - if len(err_msg) > 0: - err_msg.append("Please adjust Resolution, Spatial Upsampling, Force FPS, or Temporal Upsampling settings to match the source.") - raise gr.Error("\n".join(err_msg)) + raise gr.Error(f"FPS Mismatch: Source is {src_fps:.2f} fps, but upscaled result will be {expected_fps:.2f} fps. Please adjust Default Model FPS or Temporal Upsampling to match the source.") # ----------------------------------- global wan_model, offloadobj, reload_needed @@ -5438,13 +5436,14 @@ def remove_temp_filenames(temp_filenames_list): image_start_tensor = convert_image_to_tensor(image_start_tensor) pre_video_guide = prefix_video = image_start_tensor.unsqueeze(1) else: - prefix_video = preprocess_video(width=width, height=height,video_in=video_source, max_frames= parsed_keep_frames_video_source , start_frame = 0, fit_canvas= sample_fit_canvas, fit_crop = fit_crop, target_fps = fps, block_size = block_size ) - prefix_video = prefix_video.permute(3, 0, 1, 2) - prefix_video = prefix_video.float().div_(127.5).sub_(1.) # c, f, h, w + prefix_video = preprocess_video(width=width, height=height,video_in=video_source, max_frames= parsed_keep_frames_video_source , start_frame = 0, fit_canvas= sample_fit_canvas, fit_crop = fit_crop, target_fps = fps, block_size = block_size ) + prefix_video = prefix_video.permute(3, 0, 1, 2) + prefix_video = prefix_video.float().div_(127.5).sub_(1.) # c, f, h, w if fit_crop or "L" in image_prompt_type: refresh_preview["video_source"] = convert_tensor_to_image(prefix_video, 0) - new_height, new_width = prefix_video.shape[-2:] - pre_video_guide = prefix_video[:, -reuse_frames:] + new_height, new_width = prefix_video.shape[-2:] + #print("Downsampled Video:", str(new_width), str(new_height)) + pre_video_guide = prefix_video[:, -reuse_frames:] pre_video_frame = convert_tensor_to_image(prefix_video[:, -1]) source_video_overlap_frames_count = pre_video_guide.shape[1] source_video_frames_count = prefix_video.shape[1] @@ -5902,19 +5901,19 @@ def set_header_text(txt): full_generated_audio = generated_audio if full_generated_audio is None else np.concatenate([full_generated_audio, generated_audio], axis=0) output_new_audio_data = full_generated_audio - if do_upsampling and not "vae2" in spatial_upsampling: - send_cmd("progress", [0, get_latest_status(state,"Upsampling")]) - if len(spatial_upsampling) > 0: + send_cmd("progress", [0, get_latest_status(state,"Spatial Upsampling")]) sample = perform_spatial_upsampling(sample, spatial_upsampling) output_fps = fps if len(temporal_upsampling) > 0: + send_cmd("progress", [0, get_latest_status(state,"Temporal Upsampling")]) sample, previous_last_frame, output_fps = perform_temporal_upsampling(sample, previous_last_frame if sliding_window and window_no > 1 else None, temporal_upsampling, fps) # --- MERGE WITH ORIGINAL VIDEO SOURCE IF UPSCALED --- if any_letters(image_prompt_type, "VL") and do_upsampling: - src_fps, src_w, src_h, _ = get_video_info(video_source) + send_cmd("progress", [0, get_latest_status(state,"Resizing Video")]) + src_fps, src_w, src_h, _ = get_video_info(video_source) src_video = preprocess_video( width=src_w, height=src_h, @@ -5927,35 +5926,42 @@ def set_header_text(txt): block_size=block_size ) src_video = src_video.permute(3, 0, 1, 2).float().div_(127.5).sub_(1.) # c, f, h, w - + #print("Source Video:", str(src_w), str(src_h)) + #print("Sample Video:", str(sample.shape[-2:])) + # Resize sample to match the source's resolution exactly if they differ - if src_video.shape[-2:] != sample.shape[-2:]: + if [src_h, src_w] != sample.shape[-2:]: # Permute to (F, C, H, W) for torch.nn.functional.interpolate sample = sample.permute(1, 0, 2, 3) sample = torch.nn.functional.interpolate( sample, - size=src_video.shape[-2:], + size=[src_h, src_w], mode='bilinear', align_corners=False ) # Permute back to (C, F, H, W) sample = sample.permute(1, 0, 2, 3) - # remove source video overlapped frames and merge with new generated sample - sample = torch.cat([src_video[:, :-source_video_overlap_frames_count], sample], dim = 1) + # last frame of source is first frame of generated sample which can be skipped + # remove source overlapped frame and merge with new generated sample + send_cmd("progress", [0, get_latest_status(state,"Merging Videos")]) + sample = torch.cat([src_video[:, :-source_video_overlap_frames_count], sample[:, 1:]], dim = 1) # ----------------------------------------------- if film_grain_intensity> 0: + send_cmd("progress", [0, get_latest_status(state,"Film Grain")]) from postprocessing.film_grain import add_film_grain sample = add_film_grain(sample, film_grain_intensity, film_grain_saturation) - if sliding_window : + if sliding_window: + send_cmd("progress", [0, get_latest_status(state,"Sliding Window")]) if frames_already_processed == None: frames_already_processed = sample else: sample = torch.cat([frames_already_processed, sample], dim=1) frames_already_processed = sample + send_cmd("progress", [0, get_latest_status(state,"Saving")]) time_flag = datetime.fromtimestamp(time.time()).strftime("%Y-%m-%d-%Hh%Mm%Ss") save_prompt = original_prompts[0] if audio_only: @@ -6027,6 +6033,7 @@ def set_header_text(txt): end_time = time.time() + send_cmd("progress", [0, get_latest_status(state,"Add Meta Data")]) inputs.pop("send_cmd") inputs.pop("task") inputs.pop("mode") @@ -8943,13 +8950,6 @@ def get_image_gallery(label ="", value = None, single_image_mode = False, visibl pace = gr.Slider( 0.2, 1, value=ui_get("pace"), step=0.01, label="Pace", show_reset_button= False) with gr.Row(visible=not audio_only) as resolution_row: - fit_canvas = server_config.get("fit_canvas", 0) - if fit_canvas == 1: - label = "Outer Box Resolution (one dimension may be less to preserve video W/H ratio)" - elif fit_canvas == 2: - label = "Output Resolution (Input Images wil be Cropped if the W/H ratio is different)" - else: - label = "Resolution Budget (Pixels will be reallocated to preserve Inputs W/H ratio)" current_resolution_choice = ui_get("resolution") if update_form or last_resolution is None else last_resolution model_resolutions = model_def.get("resolutions", None) resolution_choices, current_resolution_choice = get_resolution_choices(current_resolution_choice, model_resolutions) @@ -8962,8 +8962,16 @@ def get_image_gallery(label ="", value = None, single_image_mode = False, visibl resolution = gr.Dropdown( choices = selected_group_resolutions, value= current_resolution_choice, - label= label, - scale = 5 + label= "Format", + scale = 2 + ) + fit_canvas = gr.Dropdown( + choices=[("Resolution Budget (Pixels will be reallocated to preserve Inputs W/H ratio)", 0), + ("Outer Box Resolution (one dimension may be less to preserve video W/H ratio)", 1), + ("Output Resolution (Input Images wil be Cropped if the W/H ratio is different)", 2)], + value= server_config.get("fit_canvas", 0), + label="Fit Canvas", + scale = 3 ) with gr.Row(visible= not audio_only) as number_frames_row: batch_size = gr.Slider(1, 16, value=ui_get("batch_size"), step=1, label="Number of Images to Generate", visible = image_outputs, show_reset_button= False) From aacaa8c3bec6becaf1c3368569996b2fb66df040 Mon Sep 17 00:00:00 2001 From: Dencel-CleverAI Date: Fri, 26 Dec 2025 04:05:00 +0100 Subject: [PATCH 5/7] Fit Canvas is now interactive and its start value is used correctly --- wgp.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/wgp.py b/wgp.py index 62ca6eb93..aa19c4cf1 100644 --- a/wgp.py +++ b/wgp.py @@ -8954,6 +8954,7 @@ def get_image_gallery(label ="", value = None, single_image_mode = False, visibl model_resolutions = model_def.get("resolutions", None) resolution_choices, current_resolution_choice = get_resolution_choices(current_resolution_choice, model_resolutions) available_groups, selected_group_resolutions, selected_group = group_resolutions(model_def,resolution_choices, current_resolution_choice) + current_fit_canvas = server_config.get("fit_canvas", 0) resolution_group = gr.Dropdown( choices = available_groups, value= selected_group, @@ -8969,9 +8970,10 @@ def get_image_gallery(label ="", value = None, single_image_mode = False, visibl choices=[("Resolution Budget (Pixels will be reallocated to preserve Inputs W/H ratio)", 0), ("Outer Box Resolution (one dimension may be less to preserve video W/H ratio)", 1), ("Output Resolution (Input Images wil be Cropped if the W/H ratio is different)", 2)], - value= server_config.get("fit_canvas", 0), + value= current_fit_canvas, label="Fit Canvas", - scale = 3 + scale = 3, + interactive = True ) with gr.Row(visible= not audio_only) as number_frames_row: batch_size = gr.Slider(1, 16, value=ui_get("batch_size"), step=1, label="Number of Images to Generate", visible = image_outputs, show_reset_button= False) From bd6933988e5647bb3eac374099369db76190b2c2 Mon Sep 17 00:00:00 2001 From: Dencel-CleverAI Date: Sat, 27 Dec 2025 18:16:12 +0100 Subject: [PATCH 6/7] Saved Fit_Canvas model specific, merged videos via ffmpeg --- wgp.py | 119 +++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 73 insertions(+), 46 deletions(-) diff --git a/wgp.py b/wgp.py index aa19c4cf1..76fd014d2 100644 --- a/wgp.py +++ b/wgp.py @@ -618,7 +618,8 @@ def ret(): outpainting_dims = get_outpainting_dims(video_guide_outpainting) - if server_config.get("fit_canvas", 0) == 2 and outpainting_dims is not None and any_letters(video_prompt_type, "VKF"): + fit_canvas = inputs["fit_canvas"] + if fit_canvas == 2 and outpainting_dims is not None and any_letters(video_prompt_type, "VKF"): gr.Info("Output Resolution Cropping will be not used for this Generation as it is not compatible with Video Outpainting") if not model_def.get("motion_amplitude", False): motion_amplitude = 1. @@ -4901,6 +4902,7 @@ def generate_video( prompt, negative_prompt, resolution, + fit_canvas, video_length, batch_size, seed, @@ -4999,11 +5001,10 @@ def remove_temp_filenames(temp_filenames_list): if temp_filename!= None and os.path.isfile(temp_filename): os.remove(temp_filename) - # --- CONTINUING VIDEO + UPSAMPLING VALIDATION CHECK --- + # --- CONTINUING VIDEO VALIDATION CHECK --- # Before starting generation, ensure that if we are continuing a video ("V" or "L") - # and using upsampling, the final result matches the source video resolution/FPS. - do_upsampling = len(temporal_upsampling) > 0 or len(spatial_upsampling) > 0 - if video_source and any_letters(image_prompt_type, "VL") and do_upsampling: + # the final result matches the source video resolution/FPS. + if video_source and any_letters(image_prompt_type, "VL"): # 1. Determine Upscaling Multipliers s_mult = 1.0 if "lanczos1.5" in spatial_upsampling: s_mult = 1.5 @@ -5017,7 +5018,6 @@ def remove_temp_filenames(temp_filenames_list): # 2. Calculate Expected Target Properties (Model Resolution * Multiplier) base_w, base_h = resolution.split("x") base_w, base_h = int(base_w), int(base_h) - # Note: Model gen usually snaps to block_size (16), assume passed resolution is close to model output expected_w = base_w * s_mult expected_h = base_h * s_mult @@ -5025,15 +5025,12 @@ def remove_temp_filenames(temp_filenames_list): base_fps = get_computed_fps(force_fps, base_model_type_check, video_guide, video_source) expected_fps = base_fps * t_mult - # 3. Get Source Properties + # 3. Get Source Properties and compare with tolerance (16px for resolution, 0.1 for FPS) src_fps, src_w, src_h, _ = get_video_info(video_source) - - # 4. Compare with tolerance (16px for resolution, 0.1 for FPS) if abs(src_w - expected_w) > 16 or abs(src_h - expected_h) > 16: - raise gr.Error(f"Resolution Mismatch: Source is {src_w}x{src_h}, but upscaled result will be approx {int(expected_w)}x{int(expected_h)}. Please adjust Resolution or Spatial Upsampling to match the source.") - + raise gr.Error(f"Resolution Mismatch: Source is {src_w}x{src_h}, but result will be approx {int(expected_w)}x{int(expected_h)}. Please adjust Resolution or Spatial Upsampling to match the source.") if abs(src_fps - expected_fps) > 0.1: - raise gr.Error(f"FPS Mismatch: Source is {src_fps:.2f} fps, but upscaled result will be {expected_fps:.2f} fps. Please adjust Default Model FPS or Temporal Upsampling to match the source.") + raise gr.Error(f"FPS Mismatch: Source is {src_fps:.2f} fps, but result will be {expected_fps:.2f} fps. Please adjust Default Model FPS or Temporal Upsampling to match the source.") # ----------------------------------- global wan_model, offloadobj, reload_needed @@ -5239,7 +5236,6 @@ def remove_temp_filenames(temp_filenames_list): any_background_ref = 2 if model_def.get("all_image_refs_are_background_ref", False) else 1 outpainting_dims = get_outpainting_dims(video_guide_outpainting) - fit_canvas = server_config.get("fit_canvas", 0) fit_crop = fit_canvas == 2 if fit_crop and outpainting_dims is not None: fit_crop = False @@ -5441,8 +5437,7 @@ def remove_temp_filenames(temp_filenames_list): prefix_video = prefix_video.float().div_(127.5).sub_(1.) # c, f, h, w if fit_crop or "L" in image_prompt_type: refresh_preview["video_source"] = convert_tensor_to_image(prefix_video, 0) - new_height, new_width = prefix_video.shape[-2:] - #print("Downsampled Video:", str(new_width), str(new_height)) + new_height, new_width = prefix_video.shape[-2:] pre_video_guide = prefix_video[:, -reuse_frames:] pre_video_frame = convert_tensor_to_image(prefix_video[:, -1]) source_video_overlap_frames_count = pre_video_guide.shape[1] @@ -5880,11 +5875,10 @@ def set_header_text(txt): pre_video_guide = sample[:, -reuse_frames:].clone() if prefix_video != None and window_no == 1: - # Only concatenate low-res prefix if NO continue video upscaling is performed. - # If continue video upscaling is active, we concat high-res original frames LATER. - if not (any_letters(image_prompt_type, "VL") and do_upsampling): + # If continue (last) video is active, we concat high-res original frames LATER. + if not any_letters(image_prompt_type, "VL"): # remove prefix video overlapped frames at the beginning of the generation - sample = torch.cat([ prefix_video[:, :-source_video_overlap_frames_count], sample], dim = 1) + sample = torch.cat([prefix_video[:, :-source_video_overlap_frames_count], sample], dim = 1) guide_start_frame -= source_video_overlap_frames_count if generated_audio is not None: @@ -5910,27 +5904,13 @@ def set_header_text(txt): send_cmd("progress", [0, get_latest_status(state,"Temporal Upsampling")]) sample, previous_last_frame, output_fps = perform_temporal_upsampling(sample, previous_last_frame if sliding_window and window_no > 1 else None, temporal_upsampling, fps) - # --- MERGE WITH ORIGINAL VIDEO SOURCE IF UPSCALED --- - if any_letters(image_prompt_type, "VL") and do_upsampling: - send_cmd("progress", [0, get_latest_status(state,"Resizing Video")]) + if any_letters(image_prompt_type, "VL"): src_fps, src_w, src_h, _ = get_video_info(video_source) - src_video = preprocess_video( - width=src_w, - height=src_h, - video_in=video_source, - max_frames=parsed_keep_frames_video_source, - start_frame=0, - fit_canvas=None, - fit_crop=False, - target_fps=src_fps, - block_size=block_size - ) - src_video = src_video.permute(3, 0, 1, 2).float().div_(127.5).sub_(1.) # c, f, h, w - #print("Source Video:", str(src_w), str(src_h)) - #print("Sample Video:", str(sample.shape[-2:])) + sample_height, sample_width = sample.shape[-2:] - # Resize sample to match the source's resolution exactly if they differ - if [src_h, src_w] != sample.shape[-2:]: + # Resize sample to match the source's resolution exactly if they differ + if src_h != sample_height or src_w != sample_width: + send_cmd("progress", [0, get_latest_status(state,"Resizing Video")]) # Permute to (F, C, H, W) for torch.nn.functional.interpolate sample = sample.permute(1, 0, 2, 3) sample = torch.nn.functional.interpolate( @@ -5941,12 +5921,6 @@ def set_header_text(txt): ) # Permute back to (C, F, H, W) sample = sample.permute(1, 0, 2, 3) - - # last frame of source is first frame of generated sample which can be skipped - # remove source overlapped frame and merge with new generated sample - send_cmd("progress", [0, get_latest_status(state,"Merging Videos")]) - sample = torch.cat([src_video[:, :-source_video_overlap_frames_count], sample[:, 1:]], dim = 1) - # ----------------------------------------------- if film_grain_intensity> 0: send_cmd("progress", [0, get_latest_status(state,"Film Grain")]) @@ -6030,7 +6004,17 @@ def set_header_text(txt): else: save_video( tensor=sample[None], save_file=video_path, fps=output_fps, nrow=1, normalize=True, value_range=(-1, 1), codec_type= server_config.get("video_output_codec", None), container= container) - + + # If continue (last) video is chosen + if any_letters(image_prompt_type, "VL"): + send_cmd("progress", [0, get_latest_status(state,"Merging Videos")]) + # Add "_Preview" to the file name of the new generated sample + base, ext = os.path.splitext(video_path) + video_preview_path = base + "_Preview" + ext + os.rename(video_path, video_preview_path) + # Merge the saved video with the last one via ffmpeg, which saves a lot of RAM in comparison to torch.cat + combine_videos(video_source, video_preview_path, video_path, trim_end_frames1=source_video_overlap_frames_count, trim_start_frames2=1, fps=output_fps) + end_time = time.time() send_cmd("progress", [0, get_latest_status(state,"Add Meta Data")]) @@ -6116,6 +6100,48 @@ def set_header_text(txt): remove_temp_filenames(temp_filenames_list) +def combine_videos( + video1_path, + video2_path, + output_path, + trim_end_frames1=0, + trim_start_frames2=0, + fps=16, + vcodec='libx264', + crf=10, + preset='veryfast', + audio_bitrate='192k'): + + import ffmpeg + + # Calculate trim duration in seconds + trim_end_seconds1 = trim_end_frames1 / fps + trim_start_seconds2 = trim_start_frames2 / fps + + probe1 = ffmpeg.probe(video1_path) + probe2 = ffmpeg.probe(video2_path) + duration1 = float(probe1['streams'][0]['duration']) + + # Trim the end of the first and the beginning of the second video, then join them + input1 = ffmpeg.input(video1_path, ss=0, t=duration1 - trim_end_seconds1) + input2 = ffmpeg.input(video2_path, ss=trim_start_seconds2) + v1 = input1.video + v2 = input2.video + joined_video = ffmpeg.concat(v1, v2, v=1, a=0) + + # Check if videos have audio + has_audio1 = any(s['codec_type'] == 'audio' for s in probe1['streams']) + has_audio2 = any(s['codec_type'] == 'audio' for s in probe2['streams']) + if has_audio1 and has_audio2: + a1 = input1.audio + a2 = input2.audio + joined_audio = ffmpeg.concat(a1, a2, v=0, a=1) + output = ffmpeg.output(joined_video, joined_audio, output_path, vcodec=vcodec, crf=crf, preset=preset, audio_bitrate=audio_bitrate) + else: + output = ffmpeg.output(joined_video, output_path, vcodec=vcodec, crf=crf, preset=preset) + + ffmpeg.run(output, overwrite_output=True) + def prepare_generate_video(state): if state.get("validate_success",0) != 1: @@ -7666,6 +7692,7 @@ def save_inputs( prompt, negative_prompt, resolution, + fit_canvas, video_length, batch_size, seed, @@ -8954,7 +8981,7 @@ def get_image_gallery(label ="", value = None, single_image_mode = False, visibl model_resolutions = model_def.get("resolutions", None) resolution_choices, current_resolution_choice = get_resolution_choices(current_resolution_choice, model_resolutions) available_groups, selected_group_resolutions, selected_group = group_resolutions(model_def,resolution_choices, current_resolution_choice) - current_fit_canvas = server_config.get("fit_canvas", 0) + current_fit_canvas = ui_get("fit_canvas", 0) resolution_group = gr.Dropdown( choices = available_groups, value= selected_group, From 83d28b4a7f5e9d0b0d03d4ea556ecfcf9c7bd946 Mon Sep 17 00:00:00 2001 From: Dencel-CleverAI Date: Sun, 28 Dec 2025 20:14:31 +0100 Subject: [PATCH 7/7] ffmpeg is now silent and preview gets deleted --- wgp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/wgp.py b/wgp.py index 76fd014d2..c46804915 100644 --- a/wgp.py +++ b/wgp.py @@ -6014,7 +6014,7 @@ def set_header_text(txt): os.rename(video_path, video_preview_path) # Merge the saved video with the last one via ffmpeg, which saves a lot of RAM in comparison to torch.cat combine_videos(video_source, video_preview_path, video_path, trim_end_frames1=source_video_overlap_frames_count, trim_start_frames2=1, fps=output_fps) - + os.remove(video_preview_path) end_time = time.time() send_cmd("progress", [0, get_latest_status(state,"Add Meta Data")]) @@ -6140,7 +6140,7 @@ def combine_videos( else: output = ffmpeg.output(joined_video, output_path, vcodec=vcodec, crf=crf, preset=preset) - ffmpeg.run(output, overwrite_output=True) + ffmpeg.run(output, overwrite_output=True, quiet=True) def prepare_generate_video(state):