From 405145f8e5f5db1440d3f38671b2d294cfad4e39 Mon Sep 17 00:00:00 2001
From: Dencel-CleverAI <dencel.cleverai@gmail.com>
Date: Fri, 19 Dec 2025 19:38:58 +0100
Subject: [PATCH 1/7] Changed order of spatial and temporal upscaling

---
 wgp.py | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/wgp.py b/wgp.py
index 02d8e6687..d12686bc5 100644
--- a/wgp.py
+++ b/wgp.py
@@ -4580,17 +4580,16 @@ def edit_video(
             sample = sample.float().div_(127.5).sub_(1.).permute(-1,0,1,2)
             frames_count = sample.shape[1] 
 
+        if len(spatial_upsampling) > 0:
+            sample = perform_spatial_upsampling(sample, spatial_upsampling )
+            configs["spatial_upsampling"] = spatial_upsampling
+
         output_fps  = round(fps)
         if len(temporal_upsampling) > 0:
             sample, previous_last_frame, output_fps = perform_temporal_upsampling(sample, None, temporal_upsampling, fps)
             configs["temporal_upsampling"] = temporal_upsampling
             frames_count = sample.shape[1] 
 
-
-        if len(spatial_upsampling) > 0:
-            sample = perform_spatial_upsampling(sample, spatial_upsampling )
-            configs["spatial_upsampling"] = spatial_upsampling
-
         if film_grain_intensity > 0:
             from postprocessing.film_grain import add_film_grain
             sample = add_film_grain(sample, film_grain_intensity, film_grain_saturation) 
@@ -5860,16 +5859,18 @@ def set_header_text(txt):
 
                 if len(temporal_upsampling) > 0 or len(spatial_upsampling) > 0 and not "vae2" in spatial_upsampling:                
                     send_cmd("progress", [0, get_latest_status(state,"Upsampling")])
+                          
+                if len(spatial_upsampling) > 0:
+                    sample = perform_spatial_upsampling(sample, spatial_upsampling)
                 
                 output_fps  = fps
                 if len(temporal_upsampling) > 0:
                     sample, previous_last_frame, output_fps = perform_temporal_upsampling(sample, previous_last_frame if sliding_window and window_no > 1 else None, temporal_upsampling, fps)
-
-                if len(spatial_upsampling) > 0:
-                    sample = perform_spatial_upsampling(sample, spatial_upsampling )
+                    
                 if film_grain_intensity> 0:
                     from postprocessing.film_grain import add_film_grain
                     sample = add_film_grain(sample, film_grain_intensity, film_grain_saturation) 
+                    
                 if sliding_window :
                     if frames_already_processed == None:
                         frames_already_processed = sample

From a295706d7008a6375441bb0e0e7ec3beb5d0f955 Mon Sep 17 00:00:00 2001
From: Dencel-CleverAI <dencel.cleverai@gmail.com>
Date: Sat, 20 Dec 2025 17:29:13 +0100
Subject: [PATCH 2/7] Added 20 fps choice to the Force FPS dropdown

---
 wgp.py | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/wgp.py b/wgp.py
index d12686bc5..3ffcbefaf 100644
--- a/wgp.py
+++ b/wgp.py
@@ -4483,7 +4483,7 @@ def upsample_frames(frame):
         return resize_lanczos(frame, h, w, method).unsqueeze(1)
     sample = torch.cat(process_images_multithread(upsample_frames, frames_to_upsample, "upsample", wrap_in_list = False, max_workers=get_default_workers(), in_place=True), dim=1)
     frames_to_upsample = None
-    return sample 
+    return sample
 
 def any_audio_track(model_type):
     base_model_type = get_base_model_type(model_type)
@@ -4581,14 +4581,14 @@ def edit_video(
             frames_count = sample.shape[1] 
 
         if len(spatial_upsampling) > 0:
-            sample = perform_spatial_upsampling(sample, spatial_upsampling )
+            sample = perform_spatial_upsampling(sample, spatial_upsampling)
             configs["spatial_upsampling"] = spatial_upsampling
 
         output_fps  = round(fps)
         if len(temporal_upsampling) > 0:
             sample, previous_last_frame, output_fps = perform_temporal_upsampling(sample, None, temporal_upsampling, fps)
             configs["temporal_upsampling"] = temporal_upsampling
-            frames_count = sample.shape[1] 
+            frames_count = sample.shape[1]
 
         if film_grain_intensity > 0:
             from postprocessing.film_grain import add_film_grain
@@ -4604,8 +4604,8 @@ def edit_video(
     tmp_path = None
     any_change = False
     if sample != None:
-        video_path =get_available_filename(save_path, video_source, "_tmp") if any_mmaudio or has_already_audio else get_available_filename(save_path, video_source, "_post")  
-        save_video( tensor=sample[None], save_file=video_path, fps=output_fps, nrow=1, normalize=True, value_range=(-1, 1), codec_type= server_config.get("video_output_codec", None), container=server_config.get("video_container", "mp4"))
+        video_path =get_available_filename(save_path, video_source, "_tmp") if any_mmaudio or has_already_audio else get_available_filename(save_path, video_source, "_post")
+        save_video(tensor=sample[None], save_file=video_path, fps=output_fps, nrow=1, normalize=True, value_range=(-1, 1), codec_type= server_config.get("video_output_codec", None), container=server_config.get("video_container", "mp4"))
 
         if any_mmaudio or has_already_audio: tmp_path = video_path
         any_change = True
@@ -5837,7 +5837,6 @@ def set_header_text(txt):
                     else:
                         pre_video_guide =  sample[:, -reuse_frames:].clone()
 
-
                 if prefix_video != None and window_no == 1:
                     # remove source video overlapped frames at the beginning of the generation
                     sample = torch.cat([ prefix_video[:, :-source_video_overlap_frames_count], sample], dim = 1)
@@ -5856,21 +5855,21 @@ def set_header_text(txt):
                     full_generated_audio =  generated_audio if full_generated_audio is None else np.concatenate([full_generated_audio, generated_audio], axis=0)
                     output_new_audio_data = full_generated_audio
 
-
                 if len(temporal_upsampling) > 0 or len(spatial_upsampling) > 0 and not "vae2" in spatial_upsampling:                
                     send_cmd("progress", [0, get_latest_status(state,"Upsampling")])
                           
                 if len(spatial_upsampling) > 0:
+                    # h_before, w_before = sample.shape[-2:]
                     sample = perform_spatial_upsampling(sample, spatial_upsampling)
                 
                 output_fps  = fps
                 if len(temporal_upsampling) > 0:
                     sample, previous_last_frame, output_fps = perform_temporal_upsampling(sample, previous_last_frame if sliding_window and window_no > 1 else None, temporal_upsampling, fps)
-                    
+
                 if film_grain_intensity> 0:
                     from postprocessing.film_grain import add_film_grain
-                    sample = add_film_grain(sample, film_grain_intensity, film_grain_saturation) 
-                    
+                    sample = add_film_grain(sample, film_grain_intensity, film_grain_saturation)
+
                 if sliding_window :
                     if frames_already_processed == None:
                         frames_already_processed = sample
@@ -9271,7 +9270,8 @@ def gen_upsampling_dropdowns(temporal_upsampling, spatial_upsampling , film_grai
                             force_fps_choices +=  [("Source Video fps", "source")]
                         force_fps_choices += [
                                 ("15", "15"), 
-                                ("16", "16"), 
+                                ("16", "16"),
+                                ("20", "20"),
                                 ("23", "23"), 
                                 ("24", "24"), 
                                 ("25", "25"), 

From 801587f6d6f383a856c1566f5ae5638d5c9a6952 Mon Sep 17 00:00:00 2001
From: Dencel-CleverAI <dencel.cleverai@gmail.com>
Date: Thu, 25 Dec 2025 20:07:24 +0100
Subject: [PATCH 3/7] Merge video after upsampling

---
 wgp.py | 89 ++++++++++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 84 insertions(+), 5 deletions(-)

diff --git a/wgp.py b/wgp.py
index 3ffcbefaf..d0a74a138 100644
--- a/wgp.py
+++ b/wgp.py
@@ -4996,6 +4996,48 @@ def remove_temp_filenames(temp_filenames_list):
             if temp_filename!= None and os.path.isfile(temp_filename):
                 os.remove(temp_filename)
 
+    # --- CONTINUING VIDEO + UPSAMPLING VALIDATION CHECK ---
+    # Before starting generation, ensure that if we are continuing a video ("V" or "L")
+    # and using upsampling, the final result matches the source video resolution/FPS.
+    do_upsampling = len(temporal_upsampling) > 0 or len(spatial_upsampling) > 0
+    if video_source and any_letters(image_prompt_type, "VL") and do_upsampling:
+        # 1. Determine Upscaling Multipliers
+        s_mult = 1.0
+        if "lanczos1.5" in spatial_upsampling: s_mult = 1.5
+        elif "lanczos2" in spatial_upsampling or "vae2" in spatial_upsampling: s_mult = 2.0
+        elif "vae1" in spatial_upsampling: s_mult = 0.5 
+        
+        t_mult = 1
+        if "rife2" in temporal_upsampling: t_mult = 2
+        elif "rife4" in temporal_upsampling: t_mult = 4
+        
+        # 2. Calculate Expected Target Properties (Model Resolution * Multiplier)
+        base_w, base_h = resolution.split("x")
+        base_w, base_h = int(base_w), int(base_h)
+        # Note: Model gen usually snaps to block_size (16), assume passed resolution is close to model output
+        expected_w = base_w * s_mult
+        expected_h = base_h * s_mult
+        
+        base_model_type_check = get_base_model_type(model_type)
+        base_fps = get_computed_fps(force_fps, base_model_type_check, video_guide, video_source)
+        expected_fps = base_fps * t_mult
+        
+        # 3. Get Source Properties
+        src_fps, src_w, src_h, _ = get_video_info(video_source)
+        
+        # 4. Compare with tolerance (16px for dims, 0.1 for FPS)
+        err_msg = []
+        if abs(src_w - expected_w) > 16 or abs(src_h - expected_h) > 16:
+             err_msg.append(f"Resolution Mismatch: Source is {src_w}x{src_h}, but upscaled result will be approx {int(expected_w)}x{int(expected_h)}.")
+        
+        if abs(src_fps - expected_fps) > 0.1:
+             err_msg.append(f"FPS Mismatch: Source is {src_fps:.2f} fps, but upscaled result will be {expected_fps:.2f} fps.")
+             
+        if len(err_msg) > 0:
+            err_msg.append("Please adjust Resolution, Spatial Upsampling, Force FPS, or Temporal Upsampling settings to match the source.")
+            raise gr.Error("\n".join(err_msg))
+    # -----------------------------------
+
     global wan_model, offloadobj, reload_needed
     gen = get_gen_info(state)
     torch.set_grad_enabled(False) 
@@ -5068,7 +5110,7 @@ def remove_temp_filenames(temp_filenames_list):
         return
     
     width, height = resolution.split("x")
-    width, height = int(width) // block_size *  block_size, int(height) // block_size *  block_size
+    width, height = int(width) // block_size * block_size, int(height) // block_size * block_size
     default_image_size = (height, width)
 
     if slg_switch == 0:
@@ -5823,6 +5865,7 @@ def set_header_text(txt):
                 #     sample =torch.load("output.pt")
                 if gen.get("extra_windows",0) > 0:
                     sliding_window = True 
+                
                 if sliding_window :
                     # guide_start_frame = guide_end_frame
                     guide_start_frame += current_video_length
@@ -5838,8 +5881,12 @@ def set_header_text(txt):
                         pre_video_guide =  sample[:, -reuse_frames:].clone()
 
                 if prefix_video != None and window_no == 1:
-                    # remove source video overlapped frames at the beginning of the generation
-                    sample = torch.cat([ prefix_video[:, :-source_video_overlap_frames_count], sample], dim = 1)
+                    # Only concatenate low-res prefix if NO continue video upscaling is performed.
+                    # If continue video upscaling is active, we concat high-res original frames LATER.
+                    if not (any_letters(image_prompt_type, "VL") and do_upsampling):
+                        # remove prefix video overlapped frames at the beginning of the generation
+                        sample = torch.cat([ prefix_video[:, :-source_video_overlap_frames_count], sample], dim = 1)
+                        
                     guide_start_frame -= source_video_overlap_frames_count 
                     if generated_audio is not None:
                         generated_audio = truncate_audio(generated_audio, source_video_overlap_frames_count, 0, fps, audio_sampling_rate)
@@ -5855,17 +5902,49 @@ def set_header_text(txt):
                     full_generated_audio =  generated_audio if full_generated_audio is None else np.concatenate([full_generated_audio, generated_audio], axis=0)
                     output_new_audio_data = full_generated_audio
 
-                if len(temporal_upsampling) > 0 or len(spatial_upsampling) > 0 and not "vae2" in spatial_upsampling:                
+                if do_upsampling and not "vae2" in spatial_upsampling:                
                     send_cmd("progress", [0, get_latest_status(state,"Upsampling")])
                           
                 if len(spatial_upsampling) > 0:
-                    # h_before, w_before = sample.shape[-2:]
                     sample = perform_spatial_upsampling(sample, spatial_upsampling)
                 
                 output_fps  = fps
                 if len(temporal_upsampling) > 0:
                     sample, previous_last_frame, output_fps = perform_temporal_upsampling(sample, previous_last_frame if sliding_window and window_no > 1 else None, temporal_upsampling, fps)
 
+                # --- MERGE WITH ORIGINAL VIDEO SOURCE IF UPSCALED ---
+                if any_letters(image_prompt_type, "VL") and do_upsampling: 
+                    src_fps, src_w, src_h, _ = get_video_info(video_source) 
+                    src_video = preprocess_video(
+                        width=src_w, 
+                        height=src_h, 
+                        video_in=video_source, 
+                        max_frames=parsed_keep_frames_video_source, 
+                        start_frame=0, 
+                        fit_canvas=None, 
+                        fit_crop=False, 
+                        target_fps=src_fps, 
+                        block_size=block_size
+                    )
+                    src_video = src_video.permute(3, 0, 1, 2).float().div_(127.5).sub_(1.) # c, f, h, w
+
+                    # Resize sample to match the source's resolution exactly if they differ
+                    if src_video.shape[-2:] != sample.shape[-2:]:
+                        # Permute to (F, C, H, W) for torch.nn.functional.interpolate
+                        sample = sample.permute(1, 0, 2, 3)
+                        sample = torch.nn.functional.interpolate(
+                            sample,
+                            size=src_video.shape[-2:],
+                            mode='bilinear',
+                            align_corners=False
+                        )
+                        # Permute back to (C, F, H, W)
+                        sample = sample.permute(1, 0, 2, 3)
+                    
+                    # remove source video overlapped frames and merge with new generated sample
+                    sample = torch.cat([src_video[:, :-source_video_overlap_frames_count], sample], dim = 1)
+                # -----------------------------------------------
+
                 if film_grain_intensity> 0:
                     from postprocessing.film_grain import add_film_grain
                     sample = add_film_grain(sample, film_grain_intensity, film_grain_saturation)

From 2272b413bf819c5619132d2599fac7757b4f8c36 Mon Sep 17 00:00:00 2001
From: Dencel-CleverAI <dencel.cleverai@gmail.com>
Date: Fri, 26 Dec 2025 03:41:00 +0100
Subject: [PATCH 4/7] Exposed Fit Canvas in the UI, refined merging of upsampled video

---
 wgp.py | 74 ++++++++++++++++++++++++++++++++--------------------------
 1 file changed, 41 insertions(+), 33 deletions(-)

diff --git a/wgp.py b/wgp.py
index d0a74a138..62ca6eb93 100644
--- a/wgp.py
+++ b/wgp.py
@@ -4575,22 +4575,24 @@ def edit_video(
 
     if mode == "edit_postprocessing":
         if len(temporal_upsampling) > 0 or len(spatial_upsampling) > 0 or film_grain_intensity > 0:                
-            send_cmd("progress", [0, get_latest_status(state,"Upsampling" if len(temporal_upsampling) > 0 or len(spatial_upsampling) > 0 else "Adding Film Grain"  )])
             sample = get_resampled_video(video_source, 0, max_source_video_frames, fps)
             sample = sample.float().div_(127.5).sub_(1.).permute(-1,0,1,2)
             frames_count = sample.shape[1] 
 
         if len(spatial_upsampling) > 0:
+            send_cmd("progress", [0, get_latest_status(state,"Spatial Upsampling")])
             sample = perform_spatial_upsampling(sample, spatial_upsampling)
             configs["spatial_upsampling"] = spatial_upsampling
 
         output_fps  = round(fps)
         if len(temporal_upsampling) > 0:
+            send_cmd("progress", [0, get_latest_status(state,"Temporal Upsampling")])
             sample, previous_last_frame, output_fps = perform_temporal_upsampling(sample, None, temporal_upsampling, fps)
             configs["temporal_upsampling"] = temporal_upsampling
             frames_count = sample.shape[1]
 
         if film_grain_intensity > 0:
+            send_cmd("progress", [0, get_latest_status(state,"Film Grain")])
             from postprocessing.film_grain import add_film_grain
             sample = add_film_grain(sample, film_grain_intensity, film_grain_saturation) 
             configs["film_grain_intensity"] = film_grain_intensity
@@ -4598,6 +4600,7 @@ def edit_video(
     else:
         output_fps  = round(fps)
 
+    send_cmd("progress", [0, get_latest_status(state,"Finalizing")])
     any_mmaudio = MMAudio_setting != 0 and server_config.get("mmaudio_enabled", 0) != 0 and frames_count >=output_fps
     if any_mmaudio: download_mmaudio()
 
@@ -5025,17 +5028,12 @@ def remove_temp_filenames(temp_filenames_list):
         # 3. Get Source Properties
         src_fps, src_w, src_h, _ = get_video_info(video_source)
         
-        # 4. Compare with tolerance (16px for dims, 0.1 for FPS)
-        err_msg = []
+        # 4. Compare with tolerance (16px for resolution, 0.1 for FPS)
         if abs(src_w - expected_w) > 16 or abs(src_h - expected_h) > 16:
-             err_msg.append(f"Resolution Mismatch: Source is {src_w}x{src_h}, but upscaled result will be approx {int(expected_w)}x{int(expected_h)}.")
+             raise gr.Error(f"Resolution Mismatch: Source is {src_w}x{src_h}, but upscaled result will be approx {int(expected_w)}x{int(expected_h)}. Please adjust Resolution or Spatial Upsampling to match the source.")
         
         if abs(src_fps - expected_fps) > 0.1:
-             err_msg.append(f"FPS Mismatch: Source is {src_fps:.2f} fps, but upscaled result will be {expected_fps:.2f} fps.")
-             
-        if len(err_msg) > 0:
-            err_msg.append("Please adjust Resolution, Spatial Upsampling, Force FPS, or Temporal Upsampling settings to match the source.")
-            raise gr.Error("\n".join(err_msg))
+             raise gr.Error(f"FPS Mismatch: Source is {src_fps:.2f} fps, but upscaled result will be {expected_fps:.2f} fps. Please adjust Default Model FPS or Temporal Upsampling to match the source.")
     # -----------------------------------
 
     global wan_model, offloadobj, reload_needed
@@ -5438,13 +5436,14 @@ def remove_temp_filenames(temp_filenames_list):
                     image_start_tensor = convert_image_to_tensor(image_start_tensor)
                     pre_video_guide =  prefix_video = image_start_tensor.unsqueeze(1)
                 else:
-                    prefix_video  = preprocess_video(width=width, height=height,video_in=video_source, max_frames= parsed_keep_frames_video_source , start_frame = 0, fit_canvas= sample_fit_canvas, fit_crop = fit_crop, target_fps = fps, block_size = block_size )
-                    prefix_video  = prefix_video.permute(3, 0, 1, 2)
-                    prefix_video  = prefix_video.float().div_(127.5).sub_(1.) # c, f, h, w
+                    prefix_video = preprocess_video(width=width, height=height,video_in=video_source, max_frames= parsed_keep_frames_video_source , start_frame = 0, fit_canvas= sample_fit_canvas, fit_crop = fit_crop, target_fps = fps, block_size = block_size )
+                    prefix_video = prefix_video.permute(3, 0, 1, 2)
+                    prefix_video = prefix_video.float().div_(127.5).sub_(1.) # c, f, h, w
                     if fit_crop or "L" in image_prompt_type: refresh_preview["video_source"] = convert_tensor_to_image(prefix_video, 0) 
 
-                    new_height, new_width = prefix_video.shape[-2:]                    
-                    pre_video_guide =  prefix_video[:, -reuse_frames:]
+                    new_height, new_width = prefix_video.shape[-2:]
+                    #print("Downsampled Video:", str(new_width), str(new_height))                    
+                    pre_video_guide = prefix_video[:, -reuse_frames:]
                 pre_video_frame = convert_tensor_to_image(prefix_video[:, -1])
                 source_video_overlap_frames_count = pre_video_guide.shape[1]
                 source_video_frames_count = prefix_video.shape[1]
@@ -5902,19 +5901,19 @@ def set_header_text(txt):
                     full_generated_audio =  generated_audio if full_generated_audio is None else np.concatenate([full_generated_audio, generated_audio], axis=0)
                     output_new_audio_data = full_generated_audio
 
-                if do_upsampling and not "vae2" in spatial_upsampling:                
-                    send_cmd("progress", [0, get_latest_status(state,"Upsampling")])
-                          
                 if len(spatial_upsampling) > 0:
+                    send_cmd("progress", [0, get_latest_status(state,"Spatial Upsampling")])
                     sample = perform_spatial_upsampling(sample, spatial_upsampling)
                 
                 output_fps  = fps
                 if len(temporal_upsampling) > 0:
+                    send_cmd("progress", [0, get_latest_status(state,"Temporal Upsampling")])
                     sample, previous_last_frame, output_fps = perform_temporal_upsampling(sample, previous_last_frame if sliding_window and window_no > 1 else None, temporal_upsampling, fps)
 
                 # --- MERGE WITH ORIGINAL VIDEO SOURCE IF UPSCALED ---
                 if any_letters(image_prompt_type, "VL") and do_upsampling: 
-                    src_fps, src_w, src_h, _ = get_video_info(video_source) 
+                    send_cmd("progress", [0, get_latest_status(state,"Resizing Video")])
+                    src_fps, src_w, src_h, _ = get_video_info(video_source)
                     src_video = preprocess_video(
                         width=src_w, 
                         height=src_h, 
@@ -5927,35 +5926,42 @@ def set_header_text(txt):
                         block_size=block_size
                     )
                     src_video = src_video.permute(3, 0, 1, 2).float().div_(127.5).sub_(1.) # c, f, h, w
-
+                    #print("Source Video:", str(src_w), str(src_h)) 
+                    #print("Sample Video:", str(sample.shape[-2:]))                    
+                    
                     # Resize sample to match the source's resolution exactly if they differ
-                    if src_video.shape[-2:] != sample.shape[-2:]:
+                    if [src_h, src_w] != sample.shape[-2:]:
                         # Permute to (F, C, H, W) for torch.nn.functional.interpolate
                         sample = sample.permute(1, 0, 2, 3)
                         sample = torch.nn.functional.interpolate(
                             sample,
-                            size=src_video.shape[-2:],
+                            size=[src_h, src_w],
                             mode='bilinear',
                             align_corners=False
                         )
                         # Permute back to (C, F, H, W)
                         sample = sample.permute(1, 0, 2, 3)
                     
-                    # remove source video overlapped frames and merge with new generated sample
-                    sample = torch.cat([src_video[:, :-source_video_overlap_frames_count], sample], dim = 1)
+                    # last frame of source is first frame of generated sample which can be skipped
+                    # remove source overlapped frame and merge with new generated sample
+                    send_cmd("progress", [0, get_latest_status(state,"Merging Videos")])
+                    sample = torch.cat([src_video[:, :-source_video_overlap_frames_count], sample[:, 1:]], dim = 1)
                 # -----------------------------------------------
 
                 if film_grain_intensity> 0:
+                    send_cmd("progress", [0, get_latest_status(state,"Film Grain")])
                     from postprocessing.film_grain import add_film_grain
                     sample = add_film_grain(sample, film_grain_intensity, film_grain_saturation)
 
-                if sliding_window :
+                if sliding_window:
+                    send_cmd("progress", [0, get_latest_status(state,"Sliding Window")])
                     if frames_already_processed == None:
                         frames_already_processed = sample
                     else:
                         sample = torch.cat([frames_already_processed, sample], dim=1)
                     frames_already_processed = sample
 
+                send_cmd("progress", [0, get_latest_status(state,"Saving")])
                 time_flag = datetime.fromtimestamp(time.time()).strftime("%Y-%m-%d-%Hh%Mm%Ss")
                 save_prompt = original_prompts[0]
                 if audio_only:
@@ -6027,6 +6033,7 @@ def set_header_text(txt):
 
                 end_time = time.time()
 
+                send_cmd("progress", [0, get_latest_status(state,"Add Meta Data")])
                 inputs.pop("send_cmd")
                 inputs.pop("task")
                 inputs.pop("mode")
@@ -8943,13 +8950,6 @@ def get_image_gallery(label ="", value = None, single_image_mode = False, visibl
                 pace = gr.Slider( 0.2, 1, value=ui_get("pace"), step=0.01, label="Pace", show_reset_button= False)
 
             with gr.Row(visible=not audio_only) as resolution_row:
-                fit_canvas = server_config.get("fit_canvas", 0)
-                if fit_canvas == 1:
-                    label = "Outer Box Resolution (one dimension may be less to preserve video W/H ratio)"
-                elif fit_canvas == 2:
-                    label = "Output Resolution (Input Images wil be Cropped if the W/H ratio is different)"
-                else:
-                    label = "Resolution Budget (Pixels will be reallocated to preserve Inputs W/H ratio)" 
                 current_resolution_choice = ui_get("resolution") if update_form or last_resolution is None else last_resolution
                 model_resolutions = model_def.get("resolutions", None)
                 resolution_choices, current_resolution_choice = get_resolution_choices(current_resolution_choice, model_resolutions)
@@ -8962,8 +8962,16 @@ def get_image_gallery(label ="", value = None, single_image_mode = False, visibl
                 resolution = gr.Dropdown(
                 choices = selected_group_resolutions,
                     value= current_resolution_choice,
-                    label= label,
-                    scale = 5
+                    label= "Format",
+                    scale = 2
+                )
+                fit_canvas = gr.Dropdown(
+                choices=[("Resolution Budget (Pixels will be reallocated to preserve Inputs W/H ratio)", 0),
+                         ("Outer Box Resolution (one dimension may be less to preserve video W/H ratio)", 1),
+                         ("Output Resolution (Input Images wil be Cropped if the W/H ratio is different)", 2)],
+                    value= server_config.get("fit_canvas", 0),
+                    label="Fit Canvas",
+                    scale = 3
                 )
             with gr.Row(visible= not audio_only) as number_frames_row:
                 batch_size = gr.Slider(1, 16, value=ui_get("batch_size"), step=1, label="Number of Images to Generate", visible = image_outputs, show_reset_button= False)

From aacaa8c3bec6becaf1c3368569996b2fb66df040 Mon Sep 17 00:00:00 2001
From: Dencel-CleverAI <dencel.cleverai@gmail.com>
Date: Fri, 26 Dec 2025 04:05:00 +0100
Subject: [PATCH 5/7] Fit Canvas is now interactive and its start value is used
 correctly

---
 wgp.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/wgp.py b/wgp.py
index 62ca6eb93..aa19c4cf1 100644
--- a/wgp.py
+++ b/wgp.py
@@ -8954,6 +8954,7 @@ def get_image_gallery(label ="", value = None, single_image_mode = False, visibl
                 model_resolutions = model_def.get("resolutions", None)
                 resolution_choices, current_resolution_choice = get_resolution_choices(current_resolution_choice, model_resolutions)
                 available_groups, selected_group_resolutions, selected_group = group_resolutions(model_def,resolution_choices, current_resolution_choice)
+                current_fit_canvas = server_config.get("fit_canvas", 0)
                 resolution_group = gr.Dropdown(
                 choices = available_groups,
                     value= selected_group,
@@ -8969,9 +8970,10 @@ def get_image_gallery(label ="", value = None, single_image_mode = False, visibl
                 choices=[("Resolution Budget (Pixels will be reallocated to preserve Inputs W/H ratio)", 0),
                          ("Outer Box Resolution (one dimension may be less to preserve video W/H ratio)", 1),
                          ("Output Resolution (Input Images wil be Cropped if the W/H ratio is different)", 2)],
-                    value= server_config.get("fit_canvas", 0),
+                    value= current_fit_canvas,
                     label="Fit Canvas",
-                    scale = 3
+                    scale = 3,
+                    interactive = True
                 )
             with gr.Row(visible= not audio_only) as number_frames_row:
                 batch_size = gr.Slider(1, 16, value=ui_get("batch_size"), step=1, label="Number of Images to Generate", visible = image_outputs, show_reset_button= False)

From bd6933988e5647bb3eac374099369db76190b2c2 Mon Sep 17 00:00:00 2001
From: Dencel-CleverAI <dencel.cleverai@gmail.com>
Date: Sat, 27 Dec 2025 18:16:12 +0100
Subject: [PATCH 6/7] Saved fit_canvas as a model-specific setting, merged videos via ffmpeg

---
 wgp.py | 119 +++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 73 insertions(+), 46 deletions(-)

diff --git a/wgp.py b/wgp.py
index aa19c4cf1..76fd014d2 100644
--- a/wgp.py
+++ b/wgp.py
@@ -618,7 +618,8 @@ def ret():
 
     outpainting_dims = get_outpainting_dims(video_guide_outpainting)
 
-    if server_config.get("fit_canvas", 0) == 2 and outpainting_dims is not None and any_letters(video_prompt_type, "VKF"):
+    fit_canvas = inputs["fit_canvas"]
+    if fit_canvas == 2 and outpainting_dims is not None and any_letters(video_prompt_type, "VKF"):
         gr.Info("Output Resolution Cropping will be not used for this Generation as it is not compatible with Video Outpainting")
 
     if not model_def.get("motion_amplitude", False): motion_amplitude = 1.
@@ -4901,6 +4902,7 @@ def generate_video(
     prompt,
     negative_prompt,    
     resolution,
+    fit_canvas,
     video_length,
     batch_size,
     seed,
@@ -4999,11 +5001,10 @@ def remove_temp_filenames(temp_filenames_list):
             if temp_filename!= None and os.path.isfile(temp_filename):
                 os.remove(temp_filename)
 
-    # --- CONTINUING VIDEO + UPSAMPLING VALIDATION CHECK ---
+    # --- CONTINUING VIDEO VALIDATION CHECK ---
     # Before starting generation, ensure that if we are continuing a video ("V" or "L")
-    # and using upsampling, the final result matches the source video resolution/FPS.
-    do_upsampling = len(temporal_upsampling) > 0 or len(spatial_upsampling) > 0
-    if video_source and any_letters(image_prompt_type, "VL") and do_upsampling:
+    # the final result matches the source video resolution/FPS.
+    if video_source and any_letters(image_prompt_type, "VL"):
         # 1. Determine Upscaling Multipliers
         s_mult = 1.0
         if "lanczos1.5" in spatial_upsampling: s_mult = 1.5
@@ -5017,7 +5018,6 @@ def remove_temp_filenames(temp_filenames_list):
         # 2. Calculate Expected Target Properties (Model Resolution * Multiplier)
         base_w, base_h = resolution.split("x")
         base_w, base_h = int(base_w), int(base_h)
-        # Note: Model gen usually snaps to block_size (16), assume passed resolution is close to model output
         expected_w = base_w * s_mult
         expected_h = base_h * s_mult
         
@@ -5025,15 +5025,12 @@ def remove_temp_filenames(temp_filenames_list):
         base_fps = get_computed_fps(force_fps, base_model_type_check, video_guide, video_source)
         expected_fps = base_fps * t_mult
         
-        # 3. Get Source Properties
+        # 3. Get Source Properties and compare with tolerance (16px for resolution, 0.1 for FPS)
         src_fps, src_w, src_h, _ = get_video_info(video_source)
-        
-        # 4. Compare with tolerance (16px for resolution, 0.1 for FPS)
         if abs(src_w - expected_w) > 16 or abs(src_h - expected_h) > 16:
-             raise gr.Error(f"Resolution Mismatch: Source is {src_w}x{src_h}, but upscaled result will be approx {int(expected_w)}x{int(expected_h)}. Please adjust Resolution or Spatial Upsampling to match the source.")
-        
+            raise gr.Error(f"Resolution Mismatch: Source is {src_w}x{src_h}, but result will be approx {int(expected_w)}x{int(expected_h)}. Please adjust Resolution or Spatial Upsampling to match the source.")      
         if abs(src_fps - expected_fps) > 0.1:
-             raise gr.Error(f"FPS Mismatch: Source is {src_fps:.2f} fps, but upscaled result will be {expected_fps:.2f} fps. Please adjust Default Model FPS or Temporal Upsampling to match the source.")
+            raise gr.Error(f"FPS Mismatch: Source is {src_fps:.2f} fps, but result will be {expected_fps:.2f} fps. Please adjust Default Model FPS or Temporal Upsampling to match the source.")
     # -----------------------------------
 
     global wan_model, offloadobj, reload_needed
@@ -5239,7 +5236,6 @@ def remove_temp_filenames(temp_filenames_list):
         any_background_ref = 2 if model_def.get("all_image_refs_are_background_ref", False) else 1
 
     outpainting_dims = get_outpainting_dims(video_guide_outpainting)
-    fit_canvas = server_config.get("fit_canvas", 0)
     fit_crop = fit_canvas == 2
     if fit_crop and outpainting_dims is not None:
         fit_crop = False
@@ -5441,8 +5437,7 @@ def remove_temp_filenames(temp_filenames_list):
                     prefix_video = prefix_video.float().div_(127.5).sub_(1.) # c, f, h, w
                     if fit_crop or "L" in image_prompt_type: refresh_preview["video_source"] = convert_tensor_to_image(prefix_video, 0) 
 
-                    new_height, new_width = prefix_video.shape[-2:]
-                    #print("Downsampled Video:", str(new_width), str(new_height))                    
+                    new_height, new_width = prefix_video.shape[-2:]                    
                     pre_video_guide = prefix_video[:, -reuse_frames:]
                 pre_video_frame = convert_tensor_to_image(prefix_video[:, -1])
                 source_video_overlap_frames_count = pre_video_guide.shape[1]
@@ -5880,11 +5875,10 @@ def set_header_text(txt):
                         pre_video_guide =  sample[:, -reuse_frames:].clone()
 
                 if prefix_video != None and window_no == 1:
-                    # Only concatenate low-res prefix if NO continue video upscaling is performed.
-                    # If continue video upscaling is active, we concat high-res original frames LATER.
-                    if not (any_letters(image_prompt_type, "VL") and do_upsampling):
+                    # When continuing a video (V/L), skip this in-memory concat: the source video is merged in LATER via ffmpeg, after the sample is saved.
+                    if not any_letters(image_prompt_type, "VL"):
                         # remove prefix video overlapped frames at the beginning of the generation
-                        sample = torch.cat([ prefix_video[:, :-source_video_overlap_frames_count], sample], dim = 1)
+                        sample = torch.cat([prefix_video[:, :-source_video_overlap_frames_count], sample], dim = 1)
                         
                     guide_start_frame -= source_video_overlap_frames_count 
                     if generated_audio is not None:
@@ -5910,27 +5904,13 @@ def set_header_text(txt):
                     send_cmd("progress", [0, get_latest_status(state,"Temporal Upsampling")])
                     sample, previous_last_frame, output_fps = perform_temporal_upsampling(sample, previous_last_frame if sliding_window and window_no > 1 else None, temporal_upsampling, fps)
 
-                # --- MERGE WITH ORIGINAL VIDEO SOURCE IF UPSCALED ---
-                if any_letters(image_prompt_type, "VL") and do_upsampling: 
-                    send_cmd("progress", [0, get_latest_status(state,"Resizing Video")])
+                if any_letters(image_prompt_type, "VL"): 
                     src_fps, src_w, src_h, _ = get_video_info(video_source)
-                    src_video = preprocess_video(
-                        width=src_w, 
-                        height=src_h, 
-                        video_in=video_source, 
-                        max_frames=parsed_keep_frames_video_source, 
-                        start_frame=0, 
-                        fit_canvas=None, 
-                        fit_crop=False, 
-                        target_fps=src_fps, 
-                        block_size=block_size
-                    )
-                    src_video = src_video.permute(3, 0, 1, 2).float().div_(127.5).sub_(1.) # c, f, h, w
-                    #print("Source Video:", str(src_w), str(src_h)) 
-                    #print("Sample Video:", str(sample.shape[-2:]))                    
+                    sample_height, sample_width = sample.shape[-2:]                   
                     
-                    # Resize sample to match the source's resolution exactly if they differ
-                    if [src_h, src_w] != sample.shape[-2:]:
+                    # Resize sample to match the source's resolution exactly if they differ                  
+                    if src_h != sample_height or src_w != sample_width:
+                        send_cmd("progress", [0, get_latest_status(state,"Resizing Video")])
                         # Permute to (F, C, H, W) for torch.nn.functional.interpolate
                         sample = sample.permute(1, 0, 2, 3)
                         sample = torch.nn.functional.interpolate(
@@ -5941,12 +5921,6 @@ def set_header_text(txt):
                         )
                         # Permute back to (C, F, H, W)
                         sample = sample.permute(1, 0, 2, 3)
-                    
-                    # last frame of source is first frame of generated sample which can be skipped
-                    # remove source overlapped frame and merge with new generated sample
-                    send_cmd("progress", [0, get_latest_status(state,"Merging Videos")])
-                    sample = torch.cat([src_video[:, :-source_video_overlap_frames_count], sample[:, 1:]], dim = 1)
-                # -----------------------------------------------
 
                 if film_grain_intensity> 0:
                     send_cmd("progress", [0, get_latest_status(state,"Film Grain")])
@@ -6030,7 +6004,17 @@ def set_header_text(txt):
 
                 else:
                     save_video( tensor=sample[None], save_file=video_path, fps=output_fps, nrow=1, normalize=True, value_range=(-1, 1),  codec_type= server_config.get("video_output_codec", None), container= container)
-
+                
+                # If continue (last) video is chosen             
+                if any_letters(image_prompt_type, "VL"):
+                    send_cmd("progress", [0, get_latest_status(state,"Merging Videos")])
+                    # Add "_Preview" to the file name of the new generated sample
+                    base, ext = os.path.splitext(video_path)
+                    video_preview_path = base + "_Preview" + ext
+                    os.rename(video_path, video_preview_path)
+                    # Merge the saved video with the last one via ffmpeg, which saves a lot of RAM in comparison to torch.cat
+                    combine_videos(video_source, video_preview_path, video_path, trim_end_frames1=source_video_overlap_frames_count, trim_start_frames2=1, fps=output_fps)
+                
                 end_time = time.time()
 
                 send_cmd("progress", [0, get_latest_status(state,"Add Meta Data")])
@@ -6116,6 +6100,48 @@ def set_header_text(txt):
 
     remove_temp_filenames(temp_filenames_list)
 
+def combine_videos(video1_path, video2_path, output_path, trim_end_frames1=0, trim_start_frames2=0,
+                   fps=16, vcodec='libx264', crf=10, preset='veryfast', audio_bitrate='192k'):
+    """Join two video files with ffmpeg: video1 minus its tail, then video2 minus its head.
+
+    Args:
+        video1_path, video2_path: input files, concatenated in this order.
+        output_path: file to write; an existing file is overwritten.
+        trim_end_frames1, trim_start_frames2: frames dropped from the end of video1 / start of video2.
+        fps: frame rate used to convert both frame counts into seconds.
+        vcodec, crf, preset, audio_bitrate: encoder settings passed to ffmpeg's output.
+
+    Audio is kept only when both inputs contain an audio stream; otherwise it is dropped.
+    """
+    import ffmpeg
+
+    # Calculate trim duration in seconds
+    trim_end_seconds1 = trim_end_frames1 / fps
+    trim_start_seconds2 = trim_start_frames2 / fps
+
+    probe1 = ffmpeg.probe(video1_path)
+    probe2 = ffmpeg.probe(video2_path)
+    # Stream 0 is not guaranteed to be the video stream, and some containers (e.g. mkv) may omit
+    # the per-stream duration, so use the video stream and fall back to the container duration.
+    video_stream1 = next((s for s in probe1['streams'] if s['codec_type'] == 'video'), {})
+    duration1 = float(video_stream1.get('duration') or probe1['format']['duration'])
+
+    # Trim the end of the first and the beginning of the second video, then join them
+    input1 = ffmpeg.input(video1_path, ss=0, t=duration1 - trim_end_seconds1)
+    input2 = ffmpeg.input(video2_path, ss=trim_start_seconds2)
+    joined_video = ffmpeg.concat(input1.video, input2.video, v=1, a=0)
+
+    # Check if videos have audio
+    has_audio1 = any(s['codec_type'] == 'audio' for s in probe1['streams'])
+    has_audio2 = any(s['codec_type'] == 'audio' for s in probe2['streams'])
+    if has_audio1 and has_audio2:
+        joined_audio = ffmpeg.concat(input1.audio, input2.audio, v=0, a=1)
+        output = ffmpeg.output(joined_video, joined_audio, output_path, vcodec=vcodec, crf=crf, preset=preset, audio_bitrate=audio_bitrate)
+    else:
+        output = ffmpeg.output(joined_video, output_path, vcodec=vcodec, crf=crf, preset=preset)
+    
+    ffmpeg.run(output, overwrite_output=True)
+
 def prepare_generate_video(state):    
 
     if state.get("validate_success",0) != 1:
@@ -7666,6 +7692,7 @@ def save_inputs(
             prompt,
             negative_prompt,
             resolution,
+            fit_canvas,
             video_length,
             batch_size,
             seed,
@@ -8954,7 +8981,7 @@ def get_image_gallery(label ="", value = None, single_image_mode = False, visibl
                 model_resolutions = model_def.get("resolutions", None)
                 resolution_choices, current_resolution_choice = get_resolution_choices(current_resolution_choice, model_resolutions)
                 available_groups, selected_group_resolutions, selected_group = group_resolutions(model_def,resolution_choices, current_resolution_choice)
-                current_fit_canvas = server_config.get("fit_canvas", 0)
+                current_fit_canvas = ui_get("fit_canvas", 0)
                 resolution_group = gr.Dropdown(
                 choices = available_groups,
                     value= selected_group,

From 83d28b4a7f5e9d0b0d03d4ea556ecfcf9c7bd946 Mon Sep 17 00:00:00 2001
From: Dencel-CleverAI <dencel.cleverai@gmail.com>
Date: Sun, 28 Dec 2025 20:14:31 +0100
Subject: [PATCH 7/7] ffmpeg is now silent and preview gets deleted

---
 wgp.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/wgp.py b/wgp.py
index 76fd014d2..c46804915 100644
--- a/wgp.py
+++ b/wgp.py
@@ -6014,7 +6014,7 @@ def set_header_text(txt):
                     os.rename(video_path, video_preview_path)
                     # Merge the saved video with the last one via ffmpeg, which saves a lot of RAM in comparison to torch.cat
                     combine_videos(video_source, video_preview_path, video_path, trim_end_frames1=source_video_overlap_frames_count, trim_start_frames2=1, fps=output_fps)
-                
+                    os.remove(video_preview_path)
                 end_time = time.time()
 
                 send_cmd("progress", [0, get_latest_status(state,"Add Meta Data")])
@@ -6140,7 +6140,7 @@ def combine_videos(
     else:
         output = ffmpeg.output(joined_video, output_path, vcodec=vcodec, crf=crf, preset=preset)
     
-    ffmpeg.run(output, overwrite_output=True)
+    ffmpeg.run(output, overwrite_output=True, quiet=True)
 
 def prepare_generate_video(state):