@yiyixuxu commented Feb 2, 2026

This PR:

1. Split the Wan modular pipeline into separate pipelines per task:

  • WanModularPipeline -> WanBlocks (wan2.1_T2V)
  • WanImage2VideoModularPipeline -> WanImage2VideoBlocks (wan2.1_I2V + FLF2V)
  • Wan22ModularPipeline -> Wan22Blocks (wan2.2_T2V)
  • Wan22Image2VideoModularPipeline -> Wan22Image2VideoBlocks (wan2.2_I2V)

Unlike image models, where a single checkpoint can support multiple workflows, video models are usually trained for a specific task, so there is no need to use AutoPipelineBlocks: a checkpoint cannot be reused across different workflows.
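A rough usage sketch of what this split looks like from the user side (the import path and the direct init_pipeline() call on the top-level blocks class are assumptions, not verified against the final API): each task now has its own blocks class instead of being routed through AutoPipelineBlocks.

# Hypothetical sketch: import path and direct init_pipeline() usage are assumptions.
import torch
from diffusers.modular_pipelines import WanImage2VideoBlocks

# Build the I2V block graph explicitly, then turn it into a runnable pipeline for a Wan 2.1 I2V checkpoint.
i2v_blocks = WanImage2VideoBlocks()
i2v_pipeline = i2v_blocks.init_pipeline("Wan-AI/Wan2.1-I2V-14B-480P-Diffusers")
i2v_pipeline.load_components(torch_dtype=torch.bfloat16)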

2. Refactored the I2V and FLF2V pipelines so that their denoise blocks both accept an image_condition_latents input:

  • Moved the masking logic into the VAE encoder steps
  • Both I2V and FLF2V now output image_condition_latents after encoding and masking
  • Downstream denoising blocks can be shared since they now receive the same input (previously the inputs differed: first_frame_latents for I2V and first_last_frame_latents for FLF2V); see the sketch below
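A minimal sketch of the shared interface (variable names follow the modular-settings test further down; the FLF2V case additionally passes last_image):

# Sketch: both the I2V and FLF2V VAE encoder blocks now return `image_condition_latents`,
# so the same denoise node serves either task (names as in the modular-settings test below).
image_condition_latents = vae_encoder_node(image=first_frame, last_image=last_frame).image_condition_latents
latents = denoise_node(
    **text_embeddings,
    image_condition_latents=image_condition_latents,
    image_embeds=image_embeds,
    num_inference_steps=30,
    generator=torch.Generator().manual_seed(0),
).latents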

3. Refactored MODULAR_PIPELINE_MAPPING

so that one standard pipeline can map to more than one modular pipeline based on the config, e.g. WanPipeline maps to either WanModularPipeline or Wan22ModularPipeline depending on boundary_ratio; see the sketch below. Previously, the mapping was a simple model_name → pipeline_class dict, so it had to be a 1:1 map.
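A simplified sketch of the idea (not the actual implementation in this PR): the lookup can now branch on the loaded config instead of being a pure name → class dict.

# Simplified, hypothetical sketch of config-based resolution (the real mapping in the PR may differ):
# a WanPipeline checkpoint whose config defines boundary_ratio resolves to the Wan 2.2 modular pipeline.
def resolve_wan_modular_pipeline(config: dict) -> str:
    if config.get("boundary_ratio") is not None:
        return "Wan22ModularPipeline"
    return "WanModularPipeline"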

Test script for Wan:

import os
import shutil
import gc

from diffusers import ModularPipeline
from diffusers.utils import export_to_video, load_image
import torch

output_name_prefix = "yiyi_test_modular_wan"
if os.path.exists(output_name_prefix):
    shutil.rmtree(output_name_prefix)
os.makedirs(output_name_prefix, exist_ok=True)

device = "cuda:0"


# test t2v

prompt = "A cat and a dog baking a cake together in a kitchen. The cat is carefully measuring flour, while the dog is stirring the batter with a wooden spoon. The kitchen is cozy, with sunlight streaming through the window."
negative_prompt = "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards"


# model_ids = ["Wan-AI/Wan2.1-T2V-1.3B-Diffusers", "Wan-AI/Wan2.1-T2V-14B-Diffusers", "Wan-AI/Wan2.2-T2V-A14B-Diffusers"]
model_ids = []
for model_id in model_ids:
    print("=" * 100)
    print(f"Testing {model_id}")
    pipeline = ModularPipeline.from_pretrained(model_id)
    pipeline.load_components(torch_dtype={"default": torch.bfloat16, "vae": torch.float32})
    pipeline.to(device)

    print(pipeline)

    video = pipeline(
        prompt=prompt, 
        negative_prompt=negative_prompt, 
        num_inference_steps=30,
        generator=torch.Generator().manual_seed(0)
        ).videos[0]
    output_name = f"{model_id.replace('/', '_')}.mp4"
    export_to_video(video, os.path.join(output_name_prefix, output_name), fps=16)
    print(f"Saved video to {os.path.join(output_name_prefix, output_name)}")
    del pipeline
    gc.collect()
    torch.cuda.empty_cache()

# test i2v
first_frame = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flf2v_input_first_frame.png")
prompt = "CG animation style, a small blue bird takes off from the ground, flapping its wings. The bird's feathers are delicate, with a unique pattern on its chest. The background shows a blue sky with white clouds under bright sunshine. The camera follows the bird upward, capturing its flight and the vastness of the sky from a close-up, low-angle perspective."

# model_ids = ["Wan-AI/Wan2.1-I2V-14B-720P-Diffusers", "Wan-AI/Wan2.1-I2V-14B-480P-Diffusers", "Wan-AI/Wan2.2-I2V-A14B-Diffusers"]
model_ids = []
for model_id in model_ids:
    print("=" * 100)
    print(f"Testing {model_id}")
    pipeline = ModularPipeline.from_pretrained(model_id)
    pipeline.load_components(torch_dtype={"default": torch.bfloat16, "vae": torch.float32})
    pipeline.to(device)

    if "720P" in model_id:
        height = 720
        width = 1280
    else:
        height = 480
        width = 832
    video = pipeline(
        prompt=prompt,
        image=first_frame,
        height=height,
        width=width,
        num_inference_steps=40,
        generator=torch.Generator().manual_seed(0),
        output="videos"
        )[0]
    output_name = f"{model_id.replace('/', '_')}.mp4"
    export_to_video(video, os.path.join(output_name_prefix, output_name), fps=16)
    print(f"Saved video to {os.path.join(output_name_prefix, output_name)}")
    del pipeline
    gc.collect()
    torch.cuda.empty_cache()


# test flf2v
last_frame = load_image("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/flf2v_input_last_frame.png")
# model_ids = ["Wan-AI/Wan2.1-FLF2V-14B-720P-diffusers"]
model_ids = []
for model_id in model_ids:
    print("=" * 100)
    print(f"Testing {model_id}")
    pipeline = ModularPipeline.from_pretrained(model_id)
    pipeline.load_components(torch_dtype={"default": torch.bfloat16, "image_encoder": torch.float32, "vae": torch.float32})
    pipeline.to(device)

    video = pipeline(
        prompt=prompt, 
        image=first_frame,
        last_image=last_frame,
        num_inference_steps=30,
        height=720,
        width=1280,
        generator=torch.Generator().manual_seed(0)
        ).videos[0]
    output_name = f"{model_id.replace('/', '_')}.mp4"
    export_to_video(video, os.path.join(output_name_prefix, output_name), fps=16)
    print(f"Saved video to {os.path.join(output_name_prefix, output_name)}")


# test a modular setting 
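# Each block runs as a standalone node here: load only that node's components, run it on the GPU,
# then move it back to CPU before the next node, which keeps peak GPU memory low.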

print("=" * 100)
print("Testing modular settings")

model_ids = ["Wan-AI/Wan2.1-I2V-14B-480P-Diffusers", "Wan-AI/Wan2.1-FLF2V-14B-720P-diffusers"]
for model_id in model_ids:
    print("=" * 100)
    print(f"Testing {model_id}")
    blocks = ModularPipeline.from_pretrained(model_id).blocks
    print(blocks)

    text_encoder_node = blocks.sub_blocks["text_encoder"].init_pipeline(model_id)
    image_encoder_node = blocks.sub_blocks["image_encoder"].init_pipeline(model_id)
    vae_encoder_node = blocks.sub_blocks["vae_encoder"].init_pipeline(model_id)
    denoise_node = blocks.sub_blocks["denoise"].init_pipeline(model_id)
    decoder_node = blocks.sub_blocks["decode"].init_pipeline(model_id)

    text_encoder_node.load_components(torch_dtype=torch.bfloat16)
    text_encoder_node.to(device)


    text_embeddings = text_encoder_node(prompt=prompt, negative_prompt=negative_prompt).get_by_kwargs("denoiser_input_fields")
    text_encoder_node.to("cpu")

    image_encoder_node.load_components(torch_dtype=torch.float32)
    image_encoder_node.to(device)
    
    if "FLF2V" in model_id:
        image_inputs = {
            "image": first_frame,
            "last_image": last_frame,
        }
    else:
        image_inputs = {
            "image": first_frame,
        }
    image_embeds = image_encoder_node(**image_inputs).image_embeds
    image_encoder_node.to("cpu")

    vae_encoder_node.load_components(torch_dtype=torch.float32)
    vae_encoder_node.to(device)

    image_condition_latents = vae_encoder_node(**image_inputs).image_condition_latents
    vae_encoder_node.to("cpu")


    denoise_node.load_components(torch_dtype=torch.bfloat16)
    denoise_node.to(device)

    latents = denoise_node(
        **text_embeddings, 
        image_condition_latents=image_condition_latents,
        image_embeds=image_embeds,
        num_inference_steps=30, 
        generator=torch.Generator().manual_seed(0)
        ).latents

    denoise_node.to("cpu")
    decoder_node.update_components(vae=vae_encoder_node.components["vae"])
    decoder_node.to(device)

    video = decoder_node(latents=latents).videos[0]
    output_name = f"{model_id.replace('/', '_')}_modular_settings.mp4"
    export_to_video(video, os.path.join(output_name_prefix, output_name), fps=16)
    print(f"Saved video to {os.path.join(output_name_prefix, output_name)}")

@HuggingFaceDocBuilderDev

The docs for this PR live here. All of your documentation changes will be reflected on that endpoint. The docs are available until 30 days after the last update.
