---
base_model:
- Qwen/Qwen-Image-Edit-2509
tags:
- lora
- image
- interpolation
- lineart
---

# Qwen-Image-Edit-2509 Lineart Interpolation

This is a LoRA weight for lineart interpolation: given two lineart frames as control images, it generates the in-between (middle) frame. It was trained on a randomly selected 10% of the train subset of the Mixamo 240 dataset for 3,000 steps.

⚠️ Notes
- Still experimental.
- Not yet thoroughly verified, but it works in some examples.

## Quick start

```python
import os
import math
import torch
import numpy as np
from PIL import Image

from diffusers import (
    QwenImageEditPipeline,
    FlowMatchEulerDiscreteScheduler,
)
from diffusers.utils.torch_utils import randn_tensor

# --------- Adjust these settings to your environment ---------
BASE_MODEL_ID = "Qwen/Qwen-Image-Edit-2509"  # same model as used for training
LORA_DIR = ""
DEVICE = "cuda"
DTYPE = torch.bfloat16

CONTROL1_IMAGE_PATH = ""  # first lineart frame
CONTROL2_IMAGE_PATH = ""  # second lineart frame

PROMPT = " middle frame"  # Don't change
NEGATIVE_PROMPT = " "

NUM_STEPS = 30
SEED = 0
# ===================


def calculate_dimensions(target_area, ratio):
    width = math.sqrt(target_area * ratio)
    height = width / ratio
    width = round(width / 32) * 32
    height = round(height / 32) * 32
    return int(width), int(height), None


def calculate_shift(
    image_seq_len,
    base_seq_len: int = 256,
    max_seq_len: int = 4096,
    base_shift: float = 0.5,
    max_shift: float = 1.15,
):
    m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
    b = base_shift - m * base_seq_len
    mu = image_seq_len * m + b
    return mu


def retrieve_timesteps(
    scheduler,
    num_inference_steps: int = None,
    device: torch.device | str | None = None,
    timesteps=None,
    sigmas=None,
    **kwargs,
):
    if timesteps is not None and sigmas is not None:
        raise ValueError("Only one of `timesteps` or `sigmas` can be passed.")
    if timesteps is not None:
        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
        timesteps = scheduler.timesteps
        num_inference_steps = len(timesteps)
    elif sigmas is not None:
        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
        timesteps = scheduler.timesteps
        num_inference_steps = len(timesteps)
    else:
        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
        timesteps = scheduler.timesteps
    return timesteps, num_inference_steps


def main():
    torch.manual_seed(SEED)

    # ---------- 1. Load the base pipeline + LoRA ----------
    pipe: QwenImageEditPipeline = QwenImageEditPipeline.from_pretrained(
        BASE_MODEL_ID,
        torch_dtype=DTYPE,
    ).to(DEVICE)
    pipe.load_lora_weights(LORA_DIR, adapter_name="my_lora")
    pipe.set_adapters(["my_lora"], adapter_weights=[1.0])

    transformer = pipe.transformer.to(DEVICE, dtype=DTYPE)
    vae = pipe.vae.to(DEVICE, dtype=DTYPE)
    scheduler: FlowMatchEulerDiscreteScheduler = pipe.scheduler
    vae_scale_factor = pipe.vae_scale_factor
    # ------------------------------------------------------------

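    # Optional (a sketch added for this card, not part of the original script):
    # because the loop below calls the transformer directly, the LoRA can also be
    # fused into the base weights once after loading to avoid per-layer adapter
    # overhead, assuming your diffusers version exposes fuse_lora for this pipeline:
    # pipe.fuse_lora(adapter_names=["my_lora"])
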
    # ---------- 2. Load & preprocess the control images ----------
    control1_img = Image.open(CONTROL1_IMAGE_PATH).convert("RGB")
    control2_img = Image.open(CONTROL2_IMAGE_PATH).convert("RGB")

    # Same resize logic as the official Qwen pipeline
    image_size = control1_img.size  # (W, H)
    calculated_width, calculated_height, _ = calculate_dimensions(
        1024 * 1024, image_size[0] / image_size[1]
    )

    # Align to multiple_of for the VAE + patch packing
    multiple_of = vae_scale_factor * 2
    width = calculated_width // multiple_of * multiple_of
    height = calculated_height // multiple_of * multiple_of

    # Same image_processor usage as the official pipeline
    control1_resized = pipe.image_processor.resize(control1_img, calculated_height, calculated_width)
    control2_resized = pipe.image_processor.resize(control2_img, calculated_height, calculated_width)

    control1_px = pipe.image_processor.preprocess(
        control1_resized, calculated_height, calculated_width
    ).unsqueeze(2)  # [B,3,1,H,W]
    control2_px = pipe.image_processor.preprocess(
        control2_resized, calculated_height, calculated_width
    ).unsqueeze(2)

    control1_px = control1_px.to(DEVICE, DTYPE)
    control2_px = control2_px.to(DEVICE, DTYPE)

    bsz = control1_px.shape[0]
    num_channels_latents = transformer.config.in_channels // 4  # = vae.config.z_dim

    # ---------- 3. Encode the control images into latent tokens ----------
    with torch.no_grad():
        # Same processing as the official _encode_vae_image
        ctrl1_latents_5d = pipe._encode_vae_image(control1_px, generator=None)  # [B, C, 1, H', W']
        ctrl2_latents_5d = pipe._encode_vae_image(control2_px, generator=None)

    H_lat, W_lat = ctrl1_latents_5d.shape[3], ctrl1_latents_5d.shape[4]

    # Pack into the token format expected by the transformer
    ctrl1_tokens = QwenImageEditPipeline._pack_latents(
        ctrl1_latents_5d, bsz, num_channels_latents, H_lat, W_lat
    )  # [B, N, C*4]
    ctrl2_tokens = QwenImageEditPipeline._pack_latents(
        ctrl2_latents_5d, bsz, num_channels_latents, H_lat, W_lat
    )

    # ---------- 4. Randomly initialize the target latent ----------
    # Same shape as the official prepare_latents
    height_lat = 2 * (height // (vae_scale_factor * 2))
    width_lat = 2 * (width // (vae_scale_factor * 2))
    shape = (bsz, 1, num_channels_latents, height_lat, width_lat)
    latents_5d = randn_tensor(shape, device=DEVICE, dtype=DTYPE)
    latents = QwenImageEditPipeline._pack_latents(
        latents_5d, bsz, num_channels_latents, height_lat, width_lat
    )
    # From here on, everything stays in [B, N, C*4] token form

    # ---------- 5. Text embeddings ----------
    with torch.no_grad():
        prompt_embeds, prompt_embeds_mask = pipe.encode_prompt(
            image=control1_resized,  # as in the official pipeline: the resized image
            prompt=[PROMPT],
            device=DEVICE,
            num_images_per_prompt=1,
            max_sequence_length=1024,
        )
    txt_seq_lens = prompt_embeds_mask.sum(dim=1).tolist()

    # img_shapes for the three streams (used for rotary embeddings)
    img_shapes = [[
        (1, height_lat // 2, width_lat // 2),  # target
        (1, height_lat // 2, width_lat // 2),  # control1
        (1, height_lat // 2, width_lat // 2),  # control2
    ]] * bsz

    # If your implementation precomputes the rotary embeddings, do it here:
    # image_rotary_emb = transformer.pos_embed(img_shapes, txt_seq_lens, device=DEVICE)

    # ---------- 6. Prepare scheduler timesteps (following the official pipeline) ----------
    sigmas = np.linspace(1.0, 1.0 / NUM_STEPS, NUM_STEPS)
    image_seq_len = latents.shape[1]  # number of tokens
    mu = calculate_shift(
        image_seq_len,
        scheduler.config.get("base_image_seq_len", 256),
        scheduler.config.get("max_image_seq_len", 4096),
        scheduler.config.get("base_shift", 0.5),
        scheduler.config.get("max_shift", 1.15),
    )
    timesteps, _ = retrieve_timesteps(
        scheduler,
        NUM_STEPS,
        device=DEVICE,
        sigmas=sigmas,
        mu=mu,
    )
    scheduler.set_begin_index(0)

    # Guidance is not used (multi-control only)
    guidance = None

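    # Shape sanity check (an illustrative addition for this card): for a square
    # input at the default 1024*1024 working area with vae_scale_factor == 8,
    # `latents` is [1, 4096, 64] here, and both control token tensors share the
    # same token count, so the concatenated transformer input in the loop below
    # carries 3 * 4096 tokens per sample.
    assert latents.shape[1] == ctrl1_tokens.shape[1] == ctrl2_tokens.shape[1]
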
    # ---------- 7. Denoising loop ----------
    with torch.no_grad():
        for i, t in enumerate(timesteps):
            # Transformer input: concat target + control1 + control2 along the token axis
            latent_model_input = torch.cat([latents, ctrl1_tokens, ctrl2_tokens], dim=1)

            # Broadcast the timestep over the batch
            timestep_batch = t.expand(latents.shape[0]).to(latents.dtype)

            model_pred_all = transformer(
                hidden_states=latent_model_input,
                timestep=timestep_batch / 1000.0,  # same scale as used for training
                guidance=guidance,
                encoder_hidden_states=prompt_embeds,
                encoder_hidden_states_mask=prompt_embeds_mask,
                img_shapes=img_shapes,  # fine for this diffusers version
                txt_seq_lens=txt_seq_lens,
                # With a newer API, pass instead:
                # image_rotary_emb=image_rotary_emb,
                # attention_kwargs=None,
                return_dict=False,
            )[0]  # [B, N_total, C*4]

            # Keep only the leading target tokens
            model_pred = model_pred_all[:, : latents.size(1)]

            latents_dtype = latents.dtype
            latents = scheduler.step(model_pred, t, latents, return_dict=False)[0]
            if latents.dtype != latents_dtype:
                latents = latents.to(latents_dtype)

    # ---------- 8. Decode ----------
    # packed tokens -> 5D latent
    latents = QwenImageEditPipeline._unpack_latents(
        latents,
        height=height,
        width=width,
        vae_scale_factor=vae_scale_factor,
    )  # [B, C, 1, H_lat, W_lat]
    latents = latents.to(vae.dtype)

    # Undo the latents_mean / latents_std normalization (same as the official pipeline)
    latents_mean = (
        torch.tensor(vae.config.latents_mean)
        .view(1, vae.config.z_dim, 1, 1, 1)
        .to(latents.device, latents.dtype)
    )
    latents_std = 1.0 / torch.tensor(vae.config.latents_std).view(
        1, vae.config.z_dim, 1, 1, 1
    ).to(latents.device, latents.dtype)
    latents = latents / latents_std + latents_mean  # [B, C, 1, H_lat, W_lat]

    # Decode (only frame 0 of T=1 is used)
    with torch.no_grad():
        image_latents = latents
        decoded = vae.decode(image_latents, return_dict=False)[0]  # [B,3,1,H,W]
        images = decoded[:, :, 0, :, :]  # [B,3,H,W]

    # [-1, 1] -> PIL via Qwen's image_processor
    images = pipe.image_processor.postprocess(images, output_type="pil")
    out: Image.Image = images[0]
    out.save("multi_control_from_controls.png")
    print("saved to multi_control_from_controls.png")


if __name__ == "__main__":
    main()
```

## Acknowledgement

- @lisiyao21 [AnimeInbet](https://github.com/lisiyao21/AnimeInbet)
- @Qwen [Qwen-Image-Edit-2509](https://huggingface.co/Qwen/Qwen-Image-Edit-2509)