HunyuanVideo, developed by Tencent, is an open-source video generation AI that is easy to use and produces good-quality video.
I tried training a HunyuanVideo LoRA on our company's AI supercomputer, 継之助.
The tool I used is diffusion-pipe.
diffusion-pipe can fine-tune not only HunyuanVideo but also LTX-Video and Flux.
Before using the tool, you first need to prepare a dataset. I reused the Pixtral-based captioning setup I built previously, with a few small modifications.
The diffusion-pipe LoRA learns from still images rather than from videos the way CogVideo does, so the first step is to extract still frames from the videos.
import subprocess
import os
import argparse

def extract_frames(video_path, output_folder, interval=6):
    # Create the output folder if it does not exist
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Input file name without the extension
    video_filename = os.path.splitext(os.path.basename(video_path))[0]

    # Build the ffmpeg command: one frame every `interval` seconds
    command = [
        'ffmpeg',
        '-i', video_path,
        '-vf', f'fps=1/{interval}',
        f'{output_folder}/{video_filename}_%04d.png'
    ]

    # Run ffmpeg
    subprocess.run(command, check=True)

def main():
    parser = argparse.ArgumentParser(description='Extract frames from a video at a fixed interval')
    parser.add_argument('video_path', help='Path to the video file')
    parser.add_argument('output_folder', help='Path to the output folder')
    parser.add_argument('-i', '--interval', type=int, default=6, help='Frame extraction interval in seconds (default: 6)')
    args = parser.parse_args()

    try:
        extract_frames(args.video_path, args.output_folder, args.interval)
        print(f"Frame extraction finished. Check the {args.output_folder} folder.")
    except subprocess.CalledProcessError:
        print("An error occurred. Make sure ffmpeg is installed.")
    except Exception as e:
        print(f"An error occurred: {str(e)}")

if __name__ == "__main__":
    main()
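Assuming this is saved as, say, extract_frames.py (the file name is my own choice, not from the original), it would be run as python extract_frames.py input.mp4 out -i 6, which writes one PNG every six seconds into the out folder.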
Next, each of the extracted still images gets a caption.
import os
import base64
from io import BytesIO
from pathlib import Path

import cv2
from PIL import Image

from mistral_inference.transformer import Transformer
from mistral_inference.generate import generate
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
from mistral_common.protocol.instruct.messages import UserMessage, TextChunk, ImageURLChunk
from mistral_common.protocol.instruct.request import ChatCompletionRequest
from huggingface_hub import snapshot_download

# Download and prepare the model
mistral_models_path = Path.home().joinpath('mistral_models', 'Pixtral')
mistral_models_path.mkdir(parents=True, exist_ok=True)
snapshot_download(repo_id="mistral-community/pixtral-12b-240910",
                  allow_patterns=["params.json", "consolidated.safetensors", "tekken.json"],
                  local_dir=mistral_models_path)

# Load the tokenizer and the model
tokenizer = MistralTokenizer.from_file(f"{mistral_models_path}/tekken.json")
model = Transformer.from_folder(mistral_models_path)

def image_to_base64(image):
    # Encode a PIL image as a base64 data URL
    buffered = BytesIO()
    image.save(buffered, format="PNG")
    img_str = base64.b64encode(buffered.getvalue()).decode()
    return f"data:image/png;base64,{img_str}"

def recognize(base64_image):
    # Ask Pixtral to describe the image in detail
    prompt = """Possible objects in the photo include the main character, a man, a young girl, air pirates, an airplane, a river, the sea, a mature woman, a building, a car, and blueprints. Based on this, explain in detail what is shown on the screen. In particular, explain the positional relationship, such as what pose the person is in, what is on the right and left, top and bottom of the screen, and estimate the camera angle."""
    completion_request = ChatCompletionRequest(
        messages=[UserMessage(content=[ImageURLChunk(image_url=base64_image), TextChunk(text=prompt)])]
    )
    encoded = tokenizer.encode_chat_completion(completion_request)
    images = encoded.images
    tokens = encoded.tokens
    out_tokens, _ = generate([tokens], model, images=[images], max_tokens=1024, temperature=0.35,
                             eos_id=tokenizer.instruct_tokenizer.tokenizer.eos_id)
    result = tokenizer.decode(out_tokens[0])
    print(result)
    return result

def extract_frames(video_path):
    # Grab the first, middle, and last frame of a video
    video = cv2.VideoCapture(video_path)
    total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))
    frames = []

    # First frame
    video.set(cv2.CAP_PROP_POS_FRAMES, 0)
    ret, frame = video.read()
    if ret:
        frames.append(frame)

    # Middle frame
    middle_frame = total_frames // 2
    video.set(cv2.CAP_PROP_POS_FRAMES, middle_frame)
    ret, frame = video.read()
    if ret:
        frames.append(frame)

    # Last frame
    video.set(cv2.CAP_PROP_POS_FRAMES, total_frames - 1)
    ret, frame = video.read()
    if ret:
        frames.append(frame)

    video.release()
    return frames

def process_videos(input_dir, output_dir):
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    for filename in os.listdir(input_dir):
        if filename.endswith('.mp4'):
            # For videos, caption the first/middle/last frames
            video_path = os.path.join(input_dir, filename)
            frames = extract_frames(video_path)
            recognized_texts = []
            for i, frame in enumerate(frames):
                image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
                base64_image = image_to_base64(image)
                recognized_text = recognize(base64_image)
                recognized_texts.append(f"Frame {i+1}: {recognized_text}")
            output_filename = os.path.splitext(filename)[0] + '.txt'
            output_path = os.path.join(output_dir, output_filename)
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write("\n\n".join(recognized_texts))
            print(f"Processed: {filename} -> {output_filename}")
        if filename.endswith('.png'):
            # For still images, caption the image directly
            print(f"Processing: {filename}")
            image = Image.open(f"{input_dir}/{filename}")
            base64_image = image_to_base64(image)
            recognized_text = recognize(base64_image)
            output_filename = os.path.splitext(filename)[0] + '.txt'
            output_path = os.path.join(output_dir, output_filename)
            with open(output_path, 'w', encoding='utf-8') as f:
                f.write(recognized_text)

# Usage
input_directory = 'out'
output_directory = 'txt'
process_videos(input_directory, output_directory)
This generates the dataset automatically.
This time I trained on 1,600 images.
Because HunyuanVideo is being trained on still images rather than video here, it may be harder for it to learn "motion" the way CogVideoX does.
Also, I forgot to add a trigger word this time. Ideally you would add a trigger word to each caption when building the dataset, but it worked out fine even without one.
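If you do want a trigger word, one simple after-the-fact fix is to prepend it to every caption file before training. The sketch below is not part of the original pipeline; it assumes the captions live in the txt directory produced by the captioning script above and uses a placeholder trigger word.

import os

TRIGGER_WORD = "mystyle"  # placeholder trigger word, pick your own
CAPTION_DIR = "txt"       # directory written by the captioning script above

for filename in os.listdir(CAPTION_DIR):
    if not filename.endswith(".txt"):
        continue
    path = os.path.join(CAPTION_DIR, filename)
    with open(path, "r", encoding="utf-8") as f:
        caption = f.read()
    # Only prepend once, in case the script is re-run
    if not caption.startswith(TRIGGER_WORD):
        with open(path, "w", encoding="utf-8") as f:
            f.write(f"{TRIGGER_WORD}, {caption}")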
Next, copy examples/dataset.toml and edit the fields you need.
# Resolutions to train on, given as the side length of a square image. You can have multiple sizes here.
resolutions = [512]
# Enable aspect ratio bucketing.
enable_ar_bucket = true
# Min and max aspect ratios, given as width/height ratio.
min_ar = 0.5
max_ar = 2.0
# Total number of aspect ratio buckets, evenly spaced (in log space) between min_ar and max_ar.
num_ar_buckets = 7
# For video training, you need to configure frame buckets (similar to aspect ratio buckets). There will always
# be a frame bucket of 1 for images. Videos will be assigned to the first frame bucket that the video is greater than or equal to in length.
# But videos are never assigned to the image frame bucket (1); if the video is very short it would just be dropped.
frame_buckets = [1, 33, 65]
[[directory]]
# Path to directory of images/videos, and corresponding caption files. The caption files should match the media file name, but with a .txt extension.
# A missing caption file will log a warning, but then just train using an empty caption.
path = '/mnt/raid6/git/diffusion-pipe/dataset'
# The dataset will act like it is duplicated this many times.
num_repeats = 10
# You can list multiple directories.
# [[directory]]
# path = '/home/anon/data/images/something_else'
# num_repeats = 5
This time the only thing I changed was the dataset path (the path entry above).
Next, edit examples/hunyuan_video.toml to set the training conditions.
# Output path for training runs. Each training run makes a new directory in here.
output_dir = '/mnt/raid6/git/diffusion-pipe/data/hunyuan_video_test'
# Dataset config file.
dataset = 'dataset.toml'
# You can have separate eval datasets. Give them a name for Tensorboard metrics.
# eval_datasets = [
# {name = 'something', config = 'path/to/eval_dataset.toml'},
# ]
# training settings
# I usually set this to a really high value because I don't know how long I want to train.
epochs = 1000
# Batch size of a single forward/backward pass for one GPU.
micro_batch_size_per_gpu = 1
# Pipeline parallelism degree. A single instance of the model is divided across this many GPUs.
pipeline_stages = 1
# Number of micro-batches sent through the pipeline for each training step.
# If pipeline_stages > 1, a higher GAS means better GPU utilization due to smaller pipeline bubbles (where GPUs aren't overlapping computation).
gradient_accumulation_steps = 4
# Grad norm clipping.
gradient_clipping = 1.0
# Learning rate warmup.
warmup_steps = 100
# eval settings
eval_every_n_epochs = 1
eval_before_first_step = true
# Might want to set these lower for eval so that less images get dropped (eval dataset size is usually much smaller than training set).
# Each size bucket of images/videos is rounded down to the nearest multiple of the global batch size, so higher global batch size means
# more dropped images. Usually doesn't matter for training but the eval set is much smaller so it can matter.
eval_micro_batch_size_per_gpu = 1
eval_gradient_accumulation_steps = 1
# misc settings
# Probably want to set this a bit higher if you have a smaller dataset so you don't end up with a million saved models.
save_every_n_epochs = 2
# Can checkpoint the training state every n number of epochs or minutes. Set only one of these. You can resume from checkpoints using the --resume_from_checkpoint flag.
#checkpoint_every_n_epochs = 1
checkpoint_every_n_minutes = 120
# Always set to true unless you have a huge amount of VRAM.
activation_checkpointing = true
# Controls how Deepspeed decides how to divide layers across GPUs. Probably don't change this.
partition_method = 'parameters'
# dtype for saving the LoRA or model, if different from training dtype
save_dtype = 'bfloat16'
# Batch size for caching latents and text embeddings. Increasing can lead to higher GPU utilization during caching phase but uses more memory.
caching_batch_size = 1
# How often deepspeed logs to console.
steps_per_print = 1
# How to extract video clips for training from a single input video file.
# The video file is first assigned to one of the configured frame buckets, but then we must extract one or more clips of exactly the right
# number of frames for that bucket.
# single_beginning: one clip starting at the beginning of the video
# single_middle: one clip from the middle of the video (cutting off the start and end equally)
# multiple_overlapping: extract the minimum number of clips to cover the full range of the video. They might overlap some.
# default is single_middle
video_clip_mode = 'single_middle'
[model]
# flux, ltx-video, or hunyuan-video
type = 'hunyuan-video'
# Can load Hunyuan Video entirely from the ckpt path set up for the official inference scripts.
#ckpt_path = '/home/anon/HunyuanVideo/ckpts'
# Or you can load it by pointing to all the ComfyUI files.
transformer_path = '/mnt/raid6/git/diffusion-pipe/hunyuan_video_720_cfgdistill_fp8_e4m3fn.safetensors'
vae_path = '/mnt/raid6/git/diffusion-pipe/hunyuan_video_vae_bf16.safetensors.1'
llm_path = '/mnt/raid6/git/HunyuanVideo/ckpts/text_encoder'
clip_path = '/mnt/raid6/git/HunyuanVideo/ckpts/text_encoder_2'
# Base dtype used for all models.
dtype = 'bfloat16'
# Hunyuan Video supports fp8 for the transformer when training LoRA.
transformer_dtype = 'float8'
# How to sample timesteps to train on. Can be logit_normal or uniform.
timestep_sample_method = 'logit_normal'
# flux example
# [model]
# type = 'flux'
# # Path to Huggingface Diffusers directory for Flux
# diffusers_path = '/data2/imagegen_models/FLUX.1-dev'
# # You can override the transformer from a BFL format checkpoint.
# transformer_path = '/data2/imagegen_models/flux-dev-single-files/consolidated_s6700-schnell.safetensors'
# dtype = 'bfloat16'
# flux_shift = true
[adapter]
type = 'lora'
rank = 32
# Dtype for the LoRA weights you are training.
dtype = 'bfloat16'
# You can initialize the lora weights from a previously trained lora.
#init_from_existing = '/data/diffusion_pipe_training_runs/something/epoch50'
[optimizer]
# AdamW from the optimi library is a good default since it automatically uses Kahan summation when training bfloat16 weights.
# Look at train.py for other options. You could also easily edit the file and add your own.
type = 'adamw_optimi'
lr = 2e-5
betas = [0.9, 0.99]
weight_decay = 0.01
eps = 1e-8
Basically, all you have to do is point these entries at the model files you downloaded.
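With both config files in place, training is launched through the DeepSpeed launcher. At the time of writing, diffusion-pipe's README shows an invocation along the lines of deepspeed --num_gpus=1 train.py --deepspeed --config examples/hunyuan_video.toml for a single GPU; check the README for your version, since the exact flags may differ.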
diffusion-pipe also supports distributed training, so 継之助 could in principle train at full power on 8x A100 80GB, but since I wanted to keep the machine free for other work I deliberately trained on a single A100 80GB. Training took a little under a week for 28 epochs.
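As a rough sanity check on that duration: 1,600 images with num_repeats = 10 is about 16,000 samples per epoch, and with micro_batch_size_per_gpu = 1, gradient_accumulation_steps = 4, and a single GPU that comes to roughly 4,000 optimizer steps per epoch, or on the order of 110,000 steps over 28 epochs (ignoring samples dropped by aspect-ratio bucketing).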
The training itself caused less trouble than other tools I have used; the only place I stumbled was installing mpi4py.
To actually apply the LoRA, use ComfyUI-HunyuanVideoWrapper.
Load the LoRA with the "HunyuanVideo Lora Select" node and connect it to the lora input of the "HunyuanVideo Model Loader" node. Don't forget to put your trigger word in the prompt (if you trained with one).
The output of HunyuanVideo before LoRA fine-tuning looks like this. The prompt was "high quality nature video of a red panda balancing on a bamboo stick while a bird lands on the panda's head, there's a waterfall in the background,anime style".
With the same prompt, the output after LoRA fine-tuning came out like this.