| model: | |
| scale_factor: 1.15258426 | |
| disable_first_stage_autocast: true | |
| log_keys: | |
| - txt | |
| denoiser_config: | |
| target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser | |
| params: | |
| num_idx: 1000 | |
| quantize_c_noise: false | |
| weighting_config: | |
| target: sgm.modules.diffusionmodules.denoiser_weighting.EpsWeighting | |
| scaling_config: | |
| target: sgm.modules.diffusionmodules.denoiser_scaling.VideoScaling | |
| discretization_config: | |
| target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization | |
| params: | |
| shift_scale: 3.0 | |
| network_config: | |
| target: dit_video_concat.DiffusionTransformer | |
| params: | |
| time_embed_dim: 512 | |
| elementwise_affine: true | |
| num_frames: 49 | |
| time_compressed_rate: 4 | |
| latent_width: 90 | |
| latent_height: 60 | |
| num_layers: 30 | |
| patch_size: 2 | |
| in_channels: 16 | |
| out_channels: 16 | |
| hidden_size: 1920 | |
| adm_in_channels: 256 | |
| num_attention_heads: 30 | |
| transformer_args: | |
| checkpoint_activations: true | |
| vocab_size: 1 | |
| max_sequence_length: 64 | |
| layernorm_order: pre | |
| skip_init: false | |
| model_parallel_size: 1 | |
| is_decoder: false | |
| modules: | |
| pos_embed_config: | |
| target: dit_video_concat.Basic3DPositionEmbeddingMixin | |
| params: | |
| text_length: 226 | |
| height_interpolation: 1.875 | |
| width_interpolation: 1.875 | |
| patch_embed_config: | |
| target: dit_video_concat.ImagePatchEmbeddingMixin | |
| params: | |
| text_hidden_size: 4096 | |
| adaln_layer_config: | |
| target: dit_video_concat.AdaLNMixin | |
| params: | |
| qk_ln: true | |
| final_layer_config: | |
| target: dit_video_concat.FinalLayerMixin | |
| conditioner_config: | |
| target: sgm.modules.GeneralConditioner | |
| params: | |
| emb_models: | |
| - is_trainable: false | |
| input_key: txt | |
| ucg_rate: 0.1 | |
| target: sgm.modules.encoders.modules.FrozenT5Embedder | |
| params: | |
| model_dir: /mnt/lustre/sichenyang.p/code/vla/CogVideo/sat/CogVideoX-2b-sat/t5-v1_1-xxl | |
| max_length: 226 | |
| first_stage_config: | |
| target: vae_modules.autoencoder.VideoAutoencoderInferenceWrapper | |
| params: | |
| cp_size: 1 | |
| ckpt_path: /mnt/lustre/sichenyang.p/code/vla/CogVideo/sat/CogVideoX-2b-sat/vae/3d-vae.pt | |
| ignore_keys: | |
| - loss | |
| loss_config: | |
| target: torch.nn.Identity | |
| regularizer_config: | |
| target: vae_modules.regularizers.DiagonalGaussianRegularizer | |
| encoder_config: | |
| target: vae_modules.cp_enc_dec.ContextParallelEncoder3D | |
| params: | |
| double_z: true | |
| z_channels: 16 | |
| resolution: 256 | |
| in_channels: 3 | |
| out_ch: 3 | |
| ch: 128 | |
| ch_mult: | |
| - 1 | |
| - 2 | |
| - 2 | |
| - 4 | |
| attn_resolutions: [] | |
| num_res_blocks: 3 | |
| dropout: 0.0 | |
| gather_norm: true | |
| decoder_config: | |
| target: vae_modules.cp_enc_dec.ContextParallelDecoder3D | |
| params: | |
| double_z: true | |
| z_channels: 16 | |
| resolution: 256 | |
| in_channels: 3 | |
| out_ch: 3 | |
| ch: 128 | |
| ch_mult: | |
| - 1 | |
| - 2 | |
| - 2 | |
| - 4 | |
| attn_resolutions: [] | |
| num_res_blocks: 3 | |
| dropout: 0.0 | |
| gather_norm: false | |
| loss_fn_config: | |
| target: sgm.modules.diffusionmodules.loss.VideoDiffusionLoss | |
| params: | |
| offset_noise_level: 0 | |
| sigma_sampler_config: | |
| target: sgm.modules.diffusionmodules.sigma_sampling.DiscreteSampling | |
| params: | |
| uniform_sampling: true | |
| num_idx: 1000 | |
| discretization_config: | |
| target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization | |
| params: | |
| shift_scale: 3.0 | |
| sampler_config: | |
| target: sgm.modules.diffusionmodules.sampling.VPSDEDPMPP2MSampler | |
| params: | |
| num_steps: 50 | |
| verbose: true | |
| discretization_config: | |
| target: sgm.modules.diffusionmodules.discretizer.ZeroSNRDDPMDiscretization | |
| params: | |
| shift_scale: 3.0 | |
| guider_config: | |
| target: sgm.modules.diffusionmodules.guiders.DynamicCFG | |
| params: | |
| scale: 6 | |
| exp: 5 | |
| num_steps: 50 | |
| args: | |
| checkpoint_activations: true | |
| model_parallel_size: 1 | |
| experiment_name: dense_exp_6layer_gating_0.00002lr_all_continue | |
| mode: finetune | |
| load: /mnt/petrelfs/sichenyang.p/code/vla/CogVideo/sat_scy/ckpts_2b_lora/dense_exp_6layer_gating_0.00002lr_all_continue-09-20-12-08 | |
| no_load_rng: true | |
| train_iters: 100000 | |
| eval_iters: 1 | |
| eval_interval: 100 | |
| eval_batch_size: 1 | |
| save: ckpts_2b_lora | |
| save_interval: 1000 | |
| log_interval: 20 | |
| train_data: | |
| - /mnt/petrelfs/sichenyang.p/code/video_project/assets/data/mix_high_quality/vimeo+youtube+vecteezy+gen3.json | |
| valid_data: | |
| - /mnt/lustre/sichenyang.p/code/SD3_Vid/dataset_collection/data/gen3/all.json | |
| split: 1,0,0 | |
| num_workers: 8 | |
| force_train: true | |
| only_log_video_latents: true | |
| data: | |
| target: data_video.PetrelDataset | |
| params: | |
| video_size: | |
| - 480 | |
| - 720 | |
| fps: 8 | |
| max_num_frames: 49 | |
| skip_frms_num: 3.0 | |
| deepspeed: | |
| train_micro_batch_size_per_gpu: 2 | |
| gradient_accumulation_steps: 1 | |
| steps_per_print: 50 | |
| gradient_clipping: 0.1 | |
| zero_optimization: | |
| stage: 2 | |
| cpu_offload: false | |
| contiguous_gradients: false | |
| overlap_comm: true | |
| reduce_scatter: true | |
| reduce_bucket_size: 1000000000 | |
| allgather_bucket_size: 1000000000 | |
| load_from_fp32_weights: false | |
| zero_allow_untested_optimizer: true | |
| bf16: | |
| enabled: false | |
| fp16: | |
| enabled: true | |
| loss_scale: 0 | |
| loss_scale_window: 400 | |
| hysteresis: 2 | |
| min_loss_scale: 1 | |
| optimizer: | |
| type: sat.ops.FusedEmaAdam | |
| params: | |
| lr: 2.0e-05 | |
| betas: | |
| - 0.9 | |
| - 0.95 | |
| eps: 1.0e-08 | |
| weight_decay: 0.0001 | |
| activation_checkpointing: | |
| partition_activations: false | |
| contiguous_memory_optimization: false | |
| wall_clock_breakdown: false | |