diffusers库中Stable Diffusion模块的解析
在diffusers中，Stable Diffusion v1.5主要由以下几个部分组成（即pipeline的组件键）：
Out[3]: dict_keys(['vae', 'text_encoder', 'tokenizer', 'unet', 'scheduler', 'safety_checker', 'feature_extractor'])
下面给出具体的结构说明。
“text_encoder block”
CLIPTextModel((text_model): CLIPTextTransformer((embeddings): CLIPTextEmbeddings((token_embedding): Embedding(49408, 768)(position_embedding): Embedding(77, 768))(encoder): CLIPEncoder((layers): ModuleList((0-11): 12 x CLIPEncoderLayer((self_attn): CLIPAttention((k_proj): Linear(in_features=768, out_features=768, bias=True)(v_proj): Linear(in_features=768, out_features=768, bias=True)(q_proj): Linear(in_features=768, out_features=768, bias=True)(out_proj): Linear(in_features=768, out_features=768, bias=True))(layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)(mlp): CLIPMLP((activation_fn): QuickGELUActivation()(fc1): Linear(in_features=768, out_features=3072, bias=True)(fc2): Linear(in_features=3072, out_features=768, bias=True))(layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True))))(final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True))
)
“vae block”
AutoencoderKL((encoder): Encoder((conv_in): Conv2d(3, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(down_blocks): ModuleList((0): DownEncoderBlock2D((resnets): ModuleList((0-1): 2 x ResnetBlock2D((norm1): GroupNorm(32, 128, eps=1e-06, affine=True)(conv1): LoRACompatibleConv(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(norm2): GroupNorm(32, 128, eps=1e-06, affine=True)(dropout): Dropout(p=0.0, inplace=False)(conv2): LoRACompatibleConv(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(nonlinearity): SiLU()))(downsamplers): ModuleList((0): Downsample2D((conv): LoRACompatibleConv(128, 128, kernel_size=(3, 3), stride=(2, 2)))))(1): DownEncoderBlock2D((resnets): ModuleList((0): ResnetBlock2D((norm1): GroupNorm(32, 128, eps=1e-06, affine=True)(conv1): LoRACompatibleConv(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(norm2): GroupNorm(32, 256, eps=1e-06, affine=True)(dropout): Dropout(p=0.0, inplace=False)(conv2): LoRACompatibleConv(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(nonlinearity): SiLU()(conv_shortcut): LoRACompatibleConv(128, 256, kernel_size=(1, 1), stride=(1, 1)))(1): ResnetBlock2D((norm1): GroupNorm(32, 256, eps=1e-06, affine=True)(conv1): LoRACompatibleConv(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(norm2): GroupNorm(32, 256, eps=1e-06, affine=True)(dropout): Dropout(p=0.0, inplace=False)(conv2): LoRACompatibleConv(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(nonlinearity): SiLU()))(downsamplers): ModuleList((0): Downsample2D((conv): LoRACompatibleConv(256, 256, kernel_size=(3, 3), stride=(2, 2)))))(2): DownEncoderBlock2D((resnets): ModuleList((0): ResnetBlock2D((norm1): GroupNorm(32, 256, eps=1e-06, affine=True)(conv1): LoRACompatibleConv(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(norm2): GroupNorm(32, 512, eps=1e-06, affine=True)(dropout): Dropout(p=0.0, inplace=False)(conv2): LoRACompatibleConv(512, 512, kernel_size=(3, 3), stride=(1, 
1), padding=(1, 1))(nonlinearity): SiLU()(conv_shortcut): LoRACompatibleConv(256, 512, kernel_size=(1, 1), stride=(1, 1)))(1): ResnetBlock2D((norm1): GroupNorm(32, 512, eps=1e-06, affine=True)(conv1): LoRACompatibleConv(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(norm2): GroupNorm(32, 512, eps=1e-06, affine=True)(dropout): Dropout(p=0.0, inplace=False)(conv2): LoRACompatibleConv(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(nonlinearity): SiLU()))(downsamplers): ModuleList((0): Downsample2D((conv): LoRACompatibleConv(512, 512, kernel_size=(3, 3), stride=(2, 2)))))(3): DownEncoderBlock2D((resnets): ModuleList((0-1): 2 x ResnetBlock2D((norm1): GroupNorm(32, 512, eps=1e-06, affine=True)(conv1): LoRACompatibleConv(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(norm2): GroupNorm(32, 512, eps=1e-06, affine=True)(dropout): Dropout(p=0.0, inplace=False)(conv2): LoRACompatibleConv(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(nonlinearity): SiLU()))))(mid_block): UNetMidBlock2D((attentions): ModuleList((0): Attention((group_norm): GroupNorm(32, 512, eps=1e-06, affine=True)(to_q): LoRACompatibleLinear(in_features=512, out_features=512, bias=True)(to_k): LoRACompatibleLinear(in_features=512, out_features=512, bias=True)(to_v): LoRACompatibleLinear(in_features=512, out_features=512, bias=True)(to_out): ModuleList((0): LoRACompatibleLinear(in_features=512, out_features=512, bias=True)(1): Dropout(p=0.0, inplace=False))))(resnets): ModuleList((0-1): 2 x ResnetBlock2D((norm1): GroupNorm(32, 512, eps=1e-06, affine=True)(conv1): LoRACompatibleConv(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(norm2): GroupNorm(32, 512, eps=1e-06, affine=True)(dropout): Dropout(p=0.0, inplace=False)(conv2): LoRACompatibleConv(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(nonlinearity): SiLU())))(conv_norm_out): GroupNorm(32, 512, eps=1e-06, affine=True)(conv_act): SiLU()(conv_out): Conv2d(512, 8, 
kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)))(decoder): Decoder((conv_in): Conv2d(4, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(up_blocks): ModuleList((0-1): 2 x UpDecoderBlock2D((resnets): ModuleList((0-2): 3 x ResnetBlock2D((norm1): GroupNorm(32, 512, eps=1e-06, affine=True)(conv1): LoRACompatibleConv(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(norm2): GroupNorm(32, 512, eps=1e-06, affine=True)(dropout): Dropout(p=0.0, inplace=False)(conv2): LoRACompatibleConv(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(nonlinearity): SiLU()))(upsamplers): ModuleList((0): Upsample2D((conv): LoRACompatibleConv(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)))))(2): UpDecoderBlock2D((resnets): ModuleList((0): ResnetBlock2D((norm1): GroupNorm(32, 512, eps=1e-06, affine=True)(conv1): LoRACompatibleConv(512, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(norm2): GroupNorm(32, 256, eps=1e-06, affine=True)(dropout): Dropout(p=0.0, inplace=False)(conv2): LoRACompatibleConv(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(nonlinearity): SiLU()(conv_shortcut): LoRACompatibleConv(512, 256, kernel_size=(1, 1), stride=(1, 1)))(1-2): 2 x ResnetBlock2D((norm1): GroupNorm(32, 256, eps=1e-06, affine=True)(conv1): LoRACompatibleConv(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(norm2): GroupNorm(32, 256, eps=1e-06, affine=True)(dropout): Dropout(p=0.0, inplace=False)(conv2): LoRACompatibleConv(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(nonlinearity): SiLU()))(upsamplers): ModuleList((0): Upsample2D((conv): LoRACompatibleConv(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)))))(3): UpDecoderBlock2D((resnets): ModuleList((0): ResnetBlock2D((norm1): GroupNorm(32, 256, eps=1e-06, affine=True)(conv1): LoRACompatibleConv(256, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(norm2): GroupNorm(32, 128, eps=1e-06, affine=True)(dropout): Dropout(p=0.0, 
inplace=False)(conv2): LoRACompatibleConv(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(nonlinearity): SiLU()(conv_shortcut): LoRACompatibleConv(256, 128, kernel_size=(1, 1), stride=(1, 1)))(1-2): 2 x ResnetBlock2D((norm1): GroupNorm(32, 128, eps=1e-06, affine=True)(conv1): LoRACompatibleConv(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(norm2): GroupNorm(32, 128, eps=1e-06, affine=True)(dropout): Dropout(p=0.0, inplace=False)(conv2): LoRACompatibleConv(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(nonlinearity): SiLU()))))(mid_block): UNetMidBlock2D((attentions): ModuleList((0): Attention((group_norm): GroupNorm(32, 512, eps=1e-06, affine=True)(to_q): LoRACompatibleLinear(in_features=512, out_features=512, bias=True)(to_k): LoRACompatibleLinear(in_features=512, out_features=512, bias=True)(to_v): LoRACompatibleLinear(in_features=512, out_features=512, bias=True)(to_out): ModuleList((0): LoRACompatibleLinear(in_features=512, out_features=512, bias=True)(1): Dropout(p=0.0, inplace=False))))(resnets): ModuleList((0-1): 2 x ResnetBlock2D((norm1): GroupNorm(32, 512, eps=1e-06, affine=True)(conv1): LoRACompatibleConv(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(norm2): GroupNorm(32, 512, eps=1e-06, affine=True)(dropout): Dropout(p=0.0, inplace=False)(conv2): LoRACompatibleConv(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(nonlinearity): SiLU())))(conv_norm_out): GroupNorm(32, 128, eps=1e-06, affine=True)(conv_act): SiLU()(conv_out): Conv2d(128, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)))(quant_conv): Conv2d(8, 8, kernel_size=(1, 1), stride=(1, 1))(post_quant_conv): Conv2d(4, 4, kernel_size=(1, 1), stride=(1, 1))
)
“unet block”
UNet2DConditionModel((conv_in): Conv2d(4, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(time_proj): Timesteps()(time_embedding): TimestepEmbedding((linear_1): LoRACompatibleLinear(in_features=320, out_features=1280, bias=True)(act): SiLU()(linear_2): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True))(down_blocks): ModuleList((0): CrossAttnDownBlock2D((attentions): ModuleList((0-1): 2 x Transformer2DModel((norm): GroupNorm(32, 320, eps=1e-06, affine=True)(proj_in): LoRACompatibleConv(320, 320, kernel_size=(1, 1), stride=(1, 1))(transformer_blocks): ModuleList((0): BasicTransformerBlock((norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True)(attn1): Attention((to_q): LoRACompatibleLinear(in_features=320, out_features=320, bias=False)(to_k): LoRACompatibleLinear(in_features=320, out_features=320, bias=False)(to_v): LoRACompatibleLinear(in_features=320, out_features=320, bias=False)(to_out): ModuleList((0): LoRACompatibleLinear(in_features=320, out_features=320, bias=True)(1): Dropout(p=0.0, inplace=False)))(norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True)(attn2): Attention((to_q): LoRACompatibleLinear(in_features=320, out_features=320, bias=False)(to_k): LoRACompatibleLinear(in_features=768, out_features=320, bias=False)(to_v): LoRACompatibleLinear(in_features=768, out_features=320, bias=False)(to_out): ModuleList((0): LoRACompatibleLinear(in_features=320, out_features=320, bias=True)(1): Dropout(p=0.0, inplace=False)))(norm3): LayerNorm((320,), eps=1e-05, elementwise_affine=True)(ff): FeedForward((net): ModuleList((0): GEGLU((proj): LoRACompatibleLinear(in_features=320, out_features=2560, bias=True))(1): Dropout(p=0.0, inplace=False)(2): LoRACompatibleLinear(in_features=1280, out_features=320, bias=True)))))(proj_out): LoRACompatibleConv(320, 320, kernel_size=(1, 1), stride=(1, 1))))(resnets): ModuleList((0-1): 2 x ResnetBlock2D((norm1): GroupNorm(32, 320, eps=1e-05, affine=True)(conv1): LoRACompatibleConv(320, 320, 
kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=320, bias=True)(norm2): GroupNorm(32, 320, eps=1e-05, affine=True)(dropout): Dropout(p=0.0, inplace=False)(conv2): LoRACompatibleConv(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(nonlinearity): SiLU()))(downsamplers): ModuleList((0): Downsample2D((conv): LoRACompatibleConv(320, 320, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)))))(1): CrossAttnDownBlock2D((attentions): ModuleList((0-1): 2 x Transformer2DModel((norm): GroupNorm(32, 640, eps=1e-06, affine=True)(proj_in): LoRACompatibleConv(640, 640, kernel_size=(1, 1), stride=(1, 1))(transformer_blocks): ModuleList((0): BasicTransformerBlock((norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True)(attn1): Attention((to_q): LoRACompatibleLinear(in_features=640, out_features=640, bias=False)(to_k): LoRACompatibleLinear(in_features=640, out_features=640, bias=False)(to_v): LoRACompatibleLinear(in_features=640, out_features=640, bias=False)(to_out): ModuleList((0): LoRACompatibleLinear(in_features=640, out_features=640, bias=True)(1): Dropout(p=0.0, inplace=False)))(norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True)(attn2): Attention((to_q): LoRACompatibleLinear(in_features=640, out_features=640, bias=False)(to_k): LoRACompatibleLinear(in_features=768, out_features=640, bias=False)(to_v): LoRACompatibleLinear(in_features=768, out_features=640, bias=False)(to_out): ModuleList((0): LoRACompatibleLinear(in_features=640, out_features=640, bias=True)(1): Dropout(p=0.0, inplace=False)))(norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True)(ff): FeedForward((net): ModuleList((0): GEGLU((proj): LoRACompatibleLinear(in_features=640, out_features=5120, bias=True))(1): Dropout(p=0.0, inplace=False)(2): LoRACompatibleLinear(in_features=2560, out_features=640, bias=True)))))(proj_out): LoRACompatibleConv(640, 640, kernel_size=(1, 1), stride=(1, 1))))(resnets): 
ModuleList((0): ResnetBlock2D((norm1): GroupNorm(32, 320, eps=1e-05, affine=True)(conv1): LoRACompatibleConv(320, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=640, bias=True)(norm2): GroupNorm(32, 640, eps=1e-05, affine=True)(dropout): Dropout(p=0.0, inplace=False)(conv2): LoRACompatibleConv(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(nonlinearity): SiLU()(conv_shortcut): LoRACompatibleConv(320, 640, kernel_size=(1, 1), stride=(1, 1)))(1): ResnetBlock2D((norm1): GroupNorm(32, 640, eps=1e-05, affine=True)(conv1): LoRACompatibleConv(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=640, bias=True)(norm2): GroupNorm(32, 640, eps=1e-05, affine=True)(dropout): Dropout(p=0.0, inplace=False)(conv2): LoRACompatibleConv(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(nonlinearity): SiLU()))(downsamplers): ModuleList((0): Downsample2D((conv): LoRACompatibleConv(640, 640, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)))))(2): CrossAttnDownBlock2D((attentions): ModuleList((0-1): 2 x Transformer2DModel((norm): GroupNorm(32, 1280, eps=1e-06, affine=True)(proj_in): LoRACompatibleConv(1280, 1280, kernel_size=(1, 1), stride=(1, 1))(transformer_blocks): ModuleList((0): BasicTransformerBlock((norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)(attn1): Attention((to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)(to_k): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)(to_v): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)(to_out): ModuleList((0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)(1): Dropout(p=0.0, inplace=False)))(norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)(attn2): Attention((to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)(to_k): 
LoRACompatibleLinear(in_features=768, out_features=1280, bias=False)(to_v): LoRACompatibleLinear(in_features=768, out_features=1280, bias=False)(to_out): ModuleList((0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)(1): Dropout(p=0.0, inplace=False)))(norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)(ff): FeedForward((net): ModuleList((0): GEGLU((proj): LoRACompatibleLinear(in_features=1280, out_features=10240, bias=True))(1): Dropout(p=0.0, inplace=False)(2): LoRACompatibleLinear(in_features=5120, out_features=1280, bias=True)))))(proj_out): LoRACompatibleConv(1280, 1280, kernel_size=(1, 1), stride=(1, 1))))(resnets): ModuleList((0): ResnetBlock2D((norm1): GroupNorm(32, 640, eps=1e-05, affine=True)(conv1): LoRACompatibleConv(640, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)(norm2): GroupNorm(32, 1280, eps=1e-05, affine=True)(dropout): Dropout(p=0.0, inplace=False)(conv2): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(nonlinearity): SiLU()(conv_shortcut): LoRACompatibleConv(640, 1280, kernel_size=(1, 1), stride=(1, 1)))(1): ResnetBlock2D((norm1): GroupNorm(32, 1280, eps=1e-05, affine=True)(conv1): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)(norm2): GroupNorm(32, 1280, eps=1e-05, affine=True)(dropout): Dropout(p=0.0, inplace=False)(conv2): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(nonlinearity): SiLU()))(downsamplers): ModuleList((0): Downsample2D((conv): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)))))(3): DownBlock2D((resnets): ModuleList((0-1): 2 x ResnetBlock2D((norm1): GroupNorm(32, 1280, eps=1e-05, affine=True)(conv1): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 
1))(time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)(norm2): GroupNorm(32, 1280, eps=1e-05, affine=True)(dropout): Dropout(p=0.0, inplace=False)(conv2): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(nonlinearity): SiLU()))))(up_blocks): ModuleList((0): UpBlock2D((resnets): ModuleList((0-2): 3 x ResnetBlock2D((norm1): GroupNorm(32, 2560, eps=1e-05, affine=True)(conv1): LoRACompatibleConv(2560, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)(norm2): GroupNorm(32, 1280, eps=1e-05, affine=True)(dropout): Dropout(p=0.0, inplace=False)(conv2): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(nonlinearity): SiLU()(conv_shortcut): LoRACompatibleConv(2560, 1280, kernel_size=(1, 1), stride=(1, 1))))(upsamplers): ModuleList((0): Upsample2D((conv): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)))))(1): CrossAttnUpBlock2D((attentions): ModuleList((0-2): 3 x Transformer2DModel((norm): GroupNorm(32, 1280, eps=1e-06, affine=True)(proj_in): LoRACompatibleConv(1280, 1280, kernel_size=(1, 1), stride=(1, 1))(transformer_blocks): ModuleList((0): BasicTransformerBlock((norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)(attn1): Attention((to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)(to_k): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)(to_v): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)(to_out): ModuleList((0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)(1): Dropout(p=0.0, inplace=False)))(norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)(attn2): Attention((to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)(to_k): LoRACompatibleLinear(in_features=768, out_features=1280, bias=False)(to_v): 
LoRACompatibleLinear(in_features=768, out_features=1280, bias=False)(to_out): ModuleList((0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)(1): Dropout(p=0.0, inplace=False)))(norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)(ff): FeedForward((net): ModuleList((0): GEGLU((proj): LoRACompatibleLinear(in_features=1280, out_features=10240, bias=True))(1): Dropout(p=0.0, inplace=False)(2): LoRACompatibleLinear(in_features=5120, out_features=1280, bias=True)))))(proj_out): LoRACompatibleConv(1280, 1280, kernel_size=(1, 1), stride=(1, 1))))(resnets): ModuleList((0-1): 2 x ResnetBlock2D((norm1): GroupNorm(32, 2560, eps=1e-05, affine=True)(conv1): LoRACompatibleConv(2560, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)(norm2): GroupNorm(32, 1280, eps=1e-05, affine=True)(dropout): Dropout(p=0.0, inplace=False)(conv2): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(nonlinearity): SiLU()(conv_shortcut): LoRACompatibleConv(2560, 1280, kernel_size=(1, 1), stride=(1, 1)))(2): ResnetBlock2D((norm1): GroupNorm(32, 1920, eps=1e-05, affine=True)(conv1): LoRACompatibleConv(1920, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)(norm2): GroupNorm(32, 1280, eps=1e-05, affine=True)(dropout): Dropout(p=0.0, inplace=False)(conv2): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(nonlinearity): SiLU()(conv_shortcut): LoRACompatibleConv(1920, 1280, kernel_size=(1, 1), stride=(1, 1))))(upsamplers): ModuleList((0): Upsample2D((conv): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)))))(2): CrossAttnUpBlock2D((attentions): ModuleList((0-2): 3 x Transformer2DModel((norm): GroupNorm(32, 640, eps=1e-06, affine=True)(proj_in): LoRACompatibleConv(640, 640, kernel_size=(1, 1), 
stride=(1, 1))(transformer_blocks): ModuleList((0): BasicTransformerBlock((norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True)(attn1): Attention((to_q): LoRACompatibleLinear(in_features=640, out_features=640, bias=False)(to_k): LoRACompatibleLinear(in_features=640, out_features=640, bias=False)(to_v): LoRACompatibleLinear(in_features=640, out_features=640, bias=False)(to_out): ModuleList((0): LoRACompatibleLinear(in_features=640, out_features=640, bias=True)(1): Dropout(p=0.0, inplace=False)))(norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True)(attn2): Attention((to_q): LoRACompatibleLinear(in_features=640, out_features=640, bias=False)(to_k): LoRACompatibleLinear(in_features=768, out_features=640, bias=False)(to_v): LoRACompatibleLinear(in_features=768, out_features=640, bias=False)(to_out): ModuleList((0): LoRACompatibleLinear(in_features=640, out_features=640, bias=True)(1): Dropout(p=0.0, inplace=False)))(norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True)(ff): FeedForward((net): ModuleList((0): GEGLU((proj): LoRACompatibleLinear(in_features=640, out_features=5120, bias=True))(1): Dropout(p=0.0, inplace=False)(2): LoRACompatibleLinear(in_features=2560, out_features=640, bias=True)))))(proj_out): LoRACompatibleConv(640, 640, kernel_size=(1, 1), stride=(1, 1))))(resnets): ModuleList((0): ResnetBlock2D((norm1): GroupNorm(32, 1920, eps=1e-05, affine=True)(conv1): LoRACompatibleConv(1920, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=640, bias=True)(norm2): GroupNorm(32, 640, eps=1e-05, affine=True)(dropout): Dropout(p=0.0, inplace=False)(conv2): LoRACompatibleConv(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(nonlinearity): SiLU()(conv_shortcut): LoRACompatibleConv(1920, 640, kernel_size=(1, 1), stride=(1, 1)))(1): ResnetBlock2D((norm1): GroupNorm(32, 1280, eps=1e-05, affine=True)(conv1): LoRACompatibleConv(1280, 640, kernel_size=(3, 3), 
stride=(1, 1), padding=(1, 1))(time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=640, bias=True)(norm2): GroupNorm(32, 640, eps=1e-05, affine=True)(dropout): Dropout(p=0.0, inplace=False)(conv2): LoRACompatibleConv(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(nonlinearity): SiLU()(conv_shortcut): LoRACompatibleConv(1280, 640, kernel_size=(1, 1), stride=(1, 1)))(2): ResnetBlock2D((norm1): GroupNorm(32, 960, eps=1e-05, affine=True)(conv1): LoRACompatibleConv(960, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=640, bias=True)(norm2): GroupNorm(32, 640, eps=1e-05, affine=True)(dropout): Dropout(p=0.0, inplace=False)(conv2): LoRACompatibleConv(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(nonlinearity): SiLU()(conv_shortcut): LoRACompatibleConv(960, 640, kernel_size=(1, 1), stride=(1, 1))))(upsamplers): ModuleList((0): Upsample2D((conv): LoRACompatibleConv(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)))))(3): CrossAttnUpBlock2D((attentions): ModuleList((0-2): 3 x Transformer2DModel((norm): GroupNorm(32, 320, eps=1e-06, affine=True)(proj_in): LoRACompatibleConv(320, 320, kernel_size=(1, 1), stride=(1, 1))(transformer_blocks): ModuleList((0): BasicTransformerBlock((norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True)(attn1): Attention((to_q): LoRACompatibleLinear(in_features=320, out_features=320, bias=False)(to_k): LoRACompatibleLinear(in_features=320, out_features=320, bias=False)(to_v): LoRACompatibleLinear(in_features=320, out_features=320, bias=False)(to_out): ModuleList((0): LoRACompatibleLinear(in_features=320, out_features=320, bias=True)(1): Dropout(p=0.0, inplace=False)))(norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True)(attn2): Attention((to_q): LoRACompatibleLinear(in_features=320, out_features=320, bias=False)(to_k): LoRACompatibleLinear(in_features=768, out_features=320, bias=False)(to_v): 
LoRACompatibleLinear(in_features=768, out_features=320, bias=False)(to_out): ModuleList((0): LoRACompatibleLinear(in_features=320, out_features=320, bias=True)(1): Dropout(p=0.0, inplace=False)))(norm3): LayerNorm((320,), eps=1e-05, elementwise_affine=True)(ff): FeedForward((net): ModuleList((0): GEGLU((proj): LoRACompatibleLinear(in_features=320, out_features=2560, bias=True))(1): Dropout(p=0.0, inplace=False)(2): LoRACompatibleLinear(in_features=1280, out_features=320, bias=True)))))(proj_out): LoRACompatibleConv(320, 320, kernel_size=(1, 1), stride=(1, 1))))(resnets): ModuleList((0): ResnetBlock2D((norm1): GroupNorm(32, 960, eps=1e-05, affine=True)(conv1): LoRACompatibleConv(960, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=320, bias=True)(norm2): GroupNorm(32, 320, eps=1e-05, affine=True)(dropout): Dropout(p=0.0, inplace=False)(conv2): LoRACompatibleConv(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(nonlinearity): SiLU()(conv_shortcut): LoRACompatibleConv(960, 320, kernel_size=(1, 1), stride=(1, 1)))(1-2): 2 x ResnetBlock2D((norm1): GroupNorm(32, 640, eps=1e-05, affine=True)(conv1): LoRACompatibleConv(640, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=320, bias=True)(norm2): GroupNorm(32, 320, eps=1e-05, affine=True)(dropout): Dropout(p=0.0, inplace=False)(conv2): LoRACompatibleConv(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(nonlinearity): SiLU()(conv_shortcut): LoRACompatibleConv(640, 320, kernel_size=(1, 1), stride=(1, 1))))))(mid_block): UNetMidBlock2DCrossAttn((attentions): ModuleList((0): Transformer2DModel((norm): GroupNorm(32, 1280, eps=1e-06, affine=True)(proj_in): LoRACompatibleConv(1280, 1280, kernel_size=(1, 1), stride=(1, 1))(transformer_blocks): ModuleList((0): BasicTransformerBlock((norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)(attn1): 
Attention((to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)(to_k): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)(to_v): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)(to_out): ModuleList((0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)(1): Dropout(p=0.0, inplace=False)))(norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)(attn2): Attention((to_q): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=False)(to_k): LoRACompatibleLinear(in_features=768, out_features=1280, bias=False)(to_v): LoRACompatibleLinear(in_features=768, out_features=1280, bias=False)(to_out): ModuleList((0): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)(1): Dropout(p=0.0, inplace=False)))(norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)(ff): FeedForward((net): ModuleList((0): GEGLU((proj): LoRACompatibleLinear(in_features=1280, out_features=10240, bias=True))(1): Dropout(p=0.0, inplace=False)(2): LoRACompatibleLinear(in_features=5120, out_features=1280, bias=True)))))(proj_out): LoRACompatibleConv(1280, 1280, kernel_size=(1, 1), stride=(1, 1))))(resnets): ModuleList((0-1): 2 x ResnetBlock2D((norm1): GroupNorm(32, 1280, eps=1e-05, affine=True)(conv1): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(time_emb_proj): LoRACompatibleLinear(in_features=1280, out_features=1280, bias=True)(norm2): GroupNorm(32, 1280, eps=1e-05, affine=True)(dropout): Dropout(p=0.0, inplace=False)(conv2): LoRACompatibleConv(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))(nonlinearity): SiLU())))(conv_norm_out): GroupNorm(32, 320, eps=1e-05, affine=True)(conv_act): SiLU()(conv_out): Conv2d(320, 4, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
)
“feature_extractor block”
CLIPImageProcessor {"crop_size": {"height": 224,"width": 224},"do_center_crop": true,"do_convert_rgb": true,"do_normalize": true,"do_rescale": true,"do_resize": true,"feature_extractor_type": "CLIPFeatureExtractor","image_mean": [0.48145466,0.4578275,0.40821073],"image_processor_type": "CLIPImageProcessor","image_std": [0.26862954,0.26130258,0.27577711],"resample": 3,"rescale_factor": 0.00392156862745098,"size": {"shortest_edge": 224},"use_square_size": false
}
“tokenizer block”
CLIPTokenizer(name_or_path='/home/tiger/.cache/huggingface/hub/models--runwayml--stable-diffusion-v1-5/snapshots/1d0c4ebf6ff58a5caecab40fa1406526bca4b5b9/tokenizer', vocab_size=49408, model_max_length=77, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|startoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True), added_tokens_decoder={49406: AddedToken("<|startoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),49407: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}
“safety_checker block”
StableDiffusionSafetyChecker((vision_model): CLIPVisionModel((vision_model): CLIPVisionTransformer((embeddings): CLIPVisionEmbeddings((patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)(position_embedding): Embedding(257, 1024))(pre_layrnorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)(encoder): CLIPEncoder((layers): ModuleList((0-23): 24 x CLIPEncoderLayer((self_attn): CLIPAttention((k_proj): Linear(in_features=1024, out_features=1024, bias=True)(v_proj): Linear(in_features=1024, out_features=1024, bias=True)(q_proj): Linear(in_features=1024, out_features=1024, bias=True)(out_proj): Linear(in_features=1024, out_features=1024, bias=True))(layer_norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)(mlp): CLIPMLP((activation_fn): QuickGELUActivation()(fc1): Linear(in_features=1024, out_features=4096, bias=True)(fc2): Linear(in_features=4096, out_features=1024, bias=True))(layer_norm2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True))))(post_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)))(visual_projection): Linear(in_features=1024, out_features=768, bias=False)
)
“scheduler block”
PNDMScheduler {"_class_name": "PNDMScheduler","_diffusers_version": "0.22.3","beta_end": 0.012,"beta_schedule": "scaled_linear","beta_start": 0.00085,"clip_sample": false,"num_train_timesteps": 1000,"prediction_type": "epsilon","set_alpha_to_one": false,"skip_prk_steps": true,"steps_offset": 1,"timestep_spacing": "leading","trained_betas": null
}