"""Convert a dense fairseq Transformer checkpoint into an MoE checkpoint.

Each dense FFN (``fc1``/``fc2``) becomes a single-expert tensor
(``MoeMLP.experts.htoh4`` / ``MoeMLP.experts.h4toh``, with a leading
expert dimension of size 1) and a fresh gate is initialised so routing
starts uniform.  The converted checkpoint is then saved once per expert
rank so that expert-parallel loading (one file per rank) finds its copy.
"""
import torch
from collections import OrderedDict

MODEL_PATH = "/userhome/fairseq/fairseq/model_dir/418M_last_checkpoint.pt"
SAVE_PATH_TEMPLATE = (
    "/userhome/fairseq/fairseq/checkpoint/Moe-fairseq-DDP/"
    "checkpoint_last_expert_{}.pt"
)
NUM_EXPERTS = 8
MOE_ARCH = "transformer_moe_wmt_en_de_big"

# Projection sub-modules of a fairseq MultiheadAttention, in checkpoint order.
_ATTN_PROJS = ("k_proj", "v_proj", "q_proj", "out_proj")


def _wb(prefix: str) -> list:
    """Return the ``.weight``/``.bias`` name pair for *prefix*."""
    return [f"{prefix}.weight", f"{prefix}.bias"]


def _layer_param_names(side: str, idx: int, with_encoder_attn: bool) -> list:
    """Parameter names of one MoE transformer layer, in checkpoint order.

    ``side`` is ``"encoder"`` or ``"decoder"``; decoder layers additionally
    carry a cross-attention block (``with_encoder_attn=True``).
    """
    base = f"{side}.layers.{idx}"
    names = []
    for proj in _ATTN_PROJS:
        names += _wb(f"{base}.self_attn.{proj}")
    names += _wb(f"{base}.self_attn_layer_norm")
    if with_encoder_attn:
        for proj in _ATTN_PROJS:
            names += _wb(f"{base}.encoder_attn.{proj}")
        names += _wb(f"{base}.encoder_attn_layer_norm")
    names += _wb(f"{base}.MoeMLP.gate.gate")
    names += _wb(f"{base}.MoeMLP.experts.htoh4")
    names += _wb(f"{base}.MoeMLP.experts.h4toh")
    names += _wb(f"{base}.final_layer_norm")
    return names


def build_param_names(encoder_layers: int, decoder_layers: int) -> list:
    """Full ordered parameter-name list of the target MoE architecture.

    Replaces the previous hard-coded list (which was fixed at 12/12 layers);
    for 12 encoder and 12 decoder layers the output is identical to it.
    """
    names = [
        "encoder.version",
        "encoder.embed_tokens.weight",
        "encoder.embed_positions._float_tensor",
    ]
    for i in range(encoder_layers):
        names += _layer_param_names("encoder", i, with_encoder_attn=False)
    names += _wb("encoder.layer_norm")
    names += [
        "decoder.version",
        "decoder.embed_tokens.weight",
        "decoder.embed_positions._float_tensor",
    ]
    for i in range(decoder_layers):
        names += _layer_param_names("decoder", i, with_encoder_attn=True)
    names += _wb("decoder.layer_norm")
    names.append("decoder.output_projection.weight")
    return names


def convert_state_dict(model_ori, para_names, num_experts: int, hidden: int):
    """Map a dense state dict onto the MoE parameter names.

    Parameters already present in ``model_ori`` are copied through.  Gate
    parameters (absent from the dense model) are freshly initialised; the
    dense ``fc1``/``fc2`` weights are re-used as the (single) expert with an
    added leading expert dimension.  Unrecognised names are reported but
    skipped, matching the original script's best-effort behaviour.
    """
    model = OrderedDict()
    for name in para_names:
        if name in model_ori:
            model[name] = model_ori[name]
        elif "MoeMLP.gate.gate.weight" in name:
            # All-ones weight => identical logits per expert, i.e. uniform
            # softmax routing at the start of MoE fine-tuning.
            model[name] = torch.ones([num_experts, hidden])
        elif "MoeMLP.gate.gate.bias" in name:
            model[name] = torch.zeros([num_experts])
        elif "MoeMLP.experts.htoh4" in name:
            dense = model_ori[name.replace("MoeMLP.experts.htoh4", "fc1")]
            model[name] = dense.unsqueeze(dim=0)
        elif "MoeMLP.experts.h4toh" in name:
            dense = model_ori[name.replace("MoeMLP.experts.h4toh", "fc2")]
            model[name] = dense.unsqueeze(dim=0)
        elif name == "decoder.output_projection.weight":
            # Tied to the encoder embedding table — presumably a shared
            # source/target vocabulary; TODO confirm against the model args.
            model[name] = model_ori["encoder.embed_tokens.weight"]
        else:
            print(f"No that name:{name}")
    return model


def main() -> None:
    """Load the dense checkpoint, convert it, and save one file per expert."""
    # map_location="cpu" lets the conversion run on a CPU-only host even if
    # the checkpoint was saved from GPU tensors.
    chk = torch.load(MODEL_PATH, map_location="cpu")
    print(chk.keys())
    # Expected keys: 'args', 'model', 'optimizer_history', 'extra_state',
    # 'last_optimizer_state'.
    args = chk["args"]
    model_ori = chk["model"]
    hidden = model_ori["encoder.embed_tokens.weight"].shape[1]

    para_names = build_param_names(args.encoder_layers, args.decoder_layers)
    chk["model"] = convert_state_dict(model_ori, para_names, NUM_EXPERTS, hidden)
    # The optimizer state no longer matches the new parameter shapes.
    chk["last_optimizer_state"] = None
    args.arch = MOE_ARCH

    # Every expert rank loads its own checkpoint file; they all start from
    # the same converted weights.
    for i in range(NUM_EXPERTS):
        save_path = SAVE_PATH_TEMPLATE.format(i)
        torch.save(chk, save_path)
        print(f"saved on {save_path}")
    print("saved all!")


if __name__ == "__main__":
    main()