|
- #!/usr/bin/env python
- # coding: utf-8
- # # 尝试复现BEiT论文
- # !pip install yacs
-
- # # 开始论文复现
- # # 首先复现模型部分
- #
# 先查找替换,把常见的api替换掉!要注意,MindSpore大部分是元组,而飞桨里面很多习惯写列表
- # ```python
- # 飞桨 MindSpore
- # nn.Layer nn.Cell
- # nn.functional mindspore.ops
- # nn.Linear nn.Dense
- # F.linear 写个MindSpore版本的,叫mslinear ,原来mindspore有Dense啊!
- # forward construct
- # nn.Conv2D nn.Conv2d
- # dropout 0.0。dropout 1.0 有个1-x的关系
- # F.softmax(x) ops.Softmax(axis=-1)(x)
- # transpose 这个不用转换
- # @符号就是matmul ops.matmul
- # paddle.shape mindspore.ops.shape
- #
- # paddle.create_parameter(
- # shape=[self.num_relative_distance, num_heads],
- # dtype="float32",
- # default_initializer=zeros_,
- # 改成 mindspore.Parameter(ops.Zeros()((1, 1, embed_dim),mindspore.float32))`
# paddle.linspace mindspore.ops.LinSpace() mindspore的LinSpace不能运行在cpu下
- # paddle.zeros([2,3]) ops.Zeros()((2, 3), mindspore.float32)
- # paddle.zeros_like ops.zeros_like
- # coords = paddle.stack(paddle.meshgrid([coords_h, coords_w])) # 2, Wh, Ww
- # paddle.meshgrid ops.Meshgrid(indexing="ij") 并将列表换成元组
- # paddle.stack ops.Stack()
- # paddle.flatten ops.Flatten()
- # coords_flatten.unsqueeze(axis=2 ) ops.ExpandDims()(coords_flatten, 2) 最终效果
- # relative_coords = ops.ExpandDims()(coords_flatten, 2) - ops.ExpandDims()(coords_flatten, 1)
# concat 用ops.Concat
- # expand ops.BroadcastTo
- # ```
- # In[54]:
-
-
- import numpy as np
- from mindspore import ops
- import mindspore.nn as nn
- from mindspore import Tensor
- from mindspore.ops import Identity
- # import mindspore.nn.functional as F
- import mindspore.ops as F
- import mindspore
import mindspore.context as context
# Run eagerly (PyNative mode) on Ascend.  For local debugging switch
# device_target to "CPU" — but note ops.LinSpace used below lacks CPU support.
# context.set_context(mode=context.PYNATIVE_MODE, device_target="CPU")
context.set_context(mode=context.PYNATIVE_MODE, device_target="Ascend")
# print(f"MindSpore版本:{mindspore.__version__}")
-
- import numpy as np
- # 加入计算时间代码
- import time
class Timer: #@save
    """Accumulate wall-clock durations over repeated start/stop cycles."""

    def __init__(self):
        # Timing starts immediately on construction.
        self.times = []
        self.start()

    def start(self):
        """Begin (or restart) the current timing interval."""
        self.tik = time.time()

    def stop(self):
        """Close the current interval, record its duration and return it."""
        elapsed = time.time() - self.tik
        self.times.append(elapsed)
        return elapsed

    def avg(self):
        """Mean of all recorded durations."""
        return sum(self.times) / len(self.times)

    def sum(self):
        """Total of all recorded durations."""
        return sum(self.times)

    def cumsum(self):
        """Running total of recorded durations, as a plain list."""
        return np.array(self.times).cumsum().tolist()
-
-
class Benchmark:
    """Context manager that prints how long its body took to run."""

    def __init__(self, description='Done'):
        # Label printed alongside the elapsed time.
        self.description = description

    def __enter__(self):
        # A fresh Timer starts counting immediately.
        self.timer = Timer()
        return self

    def __exit__(self, *args):
        print(f'{self.description}: {self.timer.stop():.4f} sec')
- # In[55]:
-
-
def mslinear(x, weight, bias=None):
    """MindSpore counterpart of paddle's F.linear: x @ weight.T (+ bias)."""
    result = mindspore.ops.matmul(x, weight.T)
    if bias is None:
        return result
    return result + bias
-
-
-
- # Copyright (c) 2021 PPViT Authors. All Rights Reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- """
- Droppath, reimplement from https://github.com/yueatsprograms/Stochastic_Depth
- """
- import mindspore
- import mindspore.nn as nn
-
-
class DropPath(nn.Cell):
    """DropPath (stochastic depth): randomly zero whole samples of a batch.

    Reimplemented from https://github.com/yueatsprograms/Stochastic_Depth.
    Each sample along axis 0 is dropped independently with probability
    ``drop_prob``; surviving samples are rescaled by 1/keep_prob so the
    expected output is unchanged.  Identity in eval mode.
    """
    def __init__(self, drop_prob=0.0):
        super().__init__()
        # BUGFIX: the default used to be None, which made `1 - self.drop_prob`
        # raise a TypeError the first time drop_path ran in training mode.
        # None is still accepted and treated as "no drop".
        self.drop_prob = 0.0 if drop_prob is None else drop_prob

    def drop_path(self, inputs):
        """Apply stochastic depth to `inputs`.

        Args:
            inputs: tensor of arbitrary shape; axis 0 is the batch axis.
        Returns:
            tensor of the same shape, with a random subset of samples zeroed
            and the rest scaled by 1/keep_prob.
        """
        # If prob is 0 or in eval mode, return the original input.
        if self.drop_prob == 0. or not self.training:
            return inputs
        keep_prob = 1 - self.drop_prob
        # BUGFIX: mindspore.Tensor expects a mindspore dtype object, not the
        # string 'float32' — TODO confirm against the installed version.
        keep_prob = mindspore.Tensor(keep_prob, dtype=mindspore.float32)
        # One Bernoulli draw per sample: shape (N, 1, 1, ..., 1).
        shape = (inputs.shape[0], ) + (1, ) * (inputs.ndim - 1)
        random_tensor = keep_prob + mindspore.numpy.rand(shape, dtype=inputs.dtype)
        random_tensor = random_tensor.floor()  # 0/1 mask
        # Divide to keep the same output expectation.
        output = inputs.divide(keep_prob) * random_tensor
        return output

    def construct(self, inputs):
        return self.drop_path(inputs)
-
-
-
- # def main():
- # import numpy as np
- # tmp = mindspore.Tensor(np.random.rand(8, 16, 8, 8), dtype=mindspore.float32)
- # dp = DropPath(0.5)
- # out = dp(tmp)
- # print(out.shape)
-
- # if __name__ == "__main__":
- # main()
-
-
- # ## 主代码部分
-
- # In[58]:
-
-
- # Copyright (c) 2021 PPViT Authors. All Rights Reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
"""
BEiT in MindSpore

A MindSpore implementation of BEiT (ported from the Paddle/PPViT code) as described in:

"BEiT: BERT Pre-Training of Image Transformers"
- Paper Link: https://arxiv.org/abs/2106.08254
"""
- import math
- import copy
- from functools import partial
- import mindspore
- import mindspore.nn as nn
- # import mindspore.nn.functional as F
- import mindspore.ops as F
- # from droppath import DropPath
-
- # 初始化先去掉,以后再做
- # trunc_normal_ = nn.initializer.TruncatedNormal(std=0.02)
- # zeros_ = nn.initializer.Constant(value=0.0)
- # ones_ = nn.initializer.Constant(value=1.0)
-
-
class Mlp(nn.Cell):
    """MLP module.

    Ops: fc1 -> act -> dropout -> fc2 -> dropout.

    Note: `drop` is a drop probability; mindspore's nn.Dropout takes the
    keep probability, hence the `1 - drop` below.
    """

    def __init__(self,
                 in_features,
                 hidden_features=None,
                 out_features=None,
                 act_layer=nn.GELU,
                 drop=0.0):
        super().__init__()
        hidden_features = hidden_features or in_features
        out_features = out_features or in_features
        self.fc1 = nn.Dense(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Dense(hidden_features, out_features)
        self.drop = nn.Dropout(1 - drop)

    def construct(self, x):
        hidden = self.drop(self.act(self.fc1(x)))
        return self.drop(self.fc2(hidden))
-
def main():
    """Smoke-test Mlp on a random (8, 16) batch."""
    import numpy as np
    sample = mindspore.Tensor(np.random.rand(8, 16), dtype=mindspore.float32)
    mlp = Mlp(16, 32, 512)
    print(mlp(sample).shape)

if __name__ == "__main__":
    main()
-
-
- # In[59]:
-
-
class PatchEmbed(nn.Cell):
    """2D image to patch embedding.

    Splits an image into non-overlapping patches and linearly embeds each
    patch, implemented as a Conv2d with kernel == stride == patch size.

    Args:
        img_size: input image side length (square images assumed).
        patch_size: patch side length.
        in_chans: number of input channels.
        embed_dim: embedding dimension per patch.
        norm_layer: optional normalization applied to the embeddings.
        flatten: if True, output is (B, num_patches, embed_dim); otherwise
            the raw (B, embed_dim, H', W') conv feature map.
    """
    def __init__(self,
                 img_size=224,
                 patch_size=16,
                 in_chans=3,
                 embed_dim=768,
                 norm_layer=None,
                 flatten=True):
        super().__init__()
        # CLEANUP: removed the unused tmpimgsize/tmppatchsize temporaries —
        # the scalar patch_size is used directly before being tupled.
        self.img_size = (img_size, img_size)
        self.patch_size = (patch_size, patch_size)
        self.grid_size = (self.img_size[0] // self.patch_size[0],
                          self.img_size[1] // self.patch_size[1])
        self.num_patches = self.grid_size[0] * self.grid_size[1]
        self.flatten = flatten

        # Conv with kernel == stride == patch size performs the patch split
        # and the linear projection in a single op.
        self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
        self.norm = norm_layer((embed_dim,)) if norm_layer else ops.Identity()

    def construct(self, x):
        """x: (B, C, H, W) image batch -> patch embeddings."""
        x = self.proj(x)
        if self.flatten:
            B, C, H, W = x.shape
            x = x.reshape([B, C, H * W])
            # (B, embed_dim, num_patches) -> (B, num_patches, embed_dim)
            transpose = ops.Transpose()
            x = transpose(x, (0, 2, 1))
        x = self.norm(x)
        return x
-
-
-
-
def main():
    """Smoke-test PatchEmbed on a random (16, 3, 224, 224) batch."""
    import numpy as np
    images = mindspore.Tensor(np.random.rand(16, 3, 224, 224), dtype=mindspore.float32)

    patchembed = PatchEmbed(flatten=True)
    print(patchembed(images).shape)

if __name__ == "__main__":
    main()
-
-
- # ## 开始注意力模块
-
- # In[60]:
-
-
class Attention(nn.Cell):
    """Multi-head self-attention with optional relative position bias.

    CLEANUP: removed ~90 lines of commented-out Paddle code, including an
    entire duplicate commented `construct`; live logic is unchanged.

    Args:
        dim: token embedding dimension.
        num_heads: number of attention heads.
        qkv_bias: if True, add learnable bias to q and v (k bias fixed at 0).
        attn_drop: drop probability for the attention weights.
        proj_drop: drop probability after the output projection.
        window_size: (Wh, Ww) patch grid; when given, a learnable relative
            position bias table is added to the attention logits.
        attn_head_dim: override per-head dimension (default dim // num_heads).
    """
    def __init__(self,
                 dim,
                 num_heads=8,
                 qkv_bias=False,
                 attn_drop=0.1,
                 proj_drop=0.1,
                 window_size=None,
                 attn_head_dim=None):
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        if attn_head_dim is not None:
            head_dim = attn_head_dim
        all_head_dim = head_dim * self.num_heads
        self.scale = head_dim ** -0.5

        # qkv has no built-in bias: a (q, 0, v) bias is assembled by hand in
        # construct so that k stays bias-free, as in the original BEiT code.
        self.qkv = nn.Dense(dim, all_head_dim * 3, has_bias=False)
        if qkv_bias:
            self.q_bias = mindspore.Parameter(ops.Zeros()((all_head_dim), mindspore.float32))
            self.v_bias = mindspore.Parameter(ops.Zeros()((all_head_dim), mindspore.float32))
        else:
            self.q_bias = None
            self.v_bias = None

        if window_size:
            self.window_size = window_size
            # (2*Wh-1)*(2*Ww-1) in-window offsets + 3 extra entries for
            # cls->token, token->cls and cls->cls.
            self.num_relative_distance = (2 * window_size[0] - 1) * (
                2 * window_size[1] - 1
            ) + 3

            # Learnable bias table: (num_relative_distance, num_heads).
            self.relative_position_bias_table = mindspore.Parameter(ops.Zeros()((self.num_relative_distance, num_heads), mindspore.float32))
            # Pair-wise relative position index for each token in the window.
            coords_h = Tensor(np.arange(window_size[0]), mindspore.int64)
            coords_w = Tensor(np.arange(window_size[1]), mindspore.int64)
            coords = ops.Stack()(ops.Meshgrid(indexing="ij")((coords_h, coords_w)))  # 2, Wh, Ww
            coords_flatten = ops.Flatten()(coords)  # 2, Wh*Ww
            relative_coords = ops.ExpandDims()(coords_flatten, 2) - ops.ExpandDims()(coords_flatten, 1)
            relative_coords = relative_coords.transpose([1, 2, 0])  # Wh*Ww, Wh*Ww, 2
            # Cast to float so the in-place slice updates below are supported.
            relative_coords = relative_coords.astype(mindspore.float32)
            relative_coords[:, :, 0] += window_size[0] - 1  # shift to start from 0
            relative_coords[:, :, 1] += window_size[1] - 1
            relative_coords[:, :, 0] *= 2 * window_size[1] - 1
            relative_position_index = ops.Zeros()(
                (
                    window_size[0] * window_size[1] + 1,
                    window_size[0] * window_size[1] + 1,
                ),
                relative_coords.dtype,
            )
            # Wh*Ww, Wh*Ww
            relative_position_index[1:, 1:] = relative_coords.sum(-1)
            relative_position_index[0, 0:] = self.num_relative_distance - 3
            relative_position_index[0:, 0] = self.num_relative_distance - 2
            relative_position_index[0, 0] = self.num_relative_distance - 1

            # Stored as a plain attribute (no register_buffer in mindspore).
            self.relative_position_index = relative_position_index.astype(mindspore.int64)
        else:
            self.window_size = None
            self.relative_position_bias_table = None
            self.relative_position_index = None

        # nn.Dropout takes the KEEP probability, hence 1 - drop prob.
        self.attn_drop = nn.Dropout(1-attn_drop)
        self.proj = nn.Dense(all_head_dim, dim)
        self.proj_drop = nn.Dropout(1-proj_drop)

    def construct(self, x, rel_pos_bias):
        """x: (B, N, C) tokens; rel_pos_bias: shared bias tensor or None."""
        B, N, C = x.shape
        qkv_bias = None
        if self.q_bias is not None:
            # Bias layout (q, 0, v): k gets no bias.
            qkv_bias = ops.Concat()((self.q_bias, ops.zeros_like(self.v_bias), self.v_bias))
        qkv = mslinear(x=x, weight=self.qkv.weight, bias=qkv_bias)

        # (B, N, 3*D) -> (3, B, heads, N, head_dim)
        qkv = qkv.reshape([B, N, 3, self.num_heads, -1]).transpose([2, 0, 3, 1, 4])
        # make torchscript happy (cannot use tensor as tuple)
        q, k, v = qkv[0], qkv[1], qkv[2]

        q = q * self.scale
        attn = ops.matmul(q, k.transpose([0, 1, 3, 2]))

        if self.relative_position_bias_table is not None:
            relative_position_bias = self.relative_position_bias_table[
                self.relative_position_index.reshape([-1])
            ].reshape(
                [
                    self.window_size[0] * self.window_size[1] + 1,
                    self.window_size[0] * self.window_size[1] + 1,
                    -1,
                ]
            )  # Wh*Ww,Wh*Ww,nH
            relative_position_bias = relative_position_bias.transpose(
                [2, 0, 1]
            )  # nH, Wh*Ww, Wh*Ww

            attn = attn + ops.ExpandDims()(relative_position_bias, 0)

        if rel_pos_bias is not None:
            # Model-level shared bias (see RelativePositionBias).
            attn = attn + rel_pos_bias

        attn = nn.Softmax(axis=-1)(attn)
        attn = self.attn_drop(attn)

        x = ops.matmul(attn, v).transpose([0, 2, 1, 3]).reshape([B, N, -1])
        x = self.proj(x)
        x = self.proj_drop(x)
        return x
-
-
def main():
    """Smoke-test Attention on random (196, 16, 768) tokens."""
    import numpy as np
    tokens = mindspore.Tensor(np.random.rand(196, 16, 768), dtype=mindspore.float32)
    attention = Attention(dim=768 )
    print(attention(tokens, rel_pos_bias=0.1).shape)

if __name__ == "__main__":
    main()
-
-
-
- # In[61]:
-
-
class Block(nn.Cell):
    """Transformer encoder block: pre-norm attention + MLP with DropPath.

    WARNING (review note): `drop`/`attn_drop` are forwarded as `1 - x` below,
    yet Attention and Mlp apply `1 - x` AGAIN when building nn.Dropout.  The
    Beit model compensates by passing `1 - rate` in (three inversions = one),
    so the full chain is correct — but constructing Block directly with drop
    probabilities (as main() below does) yields inverted dropout rates.
    Confirm and untangle before reusing Block standalone.
    """
    def __init__(self,
                 dim,
                 num_heads,
                 mlp_ratio=4.0,
                 qkv_bias=False,
                 drop=0.1,
                 attn_drop=0.1,
                 drop_path=0.1,
                 init_values=None,
                 act_layer=nn.GELU,
                 norm_layer=nn.LayerNorm,
                 window_size=None,
                 attn_head_dim=None):
        super().__init__()
        self.norm1 = norm_layer((dim,))
        self.attn = Attention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            attn_drop=1-attn_drop,  # NOTE: double inversion — see class docstring
            proj_drop=1-drop,
            window_size=window_size,
            attn_head_dim=attn_head_dim,
        )
        # Identity when drop_path is 0 (or a 0-valued tensor from LinSpace).
        self.drop_path = DropPath(drop_path) if drop_path > 0.0 else Identity()
        self.norm2 = norm_layer((dim,))
        mlp_hidden_dim = int(dim * mlp_ratio)
        self.mlp = Mlp(
            in_features=dim,
            hidden_features=mlp_hidden_dim,
            act_layer=act_layer,
            drop=1-drop,  # NOTE: double inversion — see class docstring
        )

        if init_values:
            # Layer-scale parameters.  Zero-initialized here: weight init was
            # deliberately dropped in this port (see the note by the imports);
            # presumably they should be filled with init_values — TODO confirm.
            self.gamma_1 = mindspore.Parameter(ops.Zeros()((dim), mindspore.float32))

            self.gamma_2 = mindspore.Parameter(ops.Zeros()((dim), mindspore.float32))
        else:
            self.gamma_1, self.gamma_2 = None, None

    def construct(self, x, rel_pos_bias):
        # Pre-norm residual branches, optionally scaled by gamma (layer scale).
        if self.gamma_1 is None:
            x = x + self.drop_path(self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias))
            x = x + self.drop_path(self.mlp(self.norm2(x)))
        else:
            x = x + self.drop_path(
                self.gamma_1 * self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias)
            )
            x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))
        return x
-
def main():
    """Smoke-test Block on random (196, 16, 768) tokens."""
    import numpy as np
    tokens = mindspore.Tensor(np.random.rand(196, 16, 768), dtype=mindspore.float32)
    block = Block(dim=768, num_heads=12)
    print(block(tokens, rel_pos_bias=0.1).shape)

if __name__ == "__main__":
    main()
-
-
- # In[62]:
-
-
class RelativePositionBias(nn.Cell):
    """Shared relative position bias table (BEiT).

    Produces a (num_heads, N+1, N+1) bias added to every block's attention
    logits, where N = Wh*Ww patches plus one cls token.
    """
    def __init__(self, window_size, num_heads):
        super().__init__()
        self.window_size = window_size
        # (2*Wh-1)*(2*Ww-1) in-window offsets + 3 entries for cls->token,
        # token->cls and cls->cls.
        self.num_relative_distance = (2 * window_size[0] - 1) * (
            2 * window_size[1] - 1
        ) + 3

        # Learnable bias table: (num_relative_distance, num_heads).
        self.relative_position_bias_table = mindspore.Parameter(ops.Zeros()((self.num_relative_distance, num_heads), mindspore.float32))

        # Pair-wise relative position index for each token inside the window.
        coords_h = Tensor(np.arange(window_size[0]), mindspore.int64)
        # BUGFIX: this was assigned to `coords_2`, leaving `coords_w` on the
        # next line undefined — NameError the moment the module was built.
        coords_w = Tensor(np.arange(window_size[1]), mindspore.int64)
        coords = ops.Stack()(ops.Meshgrid(indexing="ij")((coords_h, coords_w)))  # 2, Wh, Ww
        coords_flatten = ops.Flatten()(coords)  # 2, Wh*Ww
        relative_coords = ops.ExpandDims()(coords_flatten, 2) - ops.ExpandDims()(coords_flatten, 1)
        relative_coords = relative_coords.transpose([1, 2, 0])  # Wh*Ww, Wh*Ww, 2
        relative_coords[:, :, 0] += window_size[0] - 1  # shift to start from 0
        relative_coords[:, :, 1] += window_size[1] - 1
        relative_coords[:, :, 0] *= 2 * window_size[1] - 1
        relative_position_index = ops.Zeros()(
            (window_size[0] * window_size[1] + 1, window_size[0] * window_size[1] + 1), mindspore.int64)
        relative_position_index[1:, 1:] = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
        relative_position_index[0, 0:] = self.num_relative_distance - 3
        relative_position_index[0:, 0] = self.num_relative_distance - 2
        relative_position_index[0, 0] = self.num_relative_distance - 1

        # Plain attribute (no register_buffer equivalent in mindspore here).
        self.relative_position_index = relative_position_index.astype(mindspore.int64)

    def construct(self):
        relative_position_bias = self.relative_position_bias_table[
            self.relative_position_index.reshape([-1])].reshape(
            self.window_size[0] * self.window_size[1] + 1,
            self.window_size[0] * self.window_size[1] + 1, -1)  # Wh*Ww,Wh*Ww,nH
        return relative_position_bias.transpose([2, 0, 1])  # nH, Wh*Ww, Wh*Ww
-
-
-
- # In[63]:
-
-
class Beit(nn.Cell):
    """BEiT vision transformer (MindSpore port).

    Pipeline: patch embedding -> prepend cls token (+ optional absolute
    position embedding) -> `depth` transformer Blocks with optional (shared)
    relative position bias -> mean-pool or cls-token readout -> linear head.

    NOTE(review): drop_rate/attn_drop_rate are forwarded to Block as `1 - x`;
    Block applies `1 - x` again before Attention/Mlp, which apply it a third
    time when building nn.Dropout — three inversions net out to the intended
    drop probability, but the convention is fragile.  Confirm before changing
    any of the three layers independently.
    """
    def __init__(self,
                 img_size=224,
                 patch_size=16,
                 in_chans=3,
                 num_classes=1000,
                 embed_dim=768,
                 depth=12,
                 num_heads=12,
                 mlp_ratio=4.0,
                 qkv_bias=True,
                 drop_rate=0.0,
                 attn_drop_rate=0.0,
                 drop_path_rate=0.0,
                 norm_layer=partial(nn.LayerNorm, epsilon=1e-6),
                 init_values=None,
                 use_abs_pos_emb=True,
                 use_rel_pos_bias=False,
                 use_shared_rel_pos_bias=False,
                 use_mean_pooling=True,
                 init_scale=0.001):
        super().__init__()
        self.num_classes = num_classes
        # num_features for consistency with other models
        self.num_features = self.embed_dim = embed_dim

        self.patch_embed = PatchEmbed(
            img_size=img_size,
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dim,
        )
        num_patches = self.patch_embed.num_patches

        # Learnable cls token; zero-init (truncated-normal init from the
        # Paddle original was deliberately dropped in this port).
        self.cls_token = mindspore.Parameter(ops.Zeros()((1, 1, embed_dim), mindspore.float32))
        if use_abs_pos_emb:
            # Absolute position embedding for cls token + patches.
            self.pos_embed = mindspore.Parameter(ops.Zeros()((1, num_patches + 1, embed_dim), mindspore.float32))
        else:
            self.pos_embed = None
        self.pos_drop = nn.Dropout(1-drop_rate)  # nn.Dropout takes keep-prob

        if use_shared_rel_pos_bias:
            # One relative-position bias table shared by every block.
            self.rel_pos_bias = RelativePositionBias(
                window_size=self.patch_embed.grid_size, num_heads=num_heads
            )
        else:
            self.rel_pos_bias = None

        # Stochastic depth decay rule: per-block drop-path rates 0..rate.
        # NOTE(review): ops.LinSpace does not run on CPU (see header notes);
        # also confirm a float `drop_path_rate` is accepted as the stop arg.
        dpr = [x for x in mindspore.ops.LinSpace()(Tensor(0, mindspore.float32), drop_path_rate, depth)]
        self.use_rel_pos_bias = use_rel_pos_bias
        self.blocks = nn.CellList(
            [
                Block(
                    dim=embed_dim,
                    num_heads=num_heads,
                    mlp_ratio=mlp_ratio,
                    qkv_bias=qkv_bias,
                    drop=1-drop_rate,  # see NOTE in class docstring
                    attn_drop=1-attn_drop_rate,
                    drop_path=dpr[i],
                    norm_layer=norm_layer,
                    init_values=init_values,
                    window_size=self.patch_embed.grid_size if use_rel_pos_bias else None,
                )
                for i in range(depth)
            ]
        )
        self.norm = Identity() if use_mean_pooling else norm_layer((embed_dim,))
        self.fc_norm = norm_layer((embed_dim,)) if use_mean_pooling else None
        self.head = nn.Dense(embed_dim, num_classes) if num_classes > 0 else Identity()

        # Weight initialization from the Paddle original, disabled for now.
        # (The helpers below still reference undefined names — see NOTEs.)
        # self.apply(self._init_weights)
        # self.fix_init_weight()
        # if isinstance(self.head, nn.Dense):
        #     trunc_normal_(self.head.weight)
        #     self.head.weight.set_value(
        #         self.head.weight.multiply(paddle.to_tensor(init_scale))
        #     )
        #     self.head.bias.set_value(
        #         self.head.bias.multiply(paddle.to_tensor(init_scale))
        #     )

    def fix_init_weight(self):
        # NOTE(review): dead code in this port — still calls paddle.to_tensor,
        # which is undefined here; would raise NameError if ever invoked.
        def rescale(param, layer_id):
            param.set_value(param.divide(paddle.to_tensor(math.sqrt(2.0 * layer_id))))

        for layer_id, layer in enumerate(self.blocks):
            rescale(layer.attn.proj.weight, layer_id + 1)
            rescale(layer.mlp.fc2.weight, layer_id + 1)

    def _init_weights(self, m):
        # NOTE(review): dead code in this port — trunc_normal_/zeros_/ones_
        # are commented out near the imports and nn.CellNorm does not exist in
        # mindspore; would raise NameError/AttributeError if ever invoked.
        if isinstance(m, nn.Dense):
            trunc_normal_(m.weight)
        if isinstance(m, nn.Dense) and m.bias is not None:
            zeros_(m.bias)
        elif isinstance(m, nn.CellNorm):
            zeros_(m.bias)
            ones_(m.weight)

    def get_num_layers(self):
        """Number of transformer blocks."""
        return len(self.blocks)

    def get_classifier(self):
        """Return the classification head."""
        return self.head

    def reset_classifier(self, num_classes):
        """Replace the head for a new number of classes (0 -> identity)."""
        self.num_classes = num_classes
        self.head = (
            nn.Dense(self.embed_dim, num_classes) if num_classes > 0 else Identity()
        )

    def construct_features(self, x):
        """Image batch (B, C, H, W) -> pooled feature (B, embed_dim)."""
        x = self.patch_embed(x)
        batch_size, seq_len, _ = x.shape

        # Broadcast the single cls token across the batch and prepend it.
        cls_tokens = ops.BroadcastTo((mindspore.ops.shape(x)[0], 1, self.embed_dim))(self.cls_token)

        x = ops.Concat(axis=1)((cls_tokens, x))

        if self.pos_embed is not None:
            x = x + self.pos_embed
        x = self.pos_drop(x)

        # Shared relative position bias (or None) passed to every block.
        rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None
        for blk in self.blocks:
            x = blk(x, rel_pos_bias=rel_pos_bias)

        x = self.norm(x)
        if self.fc_norm is not None:
            # Mean-pool the patch tokens (cls token excluded).
            t = x[:, 1:, :]
            return self.fc_norm(t.mean(1))

        # Otherwise the cls token is the representation.
        return x[:, 0]

    def construct(self, x):
        x = self.construct_features(x)
        x = self.head(x)
        return x
-
-
def build_beit(config):
    """Instantiate a Beit model from a yacs config node."""
    return Beit(
        img_size=config.DATA.IMAGE_SIZE,
        num_classes=config.MODEL.NUM_CLASSES,
        patch_size=config.MODEL.PATCH_SIZE,
        embed_dim=config.MODEL.EMBED_DIM,
        depth=config.MODEL.DEPTH,
        num_heads=config.MODEL.NUM_HEADS,
        mlp_ratio=config.MODEL.MLP_RATIO,
        use_abs_pos_emb=config.MODEL.USE_ABS_POS_EMB,
        use_rel_pos_bias=config.MODEL.USE_REL_POS_BIAS,
        init_values=config.MODEL.INIT_VALUES,
        qkv_bias=config.MODEL.QKV_BIAS,
    )
-
def main():
    """Smoke-test Beit() once, then benchmark batch sizes 1..256, 3 runs each."""
    import mindspore
    import numpy
    sample = mindspore.Tensor(np.random.rand(8, 3, 224, 224), dtype=mindspore.float32)
    net = Beit()
    print(net(sample).shape)

    for i in range(9):
        images = Tensor(numpy.random.rand(2**i, 3, 224, 224), mindspore.float32)
        for j in range(3):
            with Benchmark("MindSpore速度"):
                output = net(images)
                print(output.shape)

if __name__ == "__main__":
    main()
- # # 测试模型
-
- # In[64]:
-
-
- # !pip install yacs
-
-
- # In[65]:
-
-
- # Copyright (c) 2021 PPViT Authors. All Rights Reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- import numpy as np
- # 加入计算时间代码
- import time
class Timer: #@save
    """Record the durations of multiple runs, in seconds."""
    def __init__(self):
        self.times = []
        self.start()  # begin timing on construction

    def start(self):
        """Mark the beginning of a timed interval."""
        self.tik = time.time()

    def stop(self):
        """Close the current interval; store and return its length."""
        self.times.append(time.time() - self.tik)
        return self.times[-1]

    def avg(self):
        """Average duration over all recorded runs."""
        return sum(self.times) / len(self.times)

    def sum(self):
        """Sum of all recorded durations."""
        return sum(self.times)

    def cumsum(self):
        """Cumulative durations as a Python list."""
        return np.array(self.times).cumsum().tolist()
-
-
class Benchmark:
    """Context manager printing the elapsed time of its body."""
    def __init__(self, description='Done'):
        # Label printed with the elapsed time on exit.
        self.description = description

    def __enter__(self):
        self.timer = Timer()
        return self

    def __exit__(self, *args):
        print(f'{self.description}: {self.timer.stop():.4f} sec')
-
- # 测试模型
- # Copyright (c) 2021 PPViT Authors. All Rights Reserved.
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
-
- """Configuration
- Configurations for (1) data processing, (2) model archtecture, and (3) training settings, etc.
- Config can be set by .yaml file or by argparser
- """
- import os
- from yacs.config import CfgNode as CN
- import yaml
-
# Default configuration; values may be overridden from a .yaml file or by
# argparse options (see _update_config_from_file / update_config below).
_C = CN()
_C.BASE = ['']

# data settings
_C.DATA = CN()
_C.DATA.BATCH_SIZE = 2 # train batch_size on single GPU = 256
_C.DATA.BATCH_SIZE_EVAL = None # (disabled in update_config) val batch_size on single GPU
_C.DATA.DATA_PATH = '/dataset/imagenet/' # path to dataset
_C.DATA.DATASET = 'imagenet2012' # dataset name, currently only support imagenet2012
_C.DATA.IMAGE_SIZE = 224 # input image size e.g., 224
_C.DATA.SECOND_IMAGE_SIZE = 112 # 2nd input image size e.g., 112
_C.DATA.IMAGE_CHANNELS = 3 # input image channels: e.g., 3
_C.DATA.CROP_PCT = 0.875 # input image scale ratio, scale is applied before centercrop in eval mode
_C.DATA.NUM_WORKERS = 1 # number of data loading threads
_C.DATA.IMAGENET_MEAN = [0.5, 0.5, 0.5] # [0.485, 0.456, 0.406] # imagenet mean values
_C.DATA.IMAGENET_STD = [0.5, 0.5, 0.5] # [0.229, 0.224, 0.225] # imagenet std values

# model general settings
_C.MODEL = CN()
_C.MODEL.TYPE = 'beit'
_C.MODEL.VAE_TYPE = 'dall-e'
_C.MODEL.NAME = 'beit'
_C.MODEL.RESUME = None # full model path for resume training
_C.MODEL.PRETRAINED = None # full model path for finetuning
_C.MODEL.NUM_CLASSES = 1000 # num of classes for classifier # 1000
_C.MODEL.DROPOUT = 0.0
_C.MODEL.ATTENTION_DROPOUT = 0.0
_C.MODEL.DROPPATH = 0.1
# model transformer settings
_C.MODEL.PATCH_SIZE = 16
_C.MODEL.EMBED_DIM = 768
_C.MODEL.NUM_HEADS = 12
_C.MODEL.ATTN_HEAD_SIZE = None # if None, use embed_dim // num_heads as head dim
_C.MODEL.DEPTH = 12
_C.MODEL.QK_SCALE = None
_C.MODEL.QKV_BIAS = True
_C.MODEL.MLP_RATIO = 4.0 # for cait class_token ratio also set to MLP_RATIO
_C.MODEL.USE_ABS_POS_EMB = False
_C.MODEL.USE_REL_POS_BIAS = True
_C.MODEL.INIT_VALUES = 1e-4


# training settings
_C.TRAIN = CN()
_C.TRAIN.LAST_EPOCH = 0
_C.TRAIN.NUM_EPOCHS = 100
_C.TRAIN.WARMUP_EPOCHS = 20
_C.TRAIN.WEIGHT_DECAY = 0.05
_C.TRAIN.LAYER_DECAY = 0.65
_C.TRAIN.BASE_LR = 4e-3
_C.TRAIN.WARMUP_START_LR = 0.0
_C.TRAIN.END_LR = 1e-6
_C.TRAIN.GRAD_CLIP = None
_C.TRAIN.ACCUM_ITER = 1
_C.TRAIN.LINEAR_SCALED_LR = 512

# optimizer
_C.TRAIN.OPTIMIZER = CN()
_C.TRAIN.OPTIMIZER.NAME = 'AdamWDL'
_C.TRAIN.OPTIMIZER.EPS = 1e-8
_C.TRAIN.OPTIMIZER.BETAS = (0.9, 0.999)

# model ema
_C.TRAIN.MODEL_EMA = True
_C.TRAIN.MODEL_EMA_DECAY = 0.9999
_C.TRAIN.MODEL_EMA_FORCE_CPU = False

# data augmentation (optional, check datasets.py)
_C.TRAIN.SMOOTHING = 0.1
_C.TRAIN.COLOR_JITTER = 0.4 # if both auto augment and rand augment are False, use color jitter
_C.TRAIN.AUTO_AUGMENT = False # rand augment is used if both rand and auto augment are set True
_C.TRAIN.RAND_AUGMENT = True
_C.TRAIN.RAND_AUGMENT_LAYERS = 2
_C.TRAIN.RAND_AUGMENT_MAGNITUDE = 9 # scale from 0 to 9
# mixup params (optional, check datasets.py)
_C.TRAIN.MIXUP_ALPHA = 0.8
_C.TRAIN.MIXUP_PROB = 1.0
_C.TRAIN.MIXUP_SWITCH_PROB = 0.5
_C.TRAIN.MIXUP_MODE = 'batch'
_C.TRAIN.CUTMIX_ALPHA = 1.0
_C.TRAIN.CUTMIX_MINMAX = None
# random erase params (optional, check datasets.py)
_C.TRAIN.RANDOM_ERASE_PROB = 0.25
_C.TRAIN.RANDOM_ERASE_MODE = 'pixel'
_C.TRAIN.RANDOM_ERASE_COUNT = 1
_C.TRAIN.RANDOM_ERASE_SPLIT = False

# misc
_C.SAVE = "./output" # output folder, saves logs and weights
_C.SAVE_FREQ = 15 # freq to save chpt
_C.REPORT_FREQ = 20 # freq to logging info
_C.VALIDATE_FREQ = 1 # freq to do validation
_C.SEED = 0 # random seed
_C.EVAL = False # run evaluation only
_C.AMP = False # auto mix precision training
-
-
def _update_config_from_file(config, cfg_file):
    """Merge a .yaml file (and the files listed under its BASE key) into config.

    Args:
        config: yacs config object, updated in place.
        cfg_file: path to a .yaml config file.
    Return:
        None
    """
    config.defrost()
    with open(cfg_file, 'r') as infile:
        yaml_cfg = yaml.load(infile, Loader=yaml.FullLoader)
    # Recursively merge any base configs first, so cfg_file takes precedence.
    for base in yaml_cfg.setdefault('BASE', ['']):
        if base:
            _update_config_from_file(
                config, os.path.join(os.path.dirname(cfg_file), base)
            )
    config.merge_from_file(cfg_file)
    config.freeze()
-
-
def update_config(config, args):
    """Overwrite config fields from parsed command-line arguments.

    Precedence: defaults < yaml file (args.cfg) < explicit argument values.

    Args:
        config: yacs config node to update (mutated and returned).
        args: namespace produced by get_arguments().
    Return:
        config: updated config
    """
    if args.cfg:
        _update_config_from_file(config, args.cfg)
    config.defrost()
    # Straightforward "if given, copy" options.
    direct = (
        (args.dataset, config.DATA, 'DATASET'),
        (args.image_size, config.DATA, 'IMAGE_SIZE'),
        (args.accum_iter, config.TRAIN, 'ACCUM_ITER'),
        (args.data_path, config.DATA, 'DATA_PATH'),
        (args.output, config, 'SAVE'),
        (args.pretrained, config.MODEL, 'PRETRAINED'),
        (args.resume, config.MODEL, 'RESUME'),
        (args.last_epoch, config.TRAIN, 'LAST_EPOCH'),
    )
    for value, node, field in direct:
        if value:
            setattr(node, field, value)
    if args.batch_size:
        config.DATA.BATCH_SIZE = args.batch_size
        # Eval batch size follows train batch size unless set explicitly below.
        config.DATA.BATCH_SIZE_EVAL = args.batch_size
    if args.batch_size_eval:
        config.DATA.BATCH_SIZE_EVAL = args.batch_size_eval
    if args.eval:
        config.EVAL = True
    if args.amp:  # only for training
        config.AMP = not config.EVAL
    # config.freeze()
    return config
-
-
def get_config(cfg_file=None):
    """Return a fresh clone of the default config, optionally merged with a yaml file."""
    config = _C.clone()
    if cfg_file:
        _update_config_from_file(config, cfg_file)
    return config
-
import argparse
def get_arguments():
    """Build and parse BEiT finetune options.

    Note: argv is hard-coded for notebook use — sys.argv is ignored and the
    base yaml config is always selected.
    """
    parser = argparse.ArgumentParser('BEiT finetune')
    for opt in ('-cfg', '-dataset', '-data_path', '-output'):
        parser.add_argument(opt, type=str, default=None)
    for opt in ('-batch_size', '-batch_size_eval', '-image_size', '-accum_iter'):
        parser.add_argument(opt, type=int, default=None)
    parser.add_argument('-pretrained', type=str, default=None)
    parser.add_argument('-resume', type=str, default=None)
    parser.add_argument('-last_epoch', type=int, default=None)
    parser.add_argument('-eval', action='store_true')
    parser.add_argument('-amp', action='store_true')
    return parser.parse_args(['-cfg', "beit_base_patch16_224.yaml"])
-
# Build the global config (defaults + beit_base_patch16_224.yaml + CLI
# options) and instantiate the model once at import time.
config = update_config(get_config(), get_arguments())
# config = args[0]
build_model = build_beit
model = build_model(config)
-
def main():
    """Run the built model once, then benchmark batch sizes 1..256."""
    # images = paddle.randn([8, 3, 224, 224])
    import numpy
    images = Tensor(numpy.random.rand(2, 3, 224, 224), mindspore.float32)
    label = 2  # unused placeholder
    with Benchmark("MindSpore速度"):
        output = model(images)
        print(output.shape)

    # Doubling batch sizes: 1, 2, 4, ..., 256.
    for i in range(9):
        batch = Tensor(numpy.random.rand(2**i, 3, 224, 224), mindspore.float32)
        with Benchmark("MindSpore速度"):
            logits = model(batch)
            print(logits.shape)




if __name__ == "__main__":
    main()
|